| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| md.c : Multiple Devices driver for Linux |
| Copyright (C) 1998, 1999, 2000 Ingo Molnar |
| |
| completely rewritten, based on the MD driver code from Marc Zyngier |
| |
| Changes: |
| |
| - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar |
| - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> |
| - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> |
| - kerneld support by Boris Tobotras <boris@xtalk.msk.su> |
| - kmod support by: Cyrus Durgin |
| - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> |
| - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> |
| |
| - lots of fixes and improvements to the RAID1/RAID5 and generic |
| RAID code (such as request based resynchronization): |
| |
| Neil Brown <neilb@cse.unsw.edu.au>. |
| |
| - persistent bitmap code |
| Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. |
| |
| |
| Errors, Warnings, etc. |
| Please use: |
| pr_crit() for error conditions that risk data loss |
| pr_err() for error conditions that are unexpected, like an IO error |
| or internal inconsistency |
     pr_warn() for error conditions that could have been predicted, like
| adding a device to an array when it has incompatible metadata |
     pr_info() for interesting, very rare events, like an array starting
| or stopping, or resync starting or stopping |
| pr_debug() for everything else. |
| |
| */ |
| |
| #include <linux/sched/mm.h> |
| #include <linux/sched/signal.h> |
| #include <linux/kthread.h> |
| #include <linux/blkdev.h> |
| #include <linux/blk-integrity.h> |
| #include <linux/badblocks.h> |
| #include <linux/sysctl.h> |
| #include <linux/seq_file.h> |
| #include <linux/fs.h> |
| #include <linux/poll.h> |
| #include <linux/ctype.h> |
| #include <linux/string.h> |
| #include <linux/hdreg.h> |
| #include <linux/proc_fs.h> |
| #include <linux/random.h> |
| #include <linux/major.h> |
| #include <linux/module.h> |
| #include <linux/reboot.h> |
| #include <linux/file.h> |
| #include <linux/compat.h> |
| #include <linux/delay.h> |
| #include <linux/raid/md_p.h> |
| #include <linux/raid/md_u.h> |
| #include <linux/raid/detect.h> |
| #include <linux/slab.h> |
| #include <linux/percpu-refcount.h> |
| #include <linux/part_stat.h> |
| |
| #include "md.h" |
| #include "md-bitmap.h" |
| #include "md-cluster.h" |
| |
| static const char *action_name[NR_SYNC_ACTIONS] = { |
| [ACTION_RESYNC] = "resync", |
| [ACTION_RECOVER] = "recover", |
| [ACTION_CHECK] = "check", |
| [ACTION_REPAIR] = "repair", |
| [ACTION_RESHAPE] = "reshape", |
| [ACTION_FROZEN] = "frozen", |
| [ACTION_IDLE] = "idle", |
| }; |
| |
| /* pers_list is a list of registered personalities protected by pers_lock. */ |
| static LIST_HEAD(pers_list); |
| static DEFINE_SPINLOCK(pers_lock); |
| |
| static const struct kobj_type md_ktype; |
| |
| const struct md_cluster_operations *md_cluster_ops; |
| EXPORT_SYMBOL(md_cluster_ops); |
| static struct module *md_cluster_mod; |
| |
| static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
| static struct workqueue_struct *md_wq; |
| |
| /* |
| * This workqueue is used for sync_work to register new sync_thread, and for |
| * del_work to remove rdev, and for event_work that is only set by dm-raid. |
| * |
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex held.
| */ |
| static struct workqueue_struct *md_misc_wq; |
| struct workqueue_struct *md_bitmap_wq; |
| |
| static int remove_and_add_spares(struct mddev *mddev, |
| struct md_rdev *this); |
| static void mddev_detach(struct mddev *mddev); |
| static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); |
| static void md_wakeup_thread_directly(struct md_thread __rcu *thread); |
| |
| /* |
| * Default number of read corrections we'll attempt on an rdev |
| * before ejecting it from the array. We divide the read error |
| * count by 2 for every hour elapsed between read errors. |
| */ |
| #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 |
| /* Default safemode delay: 200 msec */ |
| #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) |
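
/*
 * Worked example (illustrative): with HZ == 1000 this evaluates to
 * (200 * 1000) / 1000 + 1 == 201 jiffies, i.e. just over 200 msec; the
 * trailing "+ 1" compensates for integer truncation and guarantees a
 * non-zero delay.
 */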
| /* |
| * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
| * is 1000 KB/sec, so the extra system load does not show up that much. |
| * Increase it if you want to have more _guaranteed_ speed. Note that |
| * the RAID driver will use the maximum available bandwidth if the IO |
| * subsystem is idle. There is also an 'absolute maximum' reconstruction |
| * speed limit - in case reconstruction slows down your system despite |
| * idle IO detection. |
| * |
| * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. |
| * or /sys/block/mdX/md/sync_speed_{min,max} |
| */ |
| |
| static int sysctl_speed_limit_min = 1000; |
| static int sysctl_speed_limit_max = 200000; |
| static inline int speed_min(struct mddev *mddev) |
| { |
| return mddev->sync_speed_min ? |
| mddev->sync_speed_min : sysctl_speed_limit_min; |
| } |
| |
| static inline int speed_max(struct mddev *mddev) |
| { |
| return mddev->sync_speed_max ? |
| mddev->sync_speed_max : sysctl_speed_limit_max; |
| } |
| |
| static void rdev_uninit_serial(struct md_rdev *rdev) |
| { |
| if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) |
| return; |
| |
| kvfree(rdev->serial); |
| rdev->serial = NULL; |
| } |
| |
| static void rdevs_uninit_serial(struct mddev *mddev) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each(rdev, mddev) |
| rdev_uninit_serial(rdev); |
| } |
| |
| static int rdev_init_serial(struct md_rdev *rdev) |
| { |
	/* serial_nums equals BARRIER_BUCKETS_NR */
| int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); |
| struct serial_in_rdev *serial = NULL; |
| |
| if (test_bit(CollisionCheck, &rdev->flags)) |
| return 0; |
| |
| serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, |
| GFP_KERNEL); |
| if (!serial) |
| return -ENOMEM; |
| |
| for (i = 0; i < serial_nums; i++) { |
| struct serial_in_rdev *serial_tmp = &serial[i]; |
| |
| spin_lock_init(&serial_tmp->serial_lock); |
| serial_tmp->serial_rb = RB_ROOT_CACHED; |
| init_waitqueue_head(&serial_tmp->serial_io_wait); |
| } |
| |
| rdev->serial = serial; |
| set_bit(CollisionCheck, &rdev->flags); |
| |
| return 0; |
| } |
| |
| static int rdevs_init_serial(struct mddev *mddev) |
| { |
| struct md_rdev *rdev; |
| int ret = 0; |
| |
| rdev_for_each(rdev, mddev) { |
| ret = rdev_init_serial(rdev); |
| if (ret) |
| break; |
| } |
| |
	/* Free all resources if the pool does not exist */
| if (ret && !mddev->serial_info_pool) |
| rdevs_uninit_serial(mddev); |
| |
| return ret; |
| } |
| |
| /* |
 * rdev needs to enable serialization if it meets both conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. write-behind mode is enabled.
| */ |
| static int rdev_need_serial(struct md_rdev *rdev) |
| { |
| return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && |
| rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && |
| test_bit(WriteMostly, &rdev->flags)); |
| } |
| |
| /* |
 * Init resources for the rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device that returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
| */ |
| void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| int ret = 0; |
| |
| if (rdev && !rdev_need_serial(rdev) && |
| !test_bit(CollisionCheck, &rdev->flags)) |
| return; |
| |
| if (!rdev) |
| ret = rdevs_init_serial(mddev); |
| else |
| ret = rdev_init_serial(rdev); |
| if (ret) |
| return; |
| |
| if (mddev->serial_info_pool == NULL) { |
| /* |
| * already in memalloc noio context by |
| * mddev_suspend() |
| */ |
| mddev->serial_info_pool = |
| mempool_create_kmalloc_pool(NR_SERIAL_INFOS, |
| sizeof(struct serial_info)); |
| if (!mddev->serial_info_pool) { |
| rdevs_uninit_serial(mddev); |
| pr_err("can't alloc memory pool for serialization\n"); |
| } |
| } |
| } |
| |
| /* |
 * Free resources from the rdev(s), and destroy serial_info_pool under these
 * conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. the bitmap is destroyed while the serialization policy is not enabled.
 * 3. when disabling the policy, the pool is destroyed only when no rdev
 *    needs it.
| */ |
| void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| if (rdev && !test_bit(CollisionCheck, &rdev->flags)) |
| return; |
| |
| if (mddev->serial_info_pool) { |
| struct md_rdev *temp; |
| int num = 0; /* used to track if other rdevs need the pool */ |
| |
| rdev_for_each(temp, mddev) { |
| if (!rdev) { |
| if (!mddev->serialize_policy || |
| !rdev_need_serial(temp)) |
| rdev_uninit_serial(temp); |
| else |
| num++; |
| } else if (temp != rdev && |
| test_bit(CollisionCheck, &temp->flags)) |
| num++; |
| } |
| |
| if (rdev) |
| rdev_uninit_serial(rdev); |
| |
| if (num) |
| pr_info("The mempool could be used by other devices\n"); |
| else { |
| mempool_destroy(mddev->serial_info_pool); |
| mddev->serial_info_pool = NULL; |
| } |
| } |
| } |
| |
| static struct ctl_table_header *raid_table_header; |
| |
| static struct ctl_table raid_table[] = { |
| { |
| .procname = "speed_limit_min", |
| .data = &sysctl_speed_limit_min, |
| .maxlen = sizeof(int), |
| .mode = S_IRUGO|S_IWUSR, |
| .proc_handler = proc_dointvec, |
| }, |
| { |
| .procname = "speed_limit_max", |
| .data = &sysctl_speed_limit_max, |
| .maxlen = sizeof(int), |
| .mode = S_IRUGO|S_IWUSR, |
| .proc_handler = proc_dointvec, |
| }, |
| }; |
| |
| static int start_readonly; |
| |
| /* |
 * The original mechanism for creating an md device is to create
 * a device node in /dev and open it, which races with device-close.
 * The preferred method is to write to the "new_array" module parameter,
 * which avoids those races.
 * Setting create_on_open to false disables the original mechanism
 * so the races disappear entirely.
| */ |
| static bool create_on_open = true; |
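
/*
 * Illustrative only: with the preferred mechanism, userspace (e.g. mdadm)
 * asks for a new array by writing a name to the module parameter instead
 * of opening a device node, roughly:
 *
 *   echo md_test > /sys/module/md_mod/parameters/new_array
 *
 * (md_test and the exact sysfs path are assumptions for the example.)
 */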
| |
| /* |
| * We have a system wide 'event count' that is incremented |
| * on any 'interesting' event, and readers of /proc/mdstat |
| * can use 'poll' or 'select' to find out when the event |
| * count increases. |
| * |
| * Events are: |
| * start array, stop array, error, add device, remove device, |
| * start build, activate spare |
| */ |
| static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); |
| static atomic_t md_event_count; |
| void md_new_event(void) |
| { |
| atomic_inc(&md_event_count); |
| wake_up(&md_event_waiters); |
| } |
| EXPORT_SYMBOL_GPL(md_new_event); |
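
/*
 * Userspace sketch (assumed, not part of this driver): a monitor reads
 * /proc/mdstat once and then waits for the exceptional condition that
 * md_new_event() triggers via md_event_waiters:
 *
 *   int fd = open("/proc/mdstat", O_RDONLY);
 *   char buf[4096];
 *   read(fd, buf, sizeof(buf));              // consume the current state
 *   struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *   poll(&pfd, 1, -1);                       // wakes on the next event
 */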
| |
| /* |
 * Enables iteration over all existing md arrays.
| * all_mddevs_lock protects this list. |
| */ |
| static LIST_HEAD(all_mddevs); |
| static DEFINE_SPINLOCK(all_mddevs_lock); |
| |
| static bool is_md_suspended(struct mddev *mddev) |
| { |
| return percpu_ref_is_dying(&mddev->active_io); |
| } |
| /* Rather than calling directly into the personality make_request function, |
| * IO requests come here first so that we can check if the device is |
| * being suspended pending a reconfiguration. |
| * We hold a refcount over the call to ->make_request. By the time that |
| * call has finished, the bio has been linked into some internal structure |
| * and so is visible to ->quiesce(), so we don't need the refcount any more. |
| */ |
| static bool is_suspended(struct mddev *mddev, struct bio *bio) |
| { |
| if (is_md_suspended(mddev)) |
| return true; |
| if (bio_data_dir(bio) != WRITE) |
| return false; |
| if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) |
| return false; |
| if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) |
| return false; |
| if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) |
| return false; |
| return true; |
| } |
| |
| bool md_handle_request(struct mddev *mddev, struct bio *bio) |
| { |
| check_suspended: |
| if (is_suspended(mddev, bio)) { |
| DEFINE_WAIT(__wait); |
| /* Bail out if REQ_NOWAIT is set for the bio */ |
| if (bio->bi_opf & REQ_NOWAIT) { |
| bio_wouldblock_error(bio); |
| return true; |
| } |
| for (;;) { |
| prepare_to_wait(&mddev->sb_wait, &__wait, |
| TASK_UNINTERRUPTIBLE); |
| if (!is_suspended(mddev, bio)) |
| break; |
| schedule(); |
| } |
| finish_wait(&mddev->sb_wait, &__wait); |
| } |
| if (!percpu_ref_tryget_live(&mddev->active_io)) |
| goto check_suspended; |
| |
| if (!mddev->pers->make_request(mddev, bio)) { |
| percpu_ref_put(&mddev->active_io); |
| if (!mddev->gendisk && mddev->pers->prepare_suspend) |
| return false; |
| goto check_suspended; |
| } |
| |
| percpu_ref_put(&mddev->active_io); |
| return true; |
| } |
| EXPORT_SYMBOL(md_handle_request); |
| |
| static void md_submit_bio(struct bio *bio) |
| { |
| const int rw = bio_data_dir(bio); |
| struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; |
| |
| if (mddev == NULL || mddev->pers == NULL) { |
| bio_io_error(bio); |
| return; |
| } |
| |
| if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { |
| bio_io_error(bio); |
| return; |
| } |
| |
| bio = bio_split_to_limits(bio); |
| if (!bio) |
| return; |
| |
| if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { |
| if (bio_sectors(bio) != 0) |
| bio->bi_status = BLK_STS_IOERR; |
| bio_endio(bio); |
| return; |
| } |
| |
	/* bio could become mergeable after passing to the lower layers */
| bio->bi_opf &= ~REQ_NOMERGE; |
| |
| md_handle_request(mddev, bio); |
| } |
| |
| /* |
| * Make sure no new requests are submitted to the device, and any requests that |
| * have been submitted are completely handled. |
| */ |
| int mddev_suspend(struct mddev *mddev, bool interruptible) |
| { |
| int err = 0; |
| |
| /* |
	 * Holding reconfig_mutex while waiting for normal IO to finish would
	 * deadlock: no other context could update the super_block, and normal
	 * IO may depend on the super_block being updated.
| */ |
| lockdep_assert_not_held(&mddev->reconfig_mutex); |
| |
| if (interruptible) |
| err = mutex_lock_interruptible(&mddev->suspend_mutex); |
| else |
| mutex_lock(&mddev->suspend_mutex); |
| if (err) |
| return err; |
| |
| if (mddev->suspended) { |
| WRITE_ONCE(mddev->suspended, mddev->suspended + 1); |
| mutex_unlock(&mddev->suspend_mutex); |
| return 0; |
| } |
| |
| percpu_ref_kill(&mddev->active_io); |
| if (interruptible) |
| err = wait_event_interruptible(mddev->sb_wait, |
| percpu_ref_is_zero(&mddev->active_io)); |
| else |
| wait_event(mddev->sb_wait, |
| percpu_ref_is_zero(&mddev->active_io)); |
| if (err) { |
| percpu_ref_resurrect(&mddev->active_io); |
| mutex_unlock(&mddev->suspend_mutex); |
| return err; |
| } |
| |
| /* |
	 * For raid456, IO might be waiting for a reshape to make progress;
	 * allow a new reshape to start while we wait for IO to complete, to
	 * prevent a deadlock.
| */ |
| WRITE_ONCE(mddev->suspended, mddev->suspended + 1); |
| |
	/* restrict memory reclaim I/O while the raid array is suspended */
| mddev->noio_flag = memalloc_noio_save(); |
| |
| mutex_unlock(&mddev->suspend_mutex); |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(mddev_suspend); |
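
/*
 * mddev_suspend()/mddev_resume() nest via mddev->suspended: each
 * successful suspend must be paired with a resume, and IO is only
 * re-enabled when the count drops back to zero.  Minimal usage sketch
 * (error handling omitted):
 *
 *   mddev_suspend(mddev, false);
 *   ... reconfigure state that must not race with IO ...
 *   mddev_resume(mddev);
 */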
| |
| static void __mddev_resume(struct mddev *mddev, bool recovery_needed) |
| { |
| lockdep_assert_not_held(&mddev->reconfig_mutex); |
| |
| mutex_lock(&mddev->suspend_mutex); |
| WRITE_ONCE(mddev->suspended, mddev->suspended - 1); |
| if (mddev->suspended) { |
| mutex_unlock(&mddev->suspend_mutex); |
| return; |
| } |
| |
	/* entered the memalloc scope from mddev_suspend() */
| memalloc_noio_restore(mddev->noio_flag); |
| |
| percpu_ref_resurrect(&mddev->active_io); |
| wake_up(&mddev->sb_wait); |
| |
| if (recovery_needed) |
| set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| md_wakeup_thread(mddev->thread); |
| md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ |
| |
| mutex_unlock(&mddev->suspend_mutex); |
| } |
| |
| void mddev_resume(struct mddev *mddev) |
| { |
| return __mddev_resume(mddev, true); |
| } |
| EXPORT_SYMBOL_GPL(mddev_resume); |
| |
/* sync bdev before setting device to readonly or stopping raid */
| static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) |
| { |
| mutex_lock(&mddev->open_mutex); |
| if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { |
| mutex_unlock(&mddev->open_mutex); |
| return -EBUSY; |
| } |
| if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { |
| mutex_unlock(&mddev->open_mutex); |
| return -EBUSY; |
| } |
| mutex_unlock(&mddev->open_mutex); |
| |
| sync_blockdev(mddev->gendisk->part0); |
| return 0; |
| } |
| |
| /* |
| * Generic flush handling for md |
| */ |
| |
| static void md_end_flush(struct bio *bio) |
| { |
| struct md_rdev *rdev = bio->bi_private; |
| struct mddev *mddev = rdev->mddev; |
| |
| bio_put(bio); |
| |
| rdev_dec_pending(rdev, mddev); |
| |
| if (atomic_dec_and_test(&mddev->flush_pending)) |
| /* The pre-request flush has finished */ |
| queue_work(md_wq, &mddev->flush_work); |
| } |
| |
| static void md_submit_flush_data(struct work_struct *ws); |
| |
| static void submit_flushes(struct work_struct *ws) |
| { |
| struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
| struct md_rdev *rdev; |
| |
| mddev->start_flush = ktime_get_boottime(); |
| INIT_WORK(&mddev->flush_work, md_submit_flush_data); |
| atomic_set(&mddev->flush_pending, 1); |
| rcu_read_lock(); |
| rdev_for_each_rcu(rdev, mddev) |
| if (rdev->raid_disk >= 0 && |
| !test_bit(Faulty, &rdev->flags)) { |
| struct bio *bi; |
| |
| atomic_inc(&rdev->nr_pending); |
| rcu_read_unlock(); |
| bi = bio_alloc_bioset(rdev->bdev, 0, |
| REQ_OP_WRITE | REQ_PREFLUSH, |
| GFP_NOIO, &mddev->bio_set); |
| bi->bi_end_io = md_end_flush; |
| bi->bi_private = rdev; |
| atomic_inc(&mddev->flush_pending); |
| submit_bio(bi); |
| rcu_read_lock(); |
| } |
| rcu_read_unlock(); |
| if (atomic_dec_and_test(&mddev->flush_pending)) |
| queue_work(md_wq, &mddev->flush_work); |
| } |
| |
| static void md_submit_flush_data(struct work_struct *ws) |
| { |
| struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
| struct bio *bio = mddev->flush_bio; |
| |
| /* |
	 * flush_bio must be reset before calling into md_handle_request to
	 * avoid a deadlock: other bios that already passed the suspend check
	 * in md_handle_request could be waiting for this flush, while the
	 * md_handle_request call below could in turn wait for those bios
	 * because of that same suspend check.
| */ |
| spin_lock_irq(&mddev->lock); |
| mddev->prev_flush_start = mddev->start_flush; |
| mddev->flush_bio = NULL; |
| spin_unlock_irq(&mddev->lock); |
| wake_up(&mddev->sb_wait); |
| |
| if (bio->bi_iter.bi_size == 0) { |
| /* an empty barrier - all done */ |
| bio_endio(bio); |
| } else { |
| bio->bi_opf &= ~REQ_PREFLUSH; |
| |
| /* |
		 * make_request() will never return an error here; it only
		 * returns an error from raid5_make_request() when used by
		 * dm-raid.  Since dm always splits the data and flush
		 * operations into two separate bios, a flush submitted by dm
		 * always has size 0 and never reaches this branch, so the
		 * error case cannot happen here.
| */ |
| if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) |
| bio_io_error(bio); |
| } |
| |
	/* This pairs with the percpu_ref_get() in md_flush_request() */
| percpu_ref_put(&mddev->active_io); |
| } |
| |
| /* |
| * Manages consolidation of flushes and submitting any flushes needed for |
| * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is |
| * being finished in another context. Returns false if the flushing is |
| * complete but still needs the I/O portion of the bio to be processed. |
| */ |
| bool md_flush_request(struct mddev *mddev, struct bio *bio) |
| { |
| ktime_t req_start = ktime_get_boottime(); |
| spin_lock_irq(&mddev->lock); |
| /* flush requests wait until ongoing flush completes, |
| * hence coalescing all the pending requests. |
| */ |
| wait_event_lock_irq(mddev->sb_wait, |
| !mddev->flush_bio || |
| ktime_before(req_start, mddev->prev_flush_start), |
| mddev->lock); |
| /* new request after previous flush is completed */ |
| if (ktime_after(req_start, mddev->prev_flush_start)) { |
| WARN_ON(mddev->flush_bio); |
| /* |
| * Grab a reference to make sure mddev_suspend() will wait for |
| * this flush to be done. |
| * |
		 * md_flush_request() is called under md_handle_request() and
		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
		 * won't be true here.  percpu_ref_tryget_live() can't be used
		 * because percpu_ref_kill() can be called by mddev_suspend()
		 * concurrently.
| */ |
| WARN_ON(percpu_ref_is_zero(&mddev->active_io)); |
| percpu_ref_get(&mddev->active_io); |
| mddev->flush_bio = bio; |
| spin_unlock_irq(&mddev->lock); |
| INIT_WORK(&mddev->flush_work, submit_flushes); |
| queue_work(md_wq, &mddev->flush_work); |
| return true; |
| } |
| |
| /* flush was performed for some other bio while we waited. */ |
| spin_unlock_irq(&mddev->lock); |
| if (bio->bi_iter.bi_size == 0) { |
| /* pure flush without data - all done */ |
| bio_endio(bio); |
| return true; |
| } |
| |
| bio->bi_opf &= ~REQ_PREFLUSH; |
| return false; |
| } |
| EXPORT_SYMBOL(md_flush_request); |
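
/*
 * Typical caller pattern, sketched from the contract documented above: a
 * personality's ->make_request() hands REQ_PREFLUSH bios to
 * md_flush_request() first and only processes the data portion itself
 * when false is returned:
 *
 *   if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *       && md_flush_request(mddev, bio))
 *           return true;
 *   // ... handle the now flush-less data bio ...
 */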
| |
| static inline struct mddev *mddev_get(struct mddev *mddev) |
| { |
| lockdep_assert_held(&all_mddevs_lock); |
| |
| if (test_bit(MD_DELETED, &mddev->flags)) |
| return NULL; |
| atomic_inc(&mddev->active); |
| return mddev; |
| } |
| |
| static void mddev_delayed_delete(struct work_struct *ws); |
| |
| static void __mddev_put(struct mddev *mddev) |
| { |
| if (mddev->raid_disks || !list_empty(&mddev->disks) || |
| mddev->ctime || mddev->hold_active) |
| return; |
| |
| /* Array is not configured at all, and not held active, so destroy it */ |
| set_bit(MD_DELETED, &mddev->flags); |
| |
| /* |
| * Call queue_work inside the spinlock so that flush_workqueue() after |
| * mddev_find will succeed in waiting for the work to be done. |
| */ |
| queue_work(md_misc_wq, &mddev->del_work); |
| } |
| |
| void mddev_put(struct mddev *mddev) |
| { |
| if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
| return; |
| |
| __mddev_put(mddev); |
| spin_unlock(&all_mddevs_lock); |
| } |
| |
| static void md_safemode_timeout(struct timer_list *t); |
| static void md_start_sync(struct work_struct *ws); |
| |
| static void active_io_release(struct percpu_ref *ref) |
| { |
| struct mddev *mddev = container_of(ref, struct mddev, active_io); |
| |
| wake_up(&mddev->sb_wait); |
| } |
| |
| static void no_op(struct percpu_ref *r) {} |
| |
| int mddev_init(struct mddev *mddev) |
| { |
| |
| if (percpu_ref_init(&mddev->active_io, active_io_release, |
| PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) |
| return -ENOMEM; |
| |
| if (percpu_ref_init(&mddev->writes_pending, no_op, |
| PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { |
| percpu_ref_exit(&mddev->active_io); |
| return -ENOMEM; |
| } |
| |
| /* We want to start with the refcount at zero */ |
| percpu_ref_put(&mddev->writes_pending); |
| |
| mutex_init(&mddev->open_mutex); |
| mutex_init(&mddev->reconfig_mutex); |
| mutex_init(&mddev->suspend_mutex); |
| mutex_init(&mddev->bitmap_info.mutex); |
| INIT_LIST_HEAD(&mddev->disks); |
| INIT_LIST_HEAD(&mddev->all_mddevs); |
| INIT_LIST_HEAD(&mddev->deleting); |
| timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); |
| atomic_set(&mddev->active, 1); |
| atomic_set(&mddev->openers, 0); |
| atomic_set(&mddev->sync_seq, 0); |
| spin_lock_init(&mddev->lock); |
| atomic_set(&mddev->flush_pending, 0); |
| init_waitqueue_head(&mddev->sb_wait); |
| init_waitqueue_head(&mddev->recovery_wait); |
| mddev->reshape_position = MaxSector; |
| mddev->reshape_backwards = 0; |
| mddev->last_sync_action = ACTION_IDLE; |
| mddev->resync_min = 0; |
| mddev->resync_max = MaxSector; |
| mddev->level = LEVEL_NONE; |
| |
| INIT_WORK(&mddev->sync_work, md_start_sync); |
| INIT_WORK(&mddev->del_work, mddev_delayed_delete); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(mddev_init); |
| |
| void mddev_destroy(struct mddev *mddev) |
| { |
| percpu_ref_exit(&mddev->active_io); |
| percpu_ref_exit(&mddev->writes_pending); |
| } |
| EXPORT_SYMBOL_GPL(mddev_destroy); |
| |
| static struct mddev *mddev_find_locked(dev_t unit) |
| { |
| struct mddev *mddev; |
| |
| list_for_each_entry(mddev, &all_mddevs, all_mddevs) |
| if (mddev->unit == unit) |
| return mddev; |
| |
| return NULL; |
| } |
| |
| /* find an unused unit number */ |
| static dev_t mddev_alloc_unit(void) |
| { |
| static int next_minor = 512; |
| int start = next_minor; |
	bool is_free = false;
| dev_t dev = 0; |
| |
| while (!is_free) { |
| dev = MKDEV(MD_MAJOR, next_minor); |
| next_minor++; |
| if (next_minor > MINORMASK) |
| next_minor = 0; |
| if (next_minor == start) |
| return 0; /* Oh dear, all in use. */ |
| is_free = !mddev_find_locked(dev); |
| } |
| |
| return dev; |
| } |
| |
| static struct mddev *mddev_alloc(dev_t unit) |
| { |
| struct mddev *new; |
| int error; |
| |
| if (unit && MAJOR(unit) != MD_MAJOR) |
| unit &= ~((1 << MdpMinorShift) - 1); |
| |
| new = kzalloc(sizeof(*new), GFP_KERNEL); |
| if (!new) |
| return ERR_PTR(-ENOMEM); |
| |
| error = mddev_init(new); |
| if (error) |
| goto out_free_new; |
| |
| spin_lock(&all_mddevs_lock); |
| if (unit) { |
| error = -EEXIST; |
| if (mddev_find_locked(unit)) |
| goto out_destroy_new; |
| new->unit = unit; |
| if (MAJOR(unit) == MD_MAJOR) |
| new->md_minor = MINOR(unit); |
| else |
| new->md_minor = MINOR(unit) >> MdpMinorShift; |
| new->hold_active = UNTIL_IOCTL; |
| } else { |
| error = -ENODEV; |
| new->unit = mddev_alloc_unit(); |
| if (!new->unit) |
| goto out_destroy_new; |
| new->md_minor = MINOR(new->unit); |
| new->hold_active = UNTIL_STOP; |
| } |
| |
| list_add(&new->all_mddevs, &all_mddevs); |
| spin_unlock(&all_mddevs_lock); |
| return new; |
| |
| out_destroy_new: |
| spin_unlock(&all_mddevs_lock); |
| mddev_destroy(new); |
| out_free_new: |
| kfree(new); |
| return ERR_PTR(error); |
| } |
| |
| static void mddev_free(struct mddev *mddev) |
| { |
| spin_lock(&all_mddevs_lock); |
| list_del(&mddev->all_mddevs); |
| spin_unlock(&all_mddevs_lock); |
| |
| mddev_destroy(mddev); |
| kfree(mddev); |
| } |
| |
| static const struct attribute_group md_redundancy_group; |
| |
| void mddev_unlock(struct mddev *mddev) |
| { |
| struct md_rdev *rdev; |
| struct md_rdev *tmp; |
| LIST_HEAD(delete); |
| |
| if (!list_empty(&mddev->deleting)) |
| list_splice_init(&mddev->deleting, &delete); |
| |
| if (mddev->to_remove) { |
| /* These cannot be removed under reconfig_mutex as |
| * an access to the files will try to take reconfig_mutex |
| * while holding the file unremovable, which leads to |
| * a deadlock. |
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
| * otherwise change the sysfs namespace will fail with |
| * -EBUSY if sysfs_active is still set. |
| * We set sysfs_active under reconfig_mutex and elsewhere |
| * test it under the same mutex to ensure its correct value |
| * is seen. |
| */ |
| const struct attribute_group *to_remove = mddev->to_remove; |
| mddev->to_remove = NULL; |
| mddev->sysfs_active = 1; |
| mutex_unlock(&mddev->reconfig_mutex); |
| |
| if (mddev->kobj.sd) { |
| if (to_remove != &md_redundancy_group) |
| sysfs_remove_group(&mddev->kobj, to_remove); |
| if (mddev->pers == NULL || |
| mddev->pers->sync_request == NULL) { |
| sysfs_remove_group(&mddev->kobj, &md_redundancy_group); |
| if (mddev->sysfs_action) |
| sysfs_put(mddev->sysfs_action); |
| if (mddev->sysfs_completed) |
| sysfs_put(mddev->sysfs_completed); |
| if (mddev->sysfs_degraded) |
| sysfs_put(mddev->sysfs_degraded); |
| mddev->sysfs_action = NULL; |
| mddev->sysfs_completed = NULL; |
| mddev->sysfs_degraded = NULL; |
| } |
| } |
| mddev->sysfs_active = 0; |
| } else |
| mutex_unlock(&mddev->reconfig_mutex); |
| |
| md_wakeup_thread(mddev->thread); |
| wake_up(&mddev->sb_wait); |
| |
| list_for_each_entry_safe(rdev, tmp, &delete, same_set) { |
| list_del_init(&rdev->same_set); |
| kobject_del(&rdev->kobj); |
| export_rdev(rdev, mddev); |
| } |
| } |
| EXPORT_SYMBOL_GPL(mddev_unlock); |
| |
| struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each_rcu(rdev, mddev) |
| if (rdev->desc_nr == nr) |
| return rdev; |
| |
| return NULL; |
| } |
| EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); |
| |
| static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each(rdev, mddev) |
| if (rdev->bdev->bd_dev == dev) |
| return rdev; |
| |
| return NULL; |
| } |
| |
| struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each_rcu(rdev, mddev) |
| if (rdev->bdev->bd_dev == dev) |
| return rdev; |
| |
| return NULL; |
| } |
| EXPORT_SYMBOL_GPL(md_find_rdev_rcu); |
| |
| static struct md_personality *find_pers(int level, char *clevel) |
| { |
| struct md_personality *pers; |
| list_for_each_entry(pers, &pers_list, list) { |
| if (level != LEVEL_NONE && pers->level == level) |
| return pers; |
| if (strcmp(pers->name, clevel)==0) |
| return pers; |
| } |
| return NULL; |
| } |
| |
/* return the offset of the super block in 512-byte sectors */
| static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) |
| { |
| return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); |
| } |
| |
| static int alloc_disk_sb(struct md_rdev *rdev) |
| { |
| rdev->sb_page = alloc_page(GFP_KERNEL); |
| if (!rdev->sb_page) |
| return -ENOMEM; |
| return 0; |
| } |
| |
| void md_rdev_clear(struct md_rdev *rdev) |
| { |
| if (rdev->sb_page) { |
| put_page(rdev->sb_page); |
| rdev->sb_loaded = 0; |
| rdev->sb_page = NULL; |
| rdev->sb_start = 0; |
| rdev->sectors = 0; |
| } |
| if (rdev->bb_page) { |
| put_page(rdev->bb_page); |
| rdev->bb_page = NULL; |
| } |
| badblocks_exit(&rdev->badblocks); |
| } |
| EXPORT_SYMBOL_GPL(md_rdev_clear); |
| |
| static void super_written(struct bio *bio) |
| { |
| struct md_rdev *rdev = bio->bi_private; |
| struct mddev *mddev = rdev->mddev; |
| |
| if (bio->bi_status) { |
| pr_err("md: %s gets error=%d\n", __func__, |
| blk_status_to_errno(bio->bi_status)); |
| md_error(mddev, rdev); |
| if (!test_bit(Faulty, &rdev->flags) |
| && (bio->bi_opf & MD_FAILFAST)) { |
| set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); |
| set_bit(LastDev, &rdev->flags); |
| } |
| } else |
| clear_bit(LastDev, &rdev->flags); |
| |
| bio_put(bio); |
| |
| rdev_dec_pending(rdev, mddev); |
| |
| if (atomic_dec_and_test(&mddev->pending_writes)) |
| wake_up(&mddev->sb_wait); |
| } |
| |
| void md_super_write(struct mddev *mddev, struct md_rdev *rdev, |
| sector_t sector, int size, struct page *page) |
| { |
	/* Write the first 'size' bytes of 'page' to 'sector' of rdev.
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurs, call md_error.
| */ |
| struct bio *bio; |
| |
| if (!page) |
| return; |
| |
| if (test_bit(Faulty, &rdev->flags)) |
| return; |
| |
| bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, |
| 1, |
| REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META |
| | REQ_PREFLUSH | REQ_FUA, |
| GFP_NOIO, &mddev->sync_set); |
| |
| atomic_inc(&rdev->nr_pending); |
| |
| bio->bi_iter.bi_sector = sector; |
| __bio_add_page(bio, page, size, 0); |
| bio->bi_private = rdev; |
| bio->bi_end_io = super_written; |
| |
| if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && |
| test_bit(FailFast, &rdev->flags) && |
| !test_bit(LastDev, &rdev->flags)) |
| bio->bi_opf |= MD_FAILFAST; |
| |
| atomic_inc(&mddev->pending_writes); |
| submit_bio(bio); |
| } |
| |
| int md_super_wait(struct mddev *mddev) |
| { |
| /* wait for all superblock writes that were scheduled to complete */ |
| wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
| if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) |
| return -EAGAIN; |
| return 0; |
| } |
| |
| int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
| struct page *page, blk_opf_t opf, bool metadata_op) |
| { |
| struct bio bio; |
| struct bio_vec bvec; |
| |
| if (metadata_op && rdev->meta_bdev) |
| bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); |
| else |
| bio_init(&bio, rdev->bdev, &bvec, 1, opf); |
| |
| if (metadata_op) |
| bio.bi_iter.bi_sector = sector + rdev->sb_start; |
| else if (rdev->mddev->reshape_position != MaxSector && |
| (rdev->mddev->reshape_backwards == |
| (sector >= rdev->mddev->reshape_position))) |
| bio.bi_iter.bi_sector = sector + rdev->new_data_offset; |
| else |
| bio.bi_iter.bi_sector = sector + rdev->data_offset; |
| __bio_add_page(&bio, page, size, 0); |
| |
| submit_bio_wait(&bio); |
| |
| return !bio.bi_status; |
| } |
| EXPORT_SYMBOL_GPL(sync_page_io); |
| |
| static int read_disk_sb(struct md_rdev *rdev, int size) |
| { |
| if (rdev->sb_loaded) |
| return 0; |
| |
| if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) |
| goto fail; |
| rdev->sb_loaded = 1; |
| return 0; |
| |
| fail: |
| pr_err("md: disabled device %pg, could not read superblock.\n", |
| rdev->bdev); |
| return -EINVAL; |
| } |
| |
| static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
| { |
| return sb1->set_uuid0 == sb2->set_uuid0 && |
| sb1->set_uuid1 == sb2->set_uuid1 && |
| sb1->set_uuid2 == sb2->set_uuid2 && |
| sb1->set_uuid3 == sb2->set_uuid3; |
| } |
| |
| static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
| { |
| int ret; |
| mdp_super_t *tmp1, *tmp2; |
| |
| tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); |
| tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); |
| |
| if (!tmp1 || !tmp2) { |
| ret = 0; |
| goto abort; |
| } |
| |
| *tmp1 = *sb1; |
| *tmp2 = *sb2; |
| |
| /* |
| * nr_disks is not constant |
| */ |
| tmp1->nr_disks = 0; |
| tmp2->nr_disks = 0; |
| |
| ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); |
| abort: |
| kfree(tmp1); |
| kfree(tmp2); |
| return ret; |
| } |
| |
| static u32 md_csum_fold(u32 csum) |
| { |
| csum = (csum & 0xffff) + (csum >> 16); |
| return (csum & 0xffff) + (csum >> 16); |
| } |
| |
| static unsigned int calc_sb_csum(mdp_super_t *sb) |
| { |
| u64 newcsum = 0; |
| u32 *sb32 = (u32*)sb; |
| int i; |
| unsigned int disk_csum, csum; |
| |
| disk_csum = sb->sb_csum; |
| sb->sb_csum = 0; |
| |
| for (i = 0; i < MD_SB_BYTES/4 ; i++) |
| newcsum += sb32[i]; |
| csum = (newcsum & 0xffffffff) + (newcsum>>32); |
| |
| #ifdef CONFIG_ALPHA |
| /* This used to use csum_partial, which was wrong for several |
| * reasons including that different results are returned on |
| * different architectures. It isn't critical that we get exactly |
| * the same return value as before (we always csum_fold before |
| * testing, and that removes any differences). However as we |
| * know that csum_partial always returned a 16bit value on |
| * alphas, do a fold to maximise conformity to previous behaviour. |
| */ |
| sb->sb_csum = md_csum_fold(disk_csum); |
| #else |
| sb->sb_csum = disk_csum; |
| #endif |
| return csum; |
| } |
| |
| /* |
| * Handle superblock details. |
| * We want to be able to handle multiple superblock formats |
| * so we have a common interface to them all, and an array of |
| * different handlers. |
| * We rely on user-space to write the initial superblock, and support |
| * reading and updating of superblocks. |
| * Interface methods are: |
| * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) |
| * loads and validates a superblock on dev. |
| * if refdev != NULL, compare superblocks on both devices |
| * Return: |
| * 0 - dev has a superblock that is compatible with refdev |
| * 1 - dev has a superblock that is compatible and newer than refdev |
| * so dev should be used as the refdev in future |
| * -EINVAL superblock incompatible or invalid |
| * -othererror e.g. -EIO |
| * |
| * int validate_super(struct mddev *mddev, struct md_rdev *dev) |
| * Verify that dev is acceptable into mddev. |
| * The first time, mddev->raid_disks will be 0, and data from |
| * dev should be merged in. Subsequent calls check that dev |
| * is new enough. Return 0 or -EINVAL |
| * |
| * void sync_super(struct mddev *mddev, struct md_rdev *dev) |
| * Update the superblock for rdev with data in mddev |
| * This does not write to disc. |
| * |
| */ |
| |
| struct super_type { |
| char *name; |
| struct module *owner; |
| int (*load_super)(struct md_rdev *rdev, |
| struct md_rdev *refdev, |
| int minor_version); |
| int (*validate_super)(struct mddev *mddev, |
| struct md_rdev *freshest, |
| struct md_rdev *rdev); |
| void (*sync_super)(struct mddev *mddev, |
| struct md_rdev *rdev); |
| unsigned long long (*rdev_size_change)(struct md_rdev *rdev, |
| sector_t num_sectors); |
| int (*allow_new_offset)(struct md_rdev *rdev, |
| unsigned long long new_offset); |
| }; |
| |
| /* |
| * Check that the given mddev has no bitmap. |
| * |
| * This function is called from the run method of all personalities that do not |
| * support bitmaps. It prints an error message and returns non-zero if mddev |
| * has a bitmap. Otherwise, it returns 0. |
| * |
| */ |
| int md_check_no_bitmap(struct mddev *mddev) |
| { |
| if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
| return 0; |
| pr_warn("%s: bitmaps are not supported for %s\n", |
| mdname(mddev), mddev->pers->name); |
| return 1; |
| } |
| EXPORT_SYMBOL(md_check_no_bitmap); |
| |
| /* |
| * load_super for 0.90.0 |
| */ |
| static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) |
| { |
| mdp_super_t *sb; |
| int ret; |
| bool spare_disk = true; |
| |
| /* |
| * Calculate the position of the superblock (512byte sectors), |
| * it's at the end of the disk. |
| * |
| * It also happens to be a multiple of 4Kb. |
| */ |
| rdev->sb_start = calc_dev_sboffset(rdev); |
| |
| ret = read_disk_sb(rdev, MD_SB_BYTES); |
| if (ret) |
| return ret; |
| |
| ret = -EINVAL; |
| |
| sb = page_address(rdev->sb_page); |
| |
| if (sb->md_magic != MD_SB_MAGIC) { |
| pr_warn("md: invalid raid superblock magic on %pg\n", |
| rdev->bdev); |
| goto abort; |
| } |
| |
| if (sb->major_version != 0 || |
| sb->minor_version < 90 || |
| sb->minor_version > 91) { |
| pr_warn("Bad version number %d.%d on %pg\n", |
| sb->major_version, sb->minor_version, rdev->bdev); |
| goto abort; |
| } |
| |
| if (sb->raid_disks <= 0) |
| goto abort; |
| |
| if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { |
| pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); |
| goto abort; |
| } |
| |
| rdev->preferred_minor = sb->md_minor; |
| rdev->data_offset = 0; |
| rdev->new_data_offset = 0; |
| rdev->sb_size = MD_SB_BYTES; |
| rdev->badblocks.shift = -1; |
| |
| rdev->desc_nr = sb->this_disk.number; |
| |
| /* not spare disk */ |
| if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && |
| sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
| spare_disk = false; |
| |
| if (!refdev) { |
| if (!spare_disk) |
| ret = 1; |
| else |
| ret = 0; |
| } else { |
| __u64 ev1, ev2; |
| mdp_super_t *refsb = page_address(refdev->sb_page); |
| if (!md_uuid_equal(refsb, sb)) { |
| pr_warn("md: %pg has different UUID to %pg\n", |
| rdev->bdev, refdev->bdev); |
| goto abort; |
| } |
| if (!md_sb_equal(refsb, sb)) { |
| pr_warn("md: %pg has same UUID but different superblock to %pg\n", |
| rdev->bdev, refdev->bdev); |
| goto abort; |
| } |
| ev1 = md_event(sb); |
| ev2 = md_event(refsb); |
| |
| if (!spare_disk && ev1 > ev2) |
| ret = 1; |
| else |
| ret = 0; |
| } |
| rdev->sectors = rdev->sb_start; |
| /* Limit to 4TB as metadata cannot record more than that. |
| * (not needed for Linear and RAID0 as metadata doesn't |
| * record this size) |
| */ |
| if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) |
| rdev->sectors = (sector_t)(2ULL << 32) - 2; |
| |
| if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) |
| /* "this cannot possibly happen" ... */ |
| ret = -EINVAL; |
| |
| abort: |
| return ret; |
| } |
| |
| /* |
| * validate_super for 0.90.0 |
| * note: we are not using "freshest" for 0.9 superblock |
| */ |
| static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) |
| { |
| mdp_disk_t *desc; |
| mdp_super_t *sb = page_address(rdev->sb_page); |
| __u64 ev1 = md_event(sb); |
| |
| rdev->raid_disk = -1; |
| clear_bit(Faulty, &rdev->flags); |
| clear_bit(In_sync, &rdev->flags); |
| clear_bit(Bitmap_sync, &rdev->flags); |
| clear_bit(WriteMostly, &rdev->flags); |
| |
| if (mddev->raid_disks == 0) { |
| mddev->major_version = 0; |
| mddev->minor_version = sb->minor_version; |
| mddev->patch_version = sb->patch_version; |
| mddev->external = 0; |
| mddev->chunk_sectors = sb->chunk_size >> 9; |
| mddev->ctime = sb->ctime; |
| mddev->utime = sb->utime; |
| mddev->level = sb->level; |
| mddev->clevel[0] = 0; |
| mddev->layout = sb->layout; |
| mddev->raid_disks = sb->raid_disks; |
| mddev->dev_sectors = ((sector_t)sb->size) * 2; |
| mddev->events = ev1; |
| mddev->bitmap_info.offset = 0; |
| mddev->bitmap_info.space = 0; |
| /* bitmap can use 60 K after the 4K superblocks */ |
| mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
| mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); |
| mddev->reshape_backwards = 0; |
| |
| if (mddev->minor_version >= 91) { |
| mddev->reshape_position = sb->reshape_position; |
| mddev->delta_disks = sb->delta_disks; |
| mddev->new_level = sb->new_level; |
| mddev->new_layout = sb->new_layout; |
| mddev->new_chunk_sectors = sb->new_chunk >> 9; |
| if (mddev->delta_disks < 0) |
| mddev->reshape_backwards = 1; |
| } else { |
| mddev->reshape_position = MaxSector; |
| mddev->delta_disks = 0; |
| mddev->new_level = mddev->level; |
| mddev->new_layout = mddev->layout; |
| mddev->new_chunk_sectors = mddev->chunk_sectors; |
| } |
| if (mddev->level == 0) |
| mddev->layout = -1; |
| |
| if (sb->state & (1<<MD_SB_CLEAN)) |
| mddev->recovery_cp = MaxSector; |
| else { |
| if (sb->events_hi == sb->cp_events_hi && |
| sb->events_lo == sb->cp_events_lo) { |
| mddev->recovery_cp = sb->recovery_cp; |
| } else |
| mddev->recovery_cp = 0; |
| } |
| |
| memcpy(mddev->uuid+0, &sb->set_uuid0, 4); |
| memcpy(mddev->uuid+4, &sb->set_uuid1, 4); |
| memcpy(mddev->uuid+8, &sb->set_uuid2, 4); |
| memcpy(mddev->uuid+12,&sb->set_uuid3, 4); |
| |
| mddev->max_disks = MD_SB_DISKS; |
| |
| if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
| mddev->bitmap_info.file == NULL) { |
| mddev->bitmap_info.offset = |
| mddev->bitmap_info.default_offset; |
| mddev->bitmap_info.space = |
| mddev->bitmap_info.default_space; |
| } |
| |
| } else if (mddev->pers == NULL) { |
| /* Insist on good event counter while assembling, except |
| * for spares (which don't need an event count) */ |
| ++ev1; |
| if (sb->disks[rdev->desc_nr].state & ( |
| (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
| if (ev1 < mddev->events) |
| return -EINVAL; |
| } else if (mddev->bitmap) { |
| /* if adding to array with a bitmap, then we can accept an |
| * older device ... but not too old. |
| */ |
| if (ev1 < mddev->bitmap->events_cleared) |
| return 0; |
| if (ev1 < mddev->events) |
| set_bit(Bitmap_sync, &rdev->flags); |
| } else { |
| if (ev1 < mddev->events) |
| /* just a hot-add of a new device, leave raid_disk at -1 */ |
| return 0; |
| } |
| |
| desc = sb->disks + rdev->desc_nr; |
| |
| if (desc->state & (1<<MD_DISK_FAULTY)) |
| set_bit(Faulty, &rdev->flags); |
| else if (desc->state & (1<<MD_DISK_SYNC)) { |
| set_bit(In_sync, &rdev->flags); |
| rdev->raid_disk = desc->raid_disk; |
| rdev->saved_raid_disk = desc->raid_disk; |
| } else if (desc->state & (1<<MD_DISK_ACTIVE)) { |
| /* active but not in sync implies recovery up to |
| * reshape position. We don't know exactly where |
| * that is, so set to zero for now |
| */ |
| if (mddev->minor_version >= 91) { |
| rdev->recovery_offset = 0; |
| rdev->raid_disk = desc->raid_disk; |
| } |
| } |
| if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
| set_bit(WriteMostly, &rdev->flags); |
| if (desc->state & (1<<MD_DISK_FAILFAST)) |
| set_bit(FailFast, &rdev->flags); |
| return 0; |
| } |
| |
| /* |
| * sync_super for 0.90.0 |
| */ |
| static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| mdp_super_t *sb; |
| struct md_rdev *rdev2; |
| int next_spare = mddev->raid_disks; |
| |
| /* make rdev->sb match mddev data.. |
| * |
| * 1/ zero out disks |
| * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); |
| * 3/ any empty disks < next_spare become removed |
| * |
| * disks[0] gets initialised to REMOVED because |
| * we cannot be sure from other fields if it has |
| * been initialised or not. |
| */ |
| int i; |
| int active=0, working=0,failed=0,spare=0,nr_disks=0; |
| |
| rdev->sb_size = MD_SB_BYTES; |
| |
| sb = page_address(rdev->sb_page); |
| |
| memset(sb, 0, sizeof(*sb)); |
| |
| sb->md_magic = MD_SB_MAGIC; |
| sb->major_version = mddev->major_version; |
| sb->patch_version = mddev->patch_version; |
| sb->gvalid_words = 0; /* ignored */ |
| memcpy(&sb->set_uuid0, mddev->uuid+0, 4); |
| memcpy(&sb->set_uuid1, mddev->uuid+4, 4); |
| memcpy(&sb->set_uuid2, mddev->uuid+8, 4); |
| memcpy(&sb->set_uuid3, mddev->uuid+12,4); |
| |
| sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); |
| sb->level = mddev->level; |
| sb->size = mddev->dev_sectors / 2; |
| sb->raid_disks = mddev->raid_disks; |
| sb->md_minor = mddev->md_minor; |
| sb->not_persistent = 0; |
| sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); |
| sb->state = 0; |
| sb->events_hi = (mddev->events>>32); |
| sb->events_lo = (u32)mddev->events; |
| |
| if (mddev->reshape_position == MaxSector) |
| sb->minor_version = 90; |
| else { |
| sb->minor_version = 91; |
| sb->reshape_position = mddev->reshape_position; |
| sb->new_level = mddev->new_level; |
| sb->delta_disks = mddev->delta_disks; |
| sb->new_layout = mddev->new_layout; |
| sb->new_chunk = mddev->new_chunk_sectors << 9; |
| } |
| mddev->minor_version = sb->minor_version; |
| if (mddev->in_sync) |
| { |
| sb->recovery_cp = mddev->recovery_cp; |
| sb->cp_events_hi = (mddev->events>>32); |
| sb->cp_events_lo = (u32)mddev->events; |
| if (mddev->recovery_cp == MaxSector) |
| sb->state = (1<< MD_SB_CLEAN); |
| } else |
| sb->recovery_cp = 0; |
| |
| sb->layout = mddev->layout; |
| sb->chunk_size = mddev->chunk_sectors << 9; |
| |
| if (mddev->bitmap && mddev->bitmap_info.file == NULL) |
| sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
| |
| sb->disks[0].state = (1<<MD_DISK_REMOVED); |
| rdev_for_each(rdev2, mddev) { |
| mdp_disk_t *d; |
| int desc_nr; |
| int is_active = test_bit(In_sync, &rdev2->flags); |
| |
| if (rdev2->raid_disk >= 0 && |
| sb->minor_version >= 91) |
| /* we have nowhere to store the recovery_offset, |
| * but if it is not below the reshape_position, |
| * we can piggy-back on that. |
| */ |
| is_active = 1; |
| if (rdev2->raid_disk < 0 || |
| test_bit(Faulty, &rdev2->flags)) |
| is_active = 0; |
| if (is_active) |
| desc_nr = rdev2->raid_disk; |
| else |
| desc_nr = next_spare++; |
| rdev2->desc_nr = desc_nr; |
| d = &sb->disks[rdev2->desc_nr]; |
| nr_disks++; |
| d->number = rdev2->desc_nr; |
| d->major = MAJOR(rdev2->bdev->bd_dev); |
| d->minor = MINOR(rdev2->bdev->bd_dev); |
| if (is_active) |
| d->raid_disk = rdev2->raid_disk; |
| else |
| d->raid_disk = rdev2->desc_nr; /* compatibility */ |
| if (test_bit(Faulty, &rdev2->flags)) |
| d->state = (1<<MD_DISK_FAULTY); |
| else if (is_active) { |
| d->state = (1<<MD_DISK_ACTIVE); |
| if (test_bit(In_sync, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_SYNC); |
| active++; |
| working++; |
| } else { |
| d->state = 0; |
| spare++; |
| working++; |
| } |
| if (test_bit(WriteMostly, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_WRITEMOSTLY); |
| if (test_bit(FailFast, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_FAILFAST); |
| } |
| /* now set the "removed" and "faulty" bits on any missing devices */ |
| for (i=0 ; i < mddev->raid_disks ; i++) { |
| mdp_disk_t *d = &sb->disks[i]; |
| if (d->state == 0 && d->number == 0) { |
| d->number = i; |
| d->raid_disk = i; |
| d->state = (1<<MD_DISK_REMOVED); |
| d->state |= (1<<MD_DISK_FAULTY); |
| failed++; |
| } |
| } |
| sb->nr_disks = nr_disks; |
| sb->active_disks = active; |
| sb->working_disks = working; |
| sb->failed_disks = failed; |
| sb->spare_disks = spare; |
| |
| sb->this_disk = sb->disks[rdev->desc_nr]; |
| sb->sb_csum = calc_sb_csum(sb); |
| } |
| |
| /* |
| * rdev_size_change for 0.90.0 |
| */ |
| static unsigned long long |
| super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
| { |
| if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
| return 0; /* component must fit device */ |
| if (rdev->mddev->bitmap_info.offset) |
| return 0; /* can't move bitmap */ |
| rdev->sb_start = calc_dev_sboffset(rdev); |
| if (!num_sectors || num_sectors > rdev->sb_start) |
| num_sectors = rdev->sb_start; |
| /* Limit to 4TB as metadata cannot record more than that. |
| * 4TB == 2^32 KB, or 2*2^32 sectors. |
| */ |
| if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) |
| num_sectors = (sector_t)(2ULL << 32) - 2; |
| do { |
| md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
| rdev->sb_page); |
| } while (md_super_wait(rdev->mddev) < 0); |
| return num_sectors; |
| } |
| |
| static int |
| super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) |
| { |
| /* non-zero offset changes not possible with v0.90 */ |
| return new_offset == 0; |
| } |
| |
| /* |
| * version 1 superblock |
| */ |
| |
| static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) |
| { |
| __le32 disk_csum; |
| u32 csum; |
| unsigned long long newcsum; |
| int size = 256 + le32_to_cpu(sb->max_dev)*2; |
| __le32 *isuper = (__le32*)sb; |
| |
| disk_csum = sb->sb_csum; |
| sb->sb_csum = 0; |
| newcsum = 0; |
| for (; size >= 4; size -= 4) |
| newcsum += le32_to_cpu(*isuper++); |
| |
| if (size == 2) |
| newcsum += le16_to_cpu(*(__le16*) isuper); |
| |
| csum = (newcsum & 0xffffffff) + (newcsum >> 32); |
| sb->sb_csum = disk_csum; |
| return cpu_to_le32(csum); |
| } |
| |
| static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) |
| { |
| struct mdp_superblock_1 *sb; |
| int ret; |
| sector_t sb_start; |
| sector_t sectors; |
| int bmask; |
| bool spare_disk = true; |
| |
| /* |
	 * Calculate the position of the superblock in 512-byte sectors.
	 * It is always aligned to a 4K boundary and,
	 * depending on minor_version, it can be:
| * 0: At least 8K, but less than 12K, from end of device |
| * 1: At start of device |
| * 2: 4K from start of device. |
| */ |
| switch(minor_version) { |
| case 0: |
| sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; |
| sb_start &= ~(sector_t)(4*2-1); |
| break; |
| case 1: |
| sb_start = 0; |
| break; |
| case 2: |
| sb_start = 8; |
| break; |
| default: |
| return -EINVAL; |
| } |
| rdev->sb_start = sb_start; |
| |
| /* superblock is rarely larger than 1K, but it can be larger, |
| * and it is safe to read 4k, so we do that |
| */ |
| ret = read_disk_sb(rdev, 4096); |
| if (ret) return ret; |
| |
| sb = page_address(rdev->sb_page); |
| |
| if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
| sb->major_version != cpu_to_le32(1) || |
| le32_to_cpu(sb->max_dev) > (4096-256)/2 || |
| le64_to_cpu(sb->super_offset) != rdev->sb_start || |
| (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) |
| return -EINVAL; |
| |
| if (calc_sb_1_csum(sb) != sb->sb_csum) { |
| pr_warn("md: invalid superblock checksum on %pg\n", |
| rdev->bdev); |
| return -EINVAL; |
| } |
| if (le64_to_cpu(sb->data_size) < 10) { |
| pr_warn("md: data_size too small on %pg\n", |
| rdev->bdev); |
| return -EINVAL; |
| } |
| if (sb->pad0 || |
| sb->pad3[0] || |
| memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) |
| /* Some padding is non-zero, might be a new feature */ |
| return -EINVAL; |
| |
| rdev->preferred_minor = 0xffff; |
| rdev->data_offset = le64_to_cpu(sb->data_offset); |
| rdev->new_data_offset = rdev->data_offset; |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && |
| (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) |
| rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); |
| atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
| |
| rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
| bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
| if (rdev->sb_size & bmask) |
| rdev->sb_size = (rdev->sb_size | bmask) + 1; |
| |
| if (minor_version |
| && rdev->data_offset < sb_start + (rdev->sb_size/512)) |
| return -EINVAL; |
| if (minor_version |
| && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) |
| return -EINVAL; |
| |
| rdev->desc_nr = le32_to_cpu(sb->dev_number); |
| |
| if (!rdev->bb_page) { |
| rdev->bb_page = alloc_page(GFP_KERNEL); |
| if (!rdev->bb_page) |
| return -ENOMEM; |
| } |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && |
| rdev->badblocks.count == 0) { |
| /* need to load the bad block list. |
| * Currently we limit it to one page. |
| */ |
| s32 offset; |
| sector_t bb_sector; |
| __le64 *bbp; |
| int i; |
| int sectors = le16_to_cpu(sb->bblog_size); |
| if (sectors > (PAGE_SIZE / 512)) |
| return -EINVAL; |
| offset = le32_to_cpu(sb->bblog_offset); |
| if (offset == 0) |
| return -EINVAL; |
| bb_sector = (long long)offset; |
| if (!sync_page_io(rdev, bb_sector, sectors << 9, |
| rdev->bb_page, REQ_OP_READ, true)) |
| return -EIO; |
| bbp = (__le64 *)page_address(rdev->bb_page); |
| rdev->badblocks.shift = sb->bblog_shift; |
| for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { |
| u64 bb = le64_to_cpu(*bbp); |
| int count = bb & (0x3ff); |
| u64 sector = bb >> 10; |
| sector <<= sb->bblog_shift; |
| count <<= sb->bblog_shift; |
| if (bb + 1 == 0) |
| break; |
| if (badblocks_set(&rdev->badblocks, sector, count, 1)) |
| return -EINVAL; |
| } |
| } else if (sb->bblog_offset != 0) |
| rdev->badblocks.shift = 0; |
| |
| if ((le32_to_cpu(sb->feature_map) & |
| (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { |
| rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); |
| rdev->ppl.size = le16_to_cpu(sb->ppl.size); |
| rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; |
| } |
| |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && |
| sb->level != 0) |
| return -EINVAL; |
| |
| /* not spare disk */ |
| if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
| (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || |
| le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) |
| spare_disk = false; |
| |
| if (!refdev) { |
| if (!spare_disk) |
| ret = 1; |
| else |
| ret = 0; |
| } else { |
| __u64 ev1, ev2; |
| struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
| |
| if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || |
| sb->level != refsb->level || |
| sb->layout != refsb->layout || |
| sb->chunksize != refsb->chunksize) { |
| pr_warn("md: %pg has strangely different superblock to %pg\n", |
| rdev->bdev, |
| refdev->bdev); |
| return -EINVAL; |
| } |
| ev1 = le64_to_cpu(sb->events); |
| ev2 = le64_to_cpu(refsb->events); |
| |
| if (!spare_disk && ev1 > ev2) |
| ret = 1; |
| else |
| ret = 0; |
| } |
| if (minor_version) |
| sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; |
| else |
| sectors = rdev->sb_start; |
| if (sectors < le64_to_cpu(sb->data_size)) |
| return -EINVAL; |
| rdev->sectors = le64_to_cpu(sb->data_size); |
| return ret; |
| } |
| |
| static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) |
| { |
| struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
| __u64 ev1 = le64_to_cpu(sb->events); |
| int role; |
| |
| rdev->raid_disk = -1; |
| clear_bit(Faulty, &rdev->flags); |
| clear_bit(In_sync, &rdev->flags); |
| clear_bit(Bitmap_sync, &rdev->flags); |
| clear_bit(WriteMostly, &rdev->flags); |
| |
| if (mddev->raid_disks == 0) { |
| mddev->major_version = 1; |
| mddev->patch_version = 0; |
| mddev->external = 0; |
| mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
| mddev->ctime = le64_to_cpu(sb->ctime); |
| mddev->utime = le64_to_cpu(sb->utime); |
| mddev->level = le32_to_cpu(sb->level); |
| mddev->clevel[0] = 0; |
| mddev->layout = le32_to_cpu(sb->layout); |
| mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
| mddev->dev_sectors = le64_to_cpu(sb->size); |
| mddev->events = ev1; |
| mddev->bitmap_info.offset = 0; |
| mddev->bitmap_info.space = 0; |
| /* Default location for bitmap is 1K after superblock |
| * using 3K - total of 4K |
| */ |
| mddev->bitmap_info.default_offset = 1024 >> 9; |
| mddev->bitmap_info.default_space = (4096-1024) >> 9; |
| mddev->reshape_backwards = 0; |
| |
| mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
| memcpy(mddev->uuid, sb->set_uuid, 16); |
| |
| mddev->max_disks = (4096-256)/2; |
| |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
| mddev->bitmap_info.file == NULL) { |
| mddev->bitmap_info.offset = |
| (__s32)le32_to_cpu(sb->bitmap_offset); |
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume the bitmap can use the space up to
			 * the superblock if it is placed before it, else up to
			 * 4K beyond the superblock.
			 * For other versions, assume no change is possible.
			 */
| if (mddev->minor_version > 0) |
| mddev->bitmap_info.space = 0; |
| else if (mddev->bitmap_info.offset > 0) |
| mddev->bitmap_info.space = |
| 8 - mddev->bitmap_info.offset; |
| else |
| mddev->bitmap_info.space = |
| -mddev->bitmap_info.offset; |
| } |
| |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
| mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
| mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
| mddev->new_level = le32_to_cpu(sb->new_level); |
| mddev->new_layout = le32_to_cpu(sb->new_layout); |
| mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
| if (mddev->delta_disks < 0 || |
| (mddev->delta_disks == 0 && |
| (le32_to_cpu(sb->feature_map) |
| & MD_FEATURE_RESHAPE_BACKWARDS))) |
| mddev->reshape_backwards = 1; |
| } else { |
| mddev->reshape_position = MaxSector; |
| mddev->delta_disks = 0; |
| mddev->new_level = mddev->level; |
| mddev->new_layout = mddev->layout; |
| mddev->new_chunk_sectors = mddev->chunk_sectors; |
| } |
| |
| if (mddev->level == 0 && |
| !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) |
| mddev->layout = -1; |
| |
| if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) |
| set_bit(MD_HAS_JOURNAL, &mddev->flags); |
| |
| if (le32_to_cpu(sb->feature_map) & |
| (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { |
| if (le32_to_cpu(sb->feature_map) & |
| (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) |
| return -EINVAL; |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && |
| (le32_to_cpu(sb->feature_map) & |
| MD_FEATURE_MULTIPLE_PPLS)) |
| return -EINVAL; |
| set_bit(MD_HAS_PPL, &mddev->flags); |
| } |
| } else if (mddev->pers == NULL) { |
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count).
		 * Like mdadm, we allow an event counter difference of 1
		 * from the freshest device.
		 */
| if (rdev->desc_nr >= 0 && |
| rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
| (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || |
| le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) |
| if (ev1 + 1 < mddev->events) |
| return -EINVAL; |
| } else if (mddev->bitmap) { |
| /* If adding to array with a bitmap, then we can accept an |
| * older device, but not too old. |
| */ |
| if (ev1 < mddev->bitmap->events_cleared) |
| return 0; |
| if (ev1 < mddev->events) |
| set_bit(Bitmap_sync, &rdev->flags); |
| } else { |
| if (ev1 < mddev->events) |
| /* just a hot-add of a new device, leave raid_disk at -1 */ |
| return 0; |
| } |
| |
| if (rdev->desc_nr < 0 || |
| rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { |
| role = MD_DISK_ROLE_SPARE; |
| rdev->desc_nr = -1; |
| } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { |
| /* |
| * If we are assembling, and our event counter is smaller than the |
| * highest event counter, we cannot trust our superblock about the role. |
| * It could happen that our rdev was marked as Faulty, and all other |
| * superblocks were updated with +1 event counter. |
| * Then, before the next superblock update, which typically happens when |
| * remove_and_add_spares() removes the device from the array, there was |
| * a crash or reboot. |
| * If we allow current rdev without consulting the freshest superblock, |
| * we could cause data corruption. |
| * Note that in this case our event counter is smaller by 1 than the |
| * highest, otherwise, this rdev would not be allowed into array; |
| * both kernel and mdadm allow event counter difference of 1. |
| */ |
| struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); |
| u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); |
| |
| if (rdev->desc_nr >= freshest_max_dev) { |
| /* this is unexpected, better not proceed */ |
| pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", |
| mdname(mddev), rdev->bdev, rdev->desc_nr, |
| freshest->bdev, freshest_max_dev); |
| return -EUCLEAN; |
| } |
| |
| role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); |
| pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", |
| mdname(mddev), rdev->bdev, role, role, freshest->bdev); |
| } else { |
| role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
| } |
| switch (role) { |
| case MD_DISK_ROLE_SPARE: /* spare */ |
| break; |
| case MD_DISK_ROLE_FAULTY: /* faulty */ |
| set_bit(Faulty, &rdev->flags); |
| break; |
| case MD_DISK_ROLE_JOURNAL: /* journal device */ |
| if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { |
| /* journal device without journal feature */ |
| pr_warn("md: journal device provided without journal feature, ignoring the device\n"); |
| return -EINVAL; |
| } |
| set_bit(Journal, &rdev->flags); |
| rdev->journal_tail = le64_to_cpu(sb->journal_tail); |
| rdev->raid_disk = 0; |
| break; |
| default: |
| rdev->saved_raid_disk = role; |
| if ((le32_to_cpu(sb->feature_map) & |
| MD_FEATURE_RECOVERY_OFFSET)) { |
| rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); |
| if (!(le32_to_cpu(sb->feature_map) & |
| MD_FEATURE_RECOVERY_BITMAP)) |
| rdev->saved_raid_disk = -1; |
| } else { |
| /* |
| * If the array is FROZEN, then the device can't |
| * be in_sync with rest of array. |
| */ |
| if (!test_bit(MD_RECOVERY_FROZEN, |
| &mddev->recovery)) |
| set_bit(In_sync, &rdev->flags); |
| } |
| rdev->raid_disk = role; |
| break; |
| } |
| if (sb->devflags & WriteMostly1) |
| set_bit(WriteMostly, &rdev->flags); |
| if (sb->devflags & FailFast1) |
| set_bit(FailFast, &rdev->flags); |
| if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) |
| set_bit(Replacement, &rdev->flags); |
| |
| return 0; |
| } |
| |
| static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| struct mdp_superblock_1 *sb; |
| struct md_rdev *rdev2; |
| int max_dev, i; |
| /* make rdev->sb match mddev and rdev data. */ |
| |
| sb = page_address(rdev->sb_page); |
| |
| sb->feature_map = 0; |
| sb->pad0 = 0; |
| sb->recovery_offset = cpu_to_le64(0); |
| memset(sb->pad3, 0, sizeof(sb->pad3)); |
| |
| sb->utime = cpu_to_le64((__u64)mddev->utime); |
| sb->events = cpu_to_le64(mddev->events); |
| if (mddev->in_sync) |
| sb->resync_offset = cpu_to_le64(mddev->recovery_cp); |
| else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) |
| sb->resync_offset = cpu_to_le64(MaxSector); |
| else |
| sb->resync_offset = cpu_to_le64(0); |
| |
| sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); |
| |
| sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
| sb->size = cpu_to_le64(mddev->dev_sectors); |
| sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
| sb->level = cpu_to_le32(mddev->level); |
| sb->layout = cpu_to_le32(mddev->layout); |
| if (test_bit(FailFast, &rdev->flags)) |
| sb->devflags |= FailFast1; |
| else |
| sb->devflags &= ~FailFast1; |
| |
| if (test_bit(WriteMostly, &rdev->flags)) |
| sb->devflags |= WriteMostly1; |
| else |
| sb->devflags &= ~WriteMostly1; |
| sb->data_offset = cpu_to_le64(rdev->data_offset); |
| sb->data_size = cpu_to_le64(rdev->sectors); |
| |
| if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
| sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
| sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
| } |
| |
| if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && |
| !test_bit(In_sync, &rdev->flags)) { |
| sb->feature_map |= |
| cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
| sb->recovery_offset = |
| cpu_to_le64(rdev->recovery_offset); |
| if (rdev->saved_raid_disk >= 0 && mddev->bitmap) |
| sb->feature_map |= |
| cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); |
| } |
| /* Note: recovery_offset and journal_tail share space */ |
| if (test_bit(Journal, &rdev->flags)) |
| sb->journal_tail = cpu_to_le64(rdev->journal_tail); |
| if (test_bit(Replacement, &rdev->flags)) |
| sb->feature_map |= |
| cpu_to_le32(MD_FEATURE_REPLACEMENT); |
| |
| if (mddev->reshape_position != MaxSector) { |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); |
| sb->reshape_position = cpu_to_le64(mddev->reshape_position); |
| sb->new_layout = cpu_to_le32(mddev->new_layout); |
| sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
| sb->new_level = cpu_to_le32(mddev->new_level); |
| sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
| if (mddev->delta_disks == 0 && |
| mddev->reshape_backwards) |
| sb->feature_map |
| |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); |
| if (rdev->new_data_offset != rdev->data_offset) { |
| sb->feature_map |
| |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); |
| sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset |
| - rdev->data_offset)); |
| } |
| } |
| |
| if (mddev_is_clustered(mddev)) |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); |
| |
| if (rdev->badblocks.count == 0) |
		/* Nothing to do for bad blocks */ ;
| else if (sb->bblog_offset == 0) |
| /* Cannot record bad blocks on this device */ |
| md_error(mddev, rdev); |
| else { |
| struct badblocks *bb = &rdev->badblocks; |
| __le64 *bbp = (__le64 *)page_address(rdev->bb_page); |
| u64 *p = bb->page; |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); |
| if (bb->changed) { |
| unsigned seq; |
| |
| retry: |
| seq = read_seqbegin(&bb->lock); |
| |
| memset(bbp, 0xff, PAGE_SIZE); |
| |
| for (i = 0 ; i < bb->count ; i++) { |
| u64 internal_bb = p[i]; |
| u64 store_bb = ((BB_OFFSET(internal_bb) << 10) |
| | BB_LEN(internal_bb)); |
| bbp[i] = cpu_to_le64(store_bb); |
| } |
| bb->changed = 0; |
| if (read_seqretry(&bb->lock, seq)) |
| goto retry; |
| |
| bb->sector = (rdev->sb_start + |
| (int)le32_to_cpu(sb->bblog_offset)); |
| bb->size = le16_to_cpu(sb->bblog_size); |
| } |
| } |
| |
| max_dev = 0; |
| rdev_for_each(rdev2, mddev) |
| if (rdev2->desc_nr+1 > max_dev) |
| max_dev = rdev2->desc_nr+1; |
| |
| if (max_dev > le32_to_cpu(sb->max_dev)) { |
| int bmask; |
| sb->max_dev = cpu_to_le32(max_dev); |
| rdev->sb_size = max_dev * 2 + 256; |
| bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
| if (rdev->sb_size & bmask) |
| rdev->sb_size = (rdev->sb_size | bmask) + 1; |
| } else |
| max_dev = le32_to_cpu(sb->max_dev); |
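	/*
	 * For example, max_dev == 27 needs 27 * 2 + 256 = 310 bytes of
	 * superblock, which is then rounded up to the logical block size
	 * (to 512 bytes on a 512-byte-sector device).
	 */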
| |
	for (i = 0; i < max_dev; i++)
| sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); |
| |
| if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); |
| |
| if (test_bit(MD_HAS_PPL, &mddev->flags)) { |
| if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) |
| sb->feature_map |= |
| cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); |
| else |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); |
| sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); |
| sb->ppl.size = cpu_to_le16(rdev->ppl.size); |
| } |
| |
| rdev_for_each(rdev2, mddev) { |
| i = rdev2->desc_nr; |
| if (test_bit(Faulty, &rdev2->flags)) |
| sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); |
| else if (test_bit(In_sync, &rdev2->flags)) |
| sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
| else if (test_bit(Journal, &rdev2->flags)) |
| sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); |
| else if (rdev2->raid_disk >= 0) |
| sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
| else |
| sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); |
| } |
| |
| sb->sb_csum = calc_sb_1_csum(sb); |
| } |
| |
| static sector_t super_1_choose_bm_space(sector_t dev_size) |
| { |
| sector_t bm_space; |
| |
	/* If the device is bigger than 8GiB, save 64K for bitmap
	 * usage; if bigger than 200GiB, save 128K; otherwise save 4K,
	 * or nothing at all for devices smaller than 64K.
	 */
| if (dev_size < 64*2) |
| bm_space = 0; |
| else if (dev_size - 64*2 >= 200*1024*1024*2) |
| bm_space = 128*2; |
| else if (dev_size - 4*2 > 8*1024*1024*2) |
| bm_space = 64*2; |
| else |
| bm_space = 4*2; |
| return bm_space; |
| } |
| |
| static unsigned long long |
| super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
| { |
| struct mdp_superblock_1 *sb; |
| sector_t max_sectors; |
| if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
| return 0; /* component must fit device */ |
| if (rdev->data_offset != rdev->new_data_offset) |
| return 0; /* too confusing */ |
| if (rdev->sb_start < rdev->data_offset) { |
| /* minor versions 1 and 2; superblock before data */ |
| max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; |
| if (!num_sectors || num_sectors > max_sectors) |
| num_sectors = max_sectors; |
| } else if (rdev->mddev->bitmap_info.offset) { |
| /* minor version 0 with bitmap we can't move */ |
| return 0; |
| } else { |
| /* minor version 0; superblock after data */ |
| sector_t sb_start, bm_space; |
| sector_t dev_size = bdev_nr_sectors(rdev->bdev); |
| |
| /* 8K is for superblock */ |
| sb_start = dev_size - 8*2; |
| sb_start &= ~(sector_t)(4*2 - 1); |
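		/*
		 * The superblock is placed at least 8K before the end of the
		 * device, then aligned down to a 4K boundary; e.g. for a
		 * 2097150-sector device, sb_start = (2097150 - 16) & ~7
		 * = 2097128.
		 */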
| |
| bm_space = super_1_choose_bm_space(dev_size); |
| |
		/* Space that can be used to store data must exclude the
		 * superblock, the bitmap space and the bad block space (4K).
		 */
| max_sectors = sb_start - bm_space - 4*2; |
| |
| if (!num_sectors || num_sectors > max_sectors) |
| num_sectors = max_sectors; |
| rdev->sb_start = sb_start; |
| } |
| sb = page_address(rdev->sb_page); |
| sb->data_size = cpu_to_le64(num_sectors); |
| sb->super_offset = cpu_to_le64(rdev->sb_start); |
| sb->sb_csum = calc_sb_1_csum(sb); |
| do { |
| md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
| rdev->sb_page); |
| } while (md_super_wait(rdev->mddev) < 0); |
| return num_sectors; |
| |
| } |
| |
| static int |
| super_1_allow_new_offset(struct md_rdev *rdev, |
| unsigned long long new_offset) |
| { |
| /* All necessary checks on new >= old have been done */ |
| struct bitmap *bitmap; |
| if (new_offset >= rdev->data_offset) |
| return 1; |
| |
| /* with 1.0 metadata, there is no metadata to tread on |
| * so we can always move back */ |
| if (rdev->mddev->minor_version == 0) |
| return 1; |
| |
	/* otherwise we must be sure not to step on any metadata,
	 * so the new offset must stay:
	 * 36K beyond the start of the superblock
	 * beyond the end of the bad block list
	 * beyond the write-intent bitmap
	 */
| if (rdev->sb_start + (32+4)*2 > new_offset) |
| return 0; |
| bitmap = rdev->mddev->bitmap; |
| if (bitmap && !rdev->mddev->bitmap_info.file && |
| rdev->sb_start + rdev->mddev->bitmap_info.offset + |
| bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) |
| return 0; |
| if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) |
| return 0; |
| |
| return 1; |
| } |
| |
| static struct super_type super_types[] = { |
| [0] = { |
| .name = "0.90.0", |
| .owner = THIS_MODULE, |
| .load_super = super_90_load, |
| .validate_super = super_90_validate, |
| .sync_super = super_90_sync, |
| .rdev_size_change = super_90_rdev_size_change, |
| .allow_new_offset = super_90_allow_new_offset, |
| }, |
| [1] = { |
| .name = "md-1", |
| .owner = THIS_MODULE, |
| .load_super = super_1_load, |
| .validate_super = super_1_validate, |
| .sync_super = super_1_sync, |
| .rdev_size_change = super_1_rdev_size_change, |
| .allow_new_offset = super_1_allow_new_offset, |
| }, |
| }; |
| |
| static void sync_super(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| if (mddev->sync_super) { |
| mddev->sync_super(mddev, rdev); |
| return; |
| } |
| |
| BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); |
| |
| super_types[mddev->major_version].sync_super(mddev, rdev); |
| } |
| |
| static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) |
| { |
| struct md_rdev *rdev, *rdev2; |
| |
| rcu_read_lock(); |
| rdev_for_each_rcu(rdev, mddev1) { |
| if (test_bit(Faulty, &rdev->flags) || |
| test_bit(Journal, &rdev->flags) || |
| rdev->raid_disk == -1) |
| continue; |
| rdev_for_each_rcu(rdev2, mddev2) { |
| if (test_bit(Faulty, &rdev2->flags) || |
| test_bit(Journal, &rdev2->flags) || |
| rdev2->raid_disk == -1) |
| continue; |
| if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { |
| rcu_read_unlock(); |
| return 1; |
| } |
| } |
| } |
| rcu_read_unlock(); |
| return 0; |
| } |
| |
| static LIST_HEAD(pending_raid_disks); |
| |
| /* |
| * Try to register data integrity profile for an mddev |
| * |
| * This is called when an array is started and after a disk has been kicked |
| * from the array. It only succeeds if all working and active component devices |
| * are integrity capable with matching profiles. |
| */ |
| int md_integrity_register(struct mddev *mddev) |
| { |
| if (list_empty(&mddev->disks)) |
| return 0; /* nothing to do */ |
| if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) |
| return 0; /* shouldn't register */ |
| |
| pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); |
| if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || |
| (mddev->level != 1 && mddev->level != 10 && |
| bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { |
| /* |
| * No need to handle the failure of bioset_integrity_create, |
| * because the function is called by md_run() -> pers->run(), |
| * md_run calls bioset_exit -> bioset_integrity_free in case |
| * of failure case. |
| */ |
| pr_err("md: failed to create integrity pool for %s\n", |
| mdname(mddev)); |
| return -EINVAL; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL(md_integrity_register); |
| |
| static bool rdev_read_only(struct md_rdev *rdev) |
| { |
| return bdev_read_only(rdev->bdev) || |
| (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); |
| } |
| |
| static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) |
| { |
| char b[BDEVNAME_SIZE]; |
| int err; |
| |
| /* prevent duplicates */ |
| if (find_rdev(mddev, rdev->bdev->bd_dev)) |
| return -EEXIST; |
| |
| if (rdev_read_only(rdev) && mddev->pers) |
| return -EROFS; |
| |
| /* make sure rdev->sectors exceeds mddev->dev_sectors */ |
| if (!test_bit(Journal, &rdev->flags) && |
| rdev->sectors && |
| (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { |
| if (mddev->pers) { |
			/* Cannot change size, so fail.
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear).
			 */
| if (mddev->level > 0) |
| return -ENOSPC; |
| } else |
| mddev->dev_sectors = rdev->sectors; |
| } |
| |
	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number; else
	 * check that the number is not already in use.
	 */
| rcu_read_lock(); |
| if (rdev->desc_nr < 0) { |
| int choice = 0; |
| if (mddev->pers) |
| choice = mddev->raid_disks; |
| while (md_find_rdev_nr_rcu(mddev, choice)) |
| choice++; |
| rdev->desc_nr = choice; |
| } else { |
| if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { |
| rcu_read_unlock(); |
| return -EBUSY; |
| } |
| } |
| rcu_read_unlock(); |
| if (!test_bit(Journal, &rdev->flags) && |
| mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { |
| pr_warn("md: %s: array is limited to %d devices\n", |
| mdname(mddev), mddev->max_disks); |
| return -EBUSY; |
| } |
| snprintf(b, sizeof(b), "%pg", rdev->bdev); |
| strreplace(b, '/', '!'); |
| |
| rdev->mddev = mddev; |
| pr_debug("md: bind<%s>\n", b); |
| |
| if (mddev->raid_disks) |
| mddev_create_serial_pool(mddev, rdev); |
| |
| if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) |
| goto fail; |
| |
| /* failure here is OK */ |
| err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); |
| rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); |
| rdev->sysfs_unack_badblocks = |
| sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); |
| rdev->sysfs_badblocks = |
| sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); |
| |
| list_add_rcu(&rdev->same_set, &mddev->disks); |
| bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
| |
| /* May as well allow recovery to be retried once */ |
| mddev->recovery_disabled++; |
| |
| return 0; |
| |
| fail: |
| pr_warn("md: failed to register dev-%s for %s\n", |
| b, mdname(mddev)); |
| mddev_destroy_serial_pool(mddev, rdev); |
| return err; |
| } |
| |
| void md_autodetect_dev(dev_t dev); |
| |
| /* just for claiming the bdev */ |
| static struct md_rdev claim_rdev; |
| |
| static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) |
| { |
| pr_debug("md: export_rdev(%pg)\n", rdev->bdev); |
| md_rdev_clear(rdev); |
| #ifndef MODULE |
| if (test_bit(AutoDetected, &rdev->flags)) |
| md_autodetect_dev(rdev->bdev->bd_dev); |
| #endif |
| fput(rdev->bdev_file); |
| rdev->bdev = NULL; |
| kobject_put(&rdev->kobj); |
| } |
| |
| static void md_kick_rdev_from_array(struct md_rdev *rdev) |
| { |
| struct mddev *mddev = rdev->mddev; |
| |
| bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); |
| list_del_rcu(&rdev->same_set); |
| pr_debug("md: unbind<%pg>\n", rdev->bdev); |
| mddev_destroy_serial_pool(rdev->mddev, rdev); |
| WRITE_ONCE(rdev->mddev, NULL); |
| sysfs_remove_link(&rdev->kobj, "block"); |
| sysfs_put(rdev->sysfs_state); |
| sysfs_put(rdev->sysfs_unack_badblocks); |
| sysfs_put(rdev->sysfs_badblocks); |
| rdev->sysfs_state = NULL; |
| rdev->sysfs_unack_badblocks = NULL; |
| rdev->sysfs_badblocks = NULL; |
| rdev->badblocks.count = 0; |
| |
| synchronize_rcu(); |
| |
| /* |
| * kobject_del() will wait for all in progress writers to be done, where |
| * reconfig_mutex is held, hence it can't be called under |
| * reconfig_mutex and it's delayed to mddev_unlock(). |
| */ |
| list_add(&rdev->same_set, &mddev->deleting); |
| } |
| |
| static void export_array(struct mddev *mddev) |
| { |
| struct md_rdev *rdev; |
| |
| while (!list_empty(&mddev->disks)) { |
| rdev = list_first_entry(&mddev->disks, struct md_rdev, |
| same_set); |
| md_kick_rdev_from_array(rdev); |
| } |
| mddev->raid_disks = 0; |
| mddev->major_version = 0; |
| } |
| |
| static bool set_in_sync(struct mddev *mddev) |
| { |
| lockdep_assert_held(&mddev->lock); |
| if (!mddev->in_sync) { |
| mddev->sync_checkers++; |
| spin_unlock(&mddev->lock); |
| percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); |
| spin_lock(&mddev->lock); |
| if (!mddev->in_sync && |
| percpu_ref_is_zero(&mddev->writes_pending)) { |
| mddev->in_sync = 1; |
| /* |
| * Ensure ->in_sync is visible before we clear |
| * ->sync_checkers. |
| */ |
| smp_mb(); |
| set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| sysfs_notify_dirent_safe(mddev->sysfs_state); |
| } |
| if (--mddev->sync_checkers == 0) |
| percpu_ref_switch_to_percpu(&mddev->writes_pending); |
| } |
| if (mddev->safemode == 1) |
| mddev->safemode = 0; |
| return mddev->in_sync; |
| } |
| |
| static void sync_sbs(struct mddev *mddev, int nospares) |
| { |
| /* Update each superblock (in-memory image), but |
| * if we are allowed to, skip spares which already |
| * have the right event counter, or have one earlier |
| * (which would mean they aren't being marked as dirty |
| * with the rest of the array) |
| */ |
| struct md_rdev *rdev; |
| rdev_for_each(rdev, mddev) { |
| if (rdev->sb_events == mddev->events || |
| (nospares && |
| rdev->raid_disk < 0 && |
| rdev->sb_events+1 == mddev->events)) { |
| /* Don't update this superblock */ |
| rdev->sb_loaded = 2; |
| } else { |
| sync_super(mddev, rdev); |
| rdev->sb_loaded = 1; |
| } |
| } |
| } |
| |
| static bool does_sb_need_changing(struct mddev *mddev) |
| { |
| struct md_rdev *rdev = NULL, *iter; |
| struct mdp_superblock_1 *sb; |
| int role; |
| |
| /* Find a good rdev */ |
| rdev_for_each(iter, mddev) |
| if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { |
| rdev = iter; |
| break; |
| } |
| |
| /* No good device found. */ |
| if (!rdev) |
| return false; |
| |
| sb = page_address(rdev->sb_page); |
| /* Check if a device has become faulty or a spare become active */ |
| rdev_for_each(rdev, mddev) { |
| role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
| /* Device activated? */ |
| if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && |
| !test_bit(Faulty, &rdev->flags)) |
| return true; |
| /* Device turned faulty? */ |
| if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) |
| return true; |
| } |
| |
| /* Check if any mddev parameters have changed */ |
| if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || |
| (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || |
| (mddev->layout != le32_to_cpu(sb->layout)) || |
| (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || |
| (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) |
| return true; |
| |
| return false; |
| } |
| |
| void md_update_sb(struct mddev *mddev, int force_change) |
| { |
| struct md_rdev *rdev; |
| int sync_req; |
| int nospares = 0; |
| int any_badblocks_changed = 0; |
| int ret = -1; |
| |
| if (!md_is_rdwr(mddev)) { |
| if (force_change) |
| set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| return; |
| } |
| |
| repeat: |
| if (mddev_is_clustered(mddev)) { |
| if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) |
| force_change = 1; |
| if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) |
| nospares = 1; |
| ret = md_cluster_ops->metadata_update_start(mddev); |
		/* Has someone else updated the sb? */
| if (!does_sb_need_changing(mddev)) { |
| if (ret == 0) |
| md_cluster_ops->metadata_update_cancel(mddev); |
| bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
| BIT(MD_SB_CHANGE_DEVS) | |
| BIT(MD_SB_CHANGE_CLEAN)); |
| return; |
| } |
| } |
| |
| /* |
| * First make sure individual recovery_offsets are correct |
| * curr_resync_completed can only be used during recovery. |
| * During reshape/resync it might use array-addresses rather |
| * that device addresses. |
| */ |
| rdev_for_each(rdev, mddev) { |
| if (rdev->raid_disk >= 0 && |
| mddev->delta_disks >= 0 && |
| test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
| test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && |
| !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
| !test_bit(Journal, &rdev->flags) && |
| !test_bit(In_sync, &rdev->flags) && |
| mddev->curr_resync_completed > rdev->recovery_offset) |
| rdev->recovery_offset = mddev->curr_resync_completed; |
| |
| } |
| if (!mddev->persistent) { |
| clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| if (!mddev->external) { |
| clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| rdev_for_each(rdev, mddev) { |
| if (rdev->badblocks.changed) { |
| rdev->badblocks.changed = 0; |
| ack_all_badblocks(&rdev->badblocks); |
| md_error(mddev, rdev); |
| } |
| clear_bit(Blocked, &rdev->flags); |
| clear_bit(BlockedBadBlocks, &rdev->flags); |
| wake_up(&rdev->blocked_wait); |
| } |
| } |
| wake_up(&mddev->sb_wait); |
| return; |
| } |
| |
| spin_lock(&mddev->lock); |
| |
| mddev->utime = ktime_get_real_seconds(); |
| |
| if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) |
| force_change = 1; |
| if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) |
		/* just a clean <-> dirty transition, so possibly leave spares
		 * alone, though if 'events' isn't the right even/odd, we will
		 * have to update the spares after all
		 */
| nospares = 1; |
| if (force_change) |
| nospares = 0; |
| if (mddev->degraded) |
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it could be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep.
		 */
| nospares = 0; |
| |
| sync_req = mddev->in_sync; |
| |
| /* If this is just a dirty<->clean transition, and the array is clean |
| * and 'events' is odd, we can roll back to the previous clean state */ |
| if (nospares |
| && (mddev->in_sync && mddev->recovery_cp == MaxSector) |
| && mddev->can_decrease_events |
| && mddev->events != 1) { |
| mddev->events--; |
| mddev->can_decrease_events = 0; |
| } else { |
| /* otherwise we have to go forward and ... */ |
| mddev->events ++; |
| mddev->can_decrease_events = nospares; |
| } |
| |
| /* |
| * This 64-bit counter should never wrap. |
| * Either we are in around ~1 trillion A.C., assuming |
| * 1 reboot per second, or we have a bug... |
| */ |
| WARN_ON(mddev->events == 0); |
| |
| rdev_for_each(rdev, mddev) { |
| if (rdev->badblocks.changed) |
| any_badblocks_changed++; |
| if (test_bit(Faulty, &rdev->flags)) |
| set_bit(FaultRecorded, &rdev->flags); |
| } |
| |
| sync_sbs(mddev, nospares); |
| spin_unlock(&mddev->lock); |
| |
| pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", |
| mdname(mddev), mddev->in_sync); |
| |
| mddev_add_trace_msg(mddev, "md md_update_sb"); |
| rewrite: |
| md_bitmap_update_sb(mddev->bitmap); |
| rdev_for_each(rdev, mddev) { |
| if (rdev->sb_loaded != 1) |
| continue; /* no noise on spare devices */ |
| |
| if (!test_bit(Faulty, &rdev->flags)) { |
| md_super_write(mddev,rdev, |
| rdev->sb_start, rdev->sb_size, |
| rdev->sb_page); |
| pr_debug("md: (write) %pg's sb offset: %llu\n", |
| rdev->bdev, |
| (unsigned long long)rdev->sb_start); |
| rdev->sb_events = mddev->events; |
| if (rdev->badblocks.size) { |
| md_super_write(mddev, rdev, |
| rdev->badblocks.sector, |
| rdev->badblocks.size << 9, |
| rdev->bb_page); |
| rdev->badblocks.size = 0; |
| } |
| |
| } else |
| pr_debug("md: %pg (skipping faulty)\n", |
| rdev->bdev); |
| } |
| if (md_super_wait(mddev) < 0) |
| goto rewrite; |
| /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ |
| |
| if (mddev_is_clustered(mddev) && ret == 0) |
| md_cluster_ops->metadata_update_finish(mddev); |
| |
| if (mddev->in_sync != sync_req || |
| !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
| BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) |
| /* have to write it out again */ |
| goto repeat; |
| wake_up(&mddev->sb_wait); |
| if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
| sysfs_notify_dirent_safe(mddev->sysfs_completed); |
| |
| rdev_for_each(rdev, mddev) { |
| if (test_and_clear_bit(FaultRecorded, &rdev->flags)) |
| clear_bit(Blocked, &rdev->flags); |
| |
| if (any_badblocks_changed) |
| ack_all_badblocks(&rdev->badblocks); |
| clear_bit(BlockedBadBlocks, &rdev->flags); |
| wake_up(&rdev->blocked_wait); |
| } |
| } |
| EXPORT_SYMBOL(md_update_sb); |
| |
| static int add_bound_rdev(struct md_rdev *rdev) |
| { |
| struct mddev *mddev = rdev->mddev; |
| int err = 0; |
| bool add_journal = test_bit(Journal, &rdev->flags); |
| |
| if (!mddev->pers->hot_remove_disk || add_journal) { |
		/* If there is hot_add_disk but no hot_remove_disk,
		 * then added disks are for geometry changes
		 * and should be added immediately.
		 */
| super_types[mddev->major_version]. |
| validate_super(mddev, NULL/*freshest*/, rdev); |
| err = mddev->pers->hot_add_disk(mddev, rdev); |
| if (err) { |
| md_kick_rdev_from_array(rdev); |
| return err; |
| } |
| } |
| sysfs_notify_dirent_safe(rdev->sysfs_state); |
| |
| set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| if (mddev->degraded) |
| set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
| set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| md_new_event(); |
| return 0; |
| } |
| |
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match().
 */
| static int cmd_match(const char *cmd, const char *str) |
| { |
| /* See if cmd, written into a sysfs file, matches |
| * str. They must either be the same, or cmd can |
| * have a trailing newline |
| */ |
| while (*cmd && *str && *cmd == *str) { |
| cmd++; |
| str++; |
| } |
| if (*cmd == '\n') |
| cmd++; |
| if (*str || *cmd) |
| return 0; |
| return 1; |
| } |
| |
| struct rdev_sysfs_entry { |
| struct attribute attr; |
| ssize_t (*show)(struct md_rdev *, char *); |
| ssize_t (*store)(struct md_rdev *, const char *, size_t); |
| }; |
| |
| static ssize_t |
| state_show(struct md_rdev *rdev, char *page) |
| { |
| char *sep = ","; |
| size_t len = 0; |
| unsigned long flags = READ_ONCE(rdev->flags); |
| |
| if (test_bit(Faulty, &flags) || |
| (!test_bit(ExternalBbl, &flags) && |
|