Merge tag 'disintegrate-main-20121013' of git://git.infradead.org/users/dhowells/linux-headers
Pull UAPI disintegration for include/linux/{,byteorder/}*.h from David Howells:
"The patches contained herein do the following:
(1) Remove kernel-only stuff in linux/ppp-comp.h from the UAPI. I checked
this with Paul Mackerras before I created the patch and he suggested some
extra bits to unexport.
(2) Remove linux/blk_types.h entirely from the UAPI as none of it is userspace
applicable, and remove from the UAPI that part of linux/fs.h that was the
reason for linux/blk_types.h being exported in the first place. I
discussed this with Jens Axboe before creating the patch.
(3) The big patch of the series to disintegrate include/linux/*.h as a unit.
This could be split up, though there would be collisions in moving stuff
between the two Kbuild files when the parts are merged as that file is
sorted alphabetically rather than being grouped by subsystem.
Of this set of headers, 17 files have changed in the UAPI exported region
since the 4th and only 8 since the 9th so there isn't much change in this
area - as one might expect.
It should be pretty obvious and straightforward if it does come to fixing
up: stuff in __KERNEL__ guards stays where it is and stuff outside moves
to the same file in the include/uapi/linux/ directory.
If a new file appears then things get a bit more complicated as the
"headers +=" line has to move to include/uapi/linux/Kbuild. Only one new
file has appeared since the 9th and I judge this type of event relatively
unlikely.
(4) A patch to disintegrate include/linux/byteorder/*.h as a unit.
Signed-off-by: David Howells <dhowells@redhat.com>"
* tag 'disintegrate-main-20121013' of git://git.infradead.org/users/dhowells/linux-headers:
UAPI: (Scripted) Disintegrate include/linux/byteorder
UAPI: (Scripted) Disintegrate include/linux
UAPI: Unexport linux/blk_types.h
UAPI: Unexport part of linux/ppp-comp.h
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 1c18449..728c38c 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -132,3 +132,12 @@
which are 'A'live, and the array is 2/490221568 complete with recovery.
Faulty or missing devices are marked 'D'. Devices that are out-of-sync
are marked 'a'.
+
+
+Version History
+---------------
+1.0.0 Initial version. Support for RAID 4/5/6
+1.1.0 Added support for RAID 1
+1.2.0 Handle creation of arrays that contain failed devices.
+1.3.0 Added support for RAID 10
+1.3.1 Allow device replacement/rebuild for RAID 10
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 7140b6b..78de680 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -1,6 +1,4 @@
-include include/asm-generic/Kbuild.asm
-header-y += elf.h
header-y += ucontext.h
generic-y += atomic.h
diff --git a/arch/openrisc/include/asm/elf.h b/arch/openrisc/include/asm/elf.h
index 225a7ff..f4aa8a5 100644
--- a/arch/openrisc/include/asm/elf.h
+++ b/arch/openrisc/include/asm/elf.h
@@ -15,60 +15,12 @@
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
-
#ifndef __ASM_OPENRISC_ELF_H
#define __ASM_OPENRISC_ELF_H
-/*
- * This files is partially exported to userspace. This allows us to keep
- * the ELF bits in one place which should assist in keeping the kernel and
- * userspace in sync.
- */
-
-/*
- * ELF register definitions..
- */
-
-/* for struct user_regs_struct definition */
-#include <asm/ptrace.h>
-
-/* The OR1K relocation types... not all relevant for module loader */
-#define R_OR32_NONE 0
-#define R_OR32_32 1
-#define R_OR32_16 2
-#define R_OR32_8 3
-#define R_OR32_CONST 4
-#define R_OR32_CONSTH 5
-#define R_OR32_JUMPTARG 6
-#define R_OR32_VTINHERIT 7
-#define R_OR32_VTENTRY 8
-
-typedef unsigned long elf_greg_t;
-
-/*
- * Note that NGREG is defined to ELF_NGREG in include/linux/elfcore.h, and is
- * thus exposed to user-space.
- */
-#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-/* A placeholder; OR32 does not have fp support yes, so no fp regs for now. */
-typedef unsigned long elf_fpregset_t;
-
-/* This should be moved to include/linux/elf.h */
-#define EM_OR32 0x8472
-#define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_ARCH EM_OR32
-#define ELF_CLASS ELFCLASS32
-#define ELF_DATA ELFDATA2MSB
-
-#ifdef __KERNEL__
#include <linux/types.h>
+#include <uapi/asm/elf.h>
/*
* This is used to ensure we don't load something for the wrong architecture.
@@ -113,5 +65,4 @@
#define SET_PERSONALITY(ex) \
set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
-#endif /* __KERNEL__ */
#endif
diff --git a/arch/openrisc/include/asm/ptrace.h b/arch/openrisc/include/asm/ptrace.h
index 8555c0c..6ca1726 100644
--- a/arch/openrisc/include/asm/ptrace.h
+++ b/arch/openrisc/include/asm/ptrace.h
@@ -15,25 +15,12 @@
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
-
#ifndef __ASM_OPENRISC_PTRACE_H
#define __ASM_OPENRISC_PTRACE_H
-#ifndef __ASSEMBLY__
-/*
- * This is the layout of the regset returned by the GETREGSET ptrace call
- */
-struct user_regs_struct {
- /* GPR R0-R31... */
- unsigned long gpr[32];
- unsigned long pc;
- unsigned long sr;
-};
-#endif
-
-#ifdef __KERNEL__
#include <asm/spr_defs.h>
+#include <uapi/asm/ptrace.h>
/*
* Make kernel PTrace/register structures opaque to userspace... userspace can
@@ -134,6 +121,4 @@
#define PT_ORIG_GPR11 132
#define PT_SYSCALLNO 136
-#endif /* __KERNEL__ */
-
#endif /* __ASM_OPENRISC_PTRACE_H */
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index baebb3d..80761eb 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,3 +1,10 @@
# UAPI Header export list
include include/uapi/asm-generic/Kbuild.asm
+header-y += byteorder.h
+header-y += elf.h
+header-y += kvm_para.h
+header-y += param.h
+header-y += ptrace.h
+header-y += sigcontext.h
+header-y += unistd.h
diff --git a/arch/openrisc/include/asm/byteorder.h b/arch/openrisc/include/uapi/asm/byteorder.h
similarity index 100%
rename from arch/openrisc/include/asm/byteorder.h
rename to arch/openrisc/include/uapi/asm/byteorder.h
diff --git a/arch/openrisc/include/uapi/asm/elf.h b/arch/openrisc/include/uapi/asm/elf.h
new file mode 100644
index 0000000..f02ea58
--- /dev/null
+++ b/arch/openrisc/include/uapi/asm/elf.h
@@ -0,0 +1,69 @@
+/*
+ * OpenRISC Linux
+ *
+ * Linux architectural port borrowing liberally from similar works of
+ * others. All original copyrights apply as per the original source
+ * declaration.
+ *
+ * OpenRISC implementation:
+ * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
+ * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
+ * et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI__ASM_OPENRISC_ELF_H
+#define _UAPI__ASM_OPENRISC_ELF_H
+
+/*
+ * This files is partially exported to userspace. This allows us to keep
+ * the ELF bits in one place which should assist in keeping the kernel and
+ * userspace in sync.
+ */
+
+/*
+ * ELF register definitions..
+ */
+
+/* for struct user_regs_struct definition */
+#include <asm/ptrace.h>
+
+/* The OR1K relocation types... not all relevant for module loader */
+#define R_OR32_NONE 0
+#define R_OR32_32 1
+#define R_OR32_16 2
+#define R_OR32_8 3
+#define R_OR32_CONST 4
+#define R_OR32_CONSTH 5
+#define R_OR32_JUMPTARG 6
+#define R_OR32_VTINHERIT 7
+#define R_OR32_VTENTRY 8
+
+typedef unsigned long elf_greg_t;
+
+/*
+ * Note that NGREG is defined to ELF_NGREG in include/linux/elfcore.h, and is
+ * thus exposed to user-space.
+ */
+#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+/* A placeholder; OR32 does not have fp support yes, so no fp regs for now. */
+typedef unsigned long elf_fpregset_t;
+
+/* This should be moved to include/linux/elf.h */
+#define EM_OR32 0x8472
+#define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_ARCH EM_OR32
+#define ELF_CLASS ELFCLASS32
+#define ELF_DATA ELFDATA2MSB
+
+#endif /* _UAPI__ASM_OPENRISC_ELF_H */
diff --git a/arch/openrisc/include/asm/kvm_para.h b/arch/openrisc/include/uapi/asm/kvm_para.h
similarity index 100%
rename from arch/openrisc/include/asm/kvm_para.h
rename to arch/openrisc/include/uapi/asm/kvm_para.h
diff --git a/arch/openrisc/include/asm/param.h b/arch/openrisc/include/uapi/asm/param.h
similarity index 100%
rename from arch/openrisc/include/asm/param.h
rename to arch/openrisc/include/uapi/asm/param.h
diff --git a/arch/openrisc/include/uapi/asm/ptrace.h b/arch/openrisc/include/uapi/asm/ptrace.h
new file mode 100644
index 0000000..9760bd1
--- /dev/null
+++ b/arch/openrisc/include/uapi/asm/ptrace.h
@@ -0,0 +1,35 @@
+/*
+ * OpenRISC Linux
+ *
+ * Linux architectural port borrowing liberally from similar works of
+ * others. All original copyrights apply as per the original source
+ * declaration.
+ *
+ * OpenRISC implementation:
+ * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
+ * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
+ * et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI__ASM_OPENRISC_PTRACE_H
+#define _UAPI__ASM_OPENRISC_PTRACE_H
+
+#ifndef __ASSEMBLY__
+/*
+ * This is the layout of the regset returned by the GETREGSET ptrace call
+ */
+struct user_regs_struct {
+ /* GPR R0-R31... */
+ unsigned long gpr[32];
+ unsigned long pc;
+ unsigned long sr;
+};
+#endif
+
+
+#endif /* _UAPI__ASM_OPENRISC_PTRACE_H */
diff --git a/arch/openrisc/include/asm/sigcontext.h b/arch/openrisc/include/uapi/asm/sigcontext.h
similarity index 100%
rename from arch/openrisc/include/asm/sigcontext.h
rename to arch/openrisc/include/uapi/asm/sigcontext.h
diff --git a/arch/openrisc/include/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h
similarity index 100%
rename from arch/openrisc/include/asm/unistd.h
rename to arch/openrisc/include/uapi/asm/unistd.h
diff --git a/crypto/xor.c b/crypto/xor.c
index 65c7b41..35d6b3a 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -56,11 +56,11 @@
EXPORT_SYMBOL(xor_blocks);
/* Set of all registered templates. */
-static struct xor_block_template *template_list;
+static struct xor_block_template *__initdata template_list;
#define BENCH_SIZE (PAGE_SIZE)
-static void
+static void __init
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{
int speed;
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 94e7f6b..7155945 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -163,20 +163,17 @@
* As devices are only added or removed when raid_disk is < 0 and
* nr_pending is 0 and In_sync is clear, the entries we return will
* still be in the same position on the list when we re-enter
- * list_for_each_continue_rcu.
+ * list_for_each_entry_continue_rcu.
*/
- struct list_head *pos;
rcu_read_lock();
if (rdev == NULL)
/* start at the beginning */
- pos = &mddev->disks;
+ rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
else {
/* release the previous rdev and start from there. */
rdev_dec_pending(rdev, mddev);
- pos = &rdev->same_set;
}
- list_for_each_continue_rcu(pos, &mddev->disks) {
- rdev = list_entry(pos, struct md_rdev, same_set);
+ list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
/* this is a usable devices */
@@ -473,14 +470,10 @@
{
bitmap_super_t *sb;
unsigned long chunksize, daemon_sleep, write_behind;
- int err = -EINVAL;
bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
- if (IS_ERR(bitmap->storage.sb_page)) {
- err = PTR_ERR(bitmap->storage.sb_page);
- bitmap->storage.sb_page = NULL;
- return err;
- }
+ if (bitmap->storage.sb_page == NULL)
+ return -ENOMEM;
bitmap->storage.sb_page->index = 0;
sb = kmap_atomic(bitmap->storage.sb_page);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 982e3e3..45d94a7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -338,6 +338,84 @@
}
/*
+ * validate_rebuild_devices
+ * @rs
+ *
+ * Determine if the devices specified for rebuild can result in a valid
+ * usable array that is capable of rebuilding the given devices.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_rebuild_devices(struct raid_set *rs)
+{
+ unsigned i, rebuild_cnt = 0;
+ unsigned rebuilds_per_group, copies, d;
+
+ if (!(rs->print_flags & DMPF_REBUILD))
+ return 0;
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ rebuild_cnt++;
+
+ switch (rs->raid_type->level) {
+ case 1:
+ if (rebuild_cnt >= rs->md.raid_disks)
+ goto too_many;
+ break;
+ case 4:
+ case 5:
+ case 6:
+ if (rebuild_cnt > rs->raid_type->parity_devs)
+ goto too_many;
+ break;
+ case 10:
+ copies = raid10_md_layout_to_copies(rs->md.layout);
+ if (rebuild_cnt < copies)
+ break;
+
+ /*
+ * It is possible to have a higher rebuild count for RAID10,
+ * as long as the failed devices occur in different mirror
+ * groups (i.e. different stripes).
+ *
+ * Right now, we only allow for "near" copies. When other
+ * formats are added, we will have to check those too.
+ *
+ * When checking "near" format, make sure no adjacent devices
+ * have failed beyond what can be handled. In addition to the
+ * simple case where the number of devices is a multiple of the
+ * number of copies, we must also handle cases where the number
+ * of devices is not a multiple of the number of copies.
+ * E.g. dev1 dev2 dev3 dev4 dev5
+ * A A B B C
+ * C D D E E
+ */
+ rebuilds_per_group = 0;
+ for (i = 0; i < rs->md.raid_disks * copies; i++) {
+ d = i % rs->md.raid_disks;
+ if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
+ (++rebuilds_per_group >= copies))
+ goto too_many;
+ if (!((i + 1) % copies))
+ rebuilds_per_group = 0;
+ }
+ break;
+ default:
+ DMERR("The rebuild parameter is not supported for %s",
+ rs->raid_type->name);
+ rs->ti->error = "Rebuild not supported for this RAID type";
+ return -EINVAL;
+ }
+
+ return 0;
+
+too_many:
+ rs->ti->error = "Too many rebuild devices specified";
+ return -EINVAL;
+}
+
+/*
* Possible arguments are...
* <chunk_size> [optional_args]
*
@@ -365,7 +443,7 @@
{
char *raid10_format = "near";
unsigned raid10_copies = 2;
- unsigned i, rebuild_cnt = 0;
+ unsigned i;
unsigned long value, region_size = 0;
sector_t sectors_per_dev = rs->ti->len;
sector_t max_io_len;
@@ -461,31 +539,7 @@
/* Parameters that take a numeric value are checked here */
if (!strcasecmp(key, "rebuild")) {
- rebuild_cnt++;
-
- switch (rs->raid_type->level) {
- case 1:
- if (rebuild_cnt >= rs->md.raid_disks) {
- rs->ti->error = "Too many rebuild devices specified";
- return -EINVAL;
- }
- break;
- case 4:
- case 5:
- case 6:
- if (rebuild_cnt > rs->raid_type->parity_devs) {
- rs->ti->error = "Too many rebuild devices specified for given RAID type";
- return -EINVAL;
- }
- break;
- case 10:
- default:
- DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
- rs->ti->error = "Rebuild not supported for this RAID type";
- return -EINVAL;
- }
-
- if (value > rs->md.raid_disks) {
+ if (value >= rs->md.raid_disks) {
rs->ti->error = "Invalid rebuild index given";
return -EINVAL;
}
@@ -608,6 +662,9 @@
}
rs->md.dev_sectors = sectors_per_dev;
+ if (validate_rebuild_devices(rs))
+ return -EINVAL;
+
/* Assume there are no metadata devices until the drives are parsed */
rs->md.persistent = 0;
rs->md.external = 1;
@@ -960,6 +1017,19 @@
freshest = NULL;
rdev_for_each_safe(rdev, tmp, mddev) {
+ /*
+ * Skipping super_load due to DMPF_SYNC will cause
+ * the array to undergo initialization again as
+ * though it were new. This is the intended effect
+ * of the "sync" directive.
+ *
+ * When reshaping capability is added, we must ensure
+ * that the "sync" directive is disallowed during the
+ * reshape.
+ */
+ if (rs->print_flags & DMPF_SYNC)
+ continue;
+
if (!rdev->meta_bdev)
continue;
@@ -1360,7 +1430,7 @@
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 3, 0},
+ .version = {1, 3, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fa211d8..2101483 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -138,6 +138,7 @@
struct linear_conf *conf;
struct md_rdev *rdev;
int i, cnt;
+ bool discard_supported = false;
conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
GFP_KERNEL);
@@ -171,6 +172,8 @@
conf->array_sectors += rdev->sectors;
cnt++;
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
}
if (cnt != raid_disks) {
printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -178,6 +181,11 @@
goto out;
}
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
/*
* Here we calculate the device offsets.
*/
@@ -244,7 +252,9 @@
if (!newconf)
return -ENOMEM;
- oldconf = rcu_dereference(mddev->private);
+ oldconf = rcu_dereference_protected(mddev->private,
+ lockdep_is_held(
+ &mddev->reconfig_mutex));
mddev->raid_disks++;
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
@@ -256,7 +266,10 @@
static int linear_stop (struct mddev *mddev)
{
- struct linear_conf *conf = mddev->private;
+ struct linear_conf *conf =
+ rcu_dereference_protected(mddev->private,
+ lockdep_is_held(
+ &mddev->reconfig_mutex));
/*
* We do not require rcu protection here since
@@ -326,6 +339,14 @@
bio->bi_sector = bio->bi_sector - start_sector
+ tmp_dev->rdev->data_offset;
rcu_read_unlock();
+
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ return;
+ }
+
generic_make_request(bio);
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 95c8801..9ab768a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -674,7 +674,18 @@
return NULL;
}
-static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
+static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->desc_nr == nr)
+ return rdev;
+
+ return NULL;
+}
+
+static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
struct md_rdev *rdev;
@@ -685,6 +696,17 @@
return NULL;
}
+static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+
+ return NULL;
+}
+
static struct md_personality *find_pers(int level, char *clevel)
{
struct md_personality *pers;
@@ -2022,8 +2044,14 @@
/* Disable data integrity if non-capable/non-matching disk is being added */
void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
- struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
- struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
+ struct blk_integrity *bi_rdev;
+ struct blk_integrity *bi_mddev;
+
+ if (!mddev->gendisk)
+ return;
+
+ bi_rdev = bdev_get_integrity(rdev->bdev);
+ bi_mddev = blk_get_integrity(mddev->gendisk);
if (!bi_mddev) /* nothing to do */
return;
@@ -3754,6 +3782,8 @@
return -EINVAL;
mddev->recovery_cp = n;
+ if (mddev->pers)
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
return len;
}
static struct md_sysfs_entry md_resync_start =
@@ -4231,6 +4261,13 @@
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
+ if (mddev->ro == 2) {
+ /* A write to sync_action is enough to justify
+ * canceling read-auto mode
+ */
+ mddev->ro = 0;
+ md_wakeup_thread(mddev->sync_thread);
+ }
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -4241,7 +4278,8 @@
mismatch_cnt_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%llu\n",
- (unsigned long long) mddev->resync_mismatches);
+ (unsigned long long)
+ atomic64_read(&mddev->resync_mismatches));
}
static struct md_sysfs_entry md_scan_mode =
@@ -4362,6 +4400,10 @@
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
+ if (mddev->curr_resync == 1 ||
+ mddev->curr_resync == 2)
+ return sprintf(page, "delayed\n");
+
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
@@ -5207,7 +5249,7 @@
mddev->new_layout = 0;
mddev->new_chunk_sectors = 0;
mddev->curr_resync = 0;
- mddev->resync_mismatches = 0;
+ atomic64_set(&mddev->resync_mismatches, 0);
mddev->suspend_lo = mddev->suspend_hi = 0;
mddev->sync_speed_min = mddev->sync_speed_max = 0;
mddev->recovery = 0;
@@ -5509,8 +5551,9 @@
int nr,working,insync,failed,spare;
struct md_rdev *rdev;
- nr=working=insync=failed=spare=0;
- rdev_for_each(rdev, mddev) {
+ nr = working = insync = failed = spare = 0;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
nr++;
if (test_bit(Faulty, &rdev->flags))
failed++;
@@ -5522,6 +5565,7 @@
spare++;
}
}
+ rcu_read_unlock();
info.major_version = mddev->major_version;
info.minor_version = mddev->minor_version;
@@ -5605,7 +5649,8 @@
if (copy_from_user(&info, arg, sizeof(info)))
return -EFAULT;
- rdev = find_rdev_nr(mddev, info.number);
+ rcu_read_lock();
+ rdev = find_rdev_nr_rcu(mddev, info.number);
if (rdev) {
info.major = MAJOR(rdev->bdev->bd_dev);
info.minor = MINOR(rdev->bdev->bd_dev);
@@ -5624,6 +5669,7 @@
info.raid_disk = -1;
info.state = (1<<MD_DISK_REMOVED);
}
+ rcu_read_unlock();
if (copy_to_user(arg, &info, sizeof(info)))
return -EFAULT;
@@ -6232,18 +6278,22 @@
static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
struct md_rdev *rdev;
+ int err = 0;
if (mddev->pers == NULL)
return -ENODEV;
- rdev = find_rdev(mddev, dev);
+ rcu_read_lock();
+ rdev = find_rdev_rcu(mddev, dev);
if (!rdev)
- return -ENODEV;
-
- md_error(mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- return -EBUSY;
- return 0;
+ err = -ENODEV;
+ else {
+ md_error(mddev, rdev);
+ if (!test_bit(Faulty, &rdev->flags))
+ err = -EBUSY;
+ }
+ rcu_read_unlock();
+ return err;
}
/*
@@ -6315,6 +6365,27 @@
goto abort;
}
+ /* Some actions do not requires the mutex */
+ switch (cmd) {
+ case GET_ARRAY_INFO:
+ if (!mddev->raid_disks && !mddev->external)
+ err = -ENODEV;
+ else
+ err = get_array_info(mddev, argp);
+ goto abort;
+
+ case GET_DISK_INFO:
+ if (!mddev->raid_disks && !mddev->external)
+ err = -ENODEV;
+ else
+ err = get_disk_info(mddev, argp);
+ goto abort;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, new_decode_dev(arg));
+ goto abort;
+ }
+
err = mddev_lock(mddev);
if (err) {
printk(KERN_INFO
@@ -6387,18 +6458,10 @@
*/
switch (cmd)
{
- case GET_ARRAY_INFO:
- err = get_array_info(mddev, argp);
- goto done_unlock;
-
case GET_BITMAP_FILE:
err = get_bitmap_file(mddev, argp);
goto done_unlock;
- case GET_DISK_INFO:
- err = get_disk_info(mddev, argp);
- goto done_unlock;
-
case RESTART_ARRAY_RW:
err = restart_array(mddev);
goto done_unlock;
@@ -6480,10 +6543,6 @@
err = hot_add_disk(mddev, new_decode_dev(arg));
goto done_unlock;
- case SET_DISK_FAULTY:
- err = set_disk_faulty(mddev, new_decode_dev(arg));
- goto done_unlock;
-
case RUN_ARRAY:
err = do_md_run(mddev);
goto done_unlock;
@@ -6641,7 +6700,7 @@
clear_bit(THREAD_WAKEUP, &thread->flags);
if (!kthread_should_stop())
- thread->run(thread->mddev);
+ thread->run(thread);
}
return 0;
@@ -6656,8 +6715,8 @@
}
}
-struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev,
- const char *name)
+struct md_thread *md_register_thread(void (*run) (struct md_thread *),
+ struct mddev *mddev, const char *name)
{
struct md_thread *thread;
@@ -6752,7 +6811,11 @@
int scale;
unsigned int per_milli;
- resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
+ if (mddev->curr_resync <= 3)
+ resync = 0;
+ else
+ resync = mddev->curr_resync
+ - atomic_read(&mddev->recovery_active);
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
@@ -6978,7 +7041,7 @@
if (mddev->curr_resync > 2) {
status_resync(seq, mddev);
seq_printf(seq, "\n ");
- } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ } else if (mddev->curr_resync >= 1)
seq_printf(seq, "\tresync=DELAYED\n ");
else if (mddev->recovery_cp < MaxSector)
seq_printf(seq, "\tresync=PENDING\n ");
@@ -7206,8 +7269,9 @@
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
-void md_do_sync(struct mddev *mddev)
+void md_do_sync(struct md_thread *thread)
{
+ struct mddev *mddev = thread->mddev;
struct mddev *mddev2;
unsigned int currspeed = 0,
window;
@@ -7311,7 +7375,7 @@
* which defaults to physical size, but can be virtual size
*/
max_sectors = mddev->resync_max_sectors;
- mddev->resync_mismatches = 0;
+ atomic64_set(&mddev->resync_mismatches, 0);
/* we don't use the checkpoint if there's a bitmap */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
j = mddev->resync_min;
@@ -7367,8 +7431,11 @@
"md: resuming %s of %s from checkpoint.\n",
desc, mdname(mddev));
mddev->curr_resync = j;
- }
+ } else
+ mddev->curr_resync = 3; /* no longer delayed */
mddev->curr_resync_completed = j;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ md_new_event(mddev);
blk_start_plug(&plug);
while (j < max_sectors) {
@@ -7421,7 +7488,8 @@
break;
j += sectors;
- if (j>1) mddev->curr_resync = j;
+ if (j > 2)
+ mddev->curr_resync = j;
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
@@ -7543,8 +7611,6 @@
int spares = 0;
int removed = 0;
- mddev->curr_resync_completed = 0;
-
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
@@ -7739,6 +7805,7 @@
/* Set RUNNING before clearing NEEDED to avoid
* any transients in the value of "sync_action".
*/
+ mddev->curr_resync_completed = 0;
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
/* Clear some bits that don't mean anything, but
* might be left set
@@ -7752,7 +7819,7 @@
/* no recovery is running.
* remove any failed drives, then
* add spares if possible.
- * Spare are also removed and re-added, to allow
+ * Spares are also removed and re-added, to allow
* the personality to fail the re-add.
*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f385b03..af443ab 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -282,7 +282,7 @@
sector_t resync_max_sectors; /* may be set by personality */
- sector_t resync_mismatches; /* count of sectors where
+ atomic64_t resync_mismatches; /* count of sectors where
* parity/replica mismatch found
*/
@@ -540,12 +540,13 @@
list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
struct md_thread {
- void (*run) (struct mddev *mddev);
+ void (*run) (struct md_thread *thread);
struct mddev *mddev;
wait_queue_head_t wqueue;
unsigned long flags;
struct task_struct *tsk;
unsigned long timeout;
+ void *private;
};
#define THREAD_WAKEUP 0
@@ -584,7 +585,7 @@
extern int register_md_personality(struct md_personality *p);
extern int unregister_md_personality(struct md_personality *p);
extern struct md_thread *md_register_thread(
- void (*run)(struct mddev *mddev),
+ void (*run)(struct md_thread *thread),
struct mddev *mddev,
const char *name);
extern void md_unregister_thread(struct md_thread **threadp);
@@ -603,7 +604,7 @@
extern void md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, int rw, bool metadata_op);
-extern void md_do_sync(struct mddev *mddev);
+extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(struct mddev *mddev);
extern int md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 61a1833..1642eae 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -335,8 +335,9 @@
* 3. Performs writes following reads for array syncronising.
*/
-static void multipathd (struct mddev *mddev)
+static void multipathd(struct md_thread *thread)
{
+ struct mddev *mddev = thread->mddev;
struct multipath_bh *mp_bh;
struct bio *bio;
unsigned long flags;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a9e4fa9..24b3597 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -88,6 +88,7 @@
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+ bool discard_supported = false;
if (!conf)
return -ENOMEM;
@@ -195,6 +196,9 @@
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
+
+ if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
+ discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -272,6 +276,11 @@
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
@@ -423,6 +432,7 @@
return -EINVAL;
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
@@ -510,7 +520,7 @@
sector_t sector = bio->bi_sector;
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
- if (bio->bi_vcnt != 1 ||
+ if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
bio->bi_idx != 0)
goto bad_map;
/* This is a one page bio that upper layers
@@ -536,6 +546,13 @@
bio->bi_sector = sector_offset + zone->dev_start +
tmp_dev->data_offset;
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ return;
+ }
+
generic_make_request(bio);
return;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 611b5f7..8034fbd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -333,9 +333,10 @@
spin_unlock_irqrestore(&conf->device_lock, flags);
}
- if (uptodate)
+ if (uptodate) {
raid_end_bio_io(r1_bio);
- else {
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+ } else {
/*
* oops, read error:
*/
@@ -349,9 +350,8 @@
(unsigned long long)r1_bio->sector);
set_bit(R1BIO_ReadError, &r1_bio->state);
reschedule_retry(r1_bio);
+ /* don't drop the reference on read_disk yet */
}
-
- rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
static void close_write(struct r1bio *r1_bio)
@@ -781,7 +781,12 @@
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
} else
@@ -994,6 +999,8 @@
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+ const unsigned long do_discard = (bio->bi_rw
+ & (REQ_DISCARD | REQ_SECURE));
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
@@ -1295,7 +1302,7 @@
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
@@ -1549,6 +1556,8 @@
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
return err;
}
@@ -1867,7 +1876,7 @@
} else
j = 0;
if (j >= 0)
- mddev->resync_mismatches += r1_bio->sectors;
+ atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
&& test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
/* No need to write to this device. */
@@ -2220,6 +2229,7 @@
unfreeze_array(conf);
} else
md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
bio = r1_bio->bios[r1_bio->read_disk];
bdevname(bio->bi_bdev, b);
@@ -2285,8 +2295,9 @@
}
}
-static void raid1d(struct mddev *mddev)
+static void raid1d(struct md_thread *thread)
{
+ struct mddev *mddev = thread->mddev;
struct r1bio *r1_bio;
unsigned long flags;
struct r1conf *conf = mddev->private;
@@ -2783,6 +2794,7 @@
int i;
struct md_rdev *rdev;
int ret;
+ bool discard_supported = false;
if (mddev->level != 1) {
printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2812,6 +2824,8 @@
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
}
mddev->degraded = 0;
@@ -2846,6 +2860,13 @@
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
+
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
}
ret = md_integrity_register(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0138a72..906ccbd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -911,7 +911,12 @@
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
} else
@@ -1050,6 +1055,44 @@
return rdev->new_data_offset;
}
+struct raid10_plug_cb {
+ struct blk_plug_cb cb;
+ struct bio_list pending;
+ int pending_cnt;
+};
+
+static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
+ cb);
+ struct mddev *mddev = plug->cb.data;
+ struct r10conf *conf = mddev->private;
+ struct bio *bio;
+
+ if (from_schedule) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->pending_bio_list, &plug->pending);
+ conf->pending_count += plug->pending_cnt;
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(mddev->thread);
+ kfree(plug);
+ return;
+ }
+
+ /* we aren't scheduling, so we can do the write-out directly. */
+ bio = bio_list_get(&plug->pending);
+ bitmap_unplug(mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ kfree(plug);
+}
+
static void make_request(struct mddev *mddev, struct bio * bio)
{
struct r10conf *conf = mddev->private;
@@ -1061,8 +1104,12 @@
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
+ const unsigned long do_discard = (bio->bi_rw
+ & (REQ_DISCARD | REQ_SECURE));
unsigned long flags;
struct md_rdev *blocked_rdev;
+ struct blk_plug_cb *cb;
+ struct raid10_plug_cb *plug = NULL;
int sectors_handled;
int max_sectors;
int sectors;
@@ -1081,7 +1128,7 @@
|| conf->prev.near_copies < conf->prev.raid_disks))) {
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
- if (bio->bi_vcnt != 1 ||
+ if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
bio->bi_idx != 0)
goto bad_map;
/* This is a one page bio that upper layers
@@ -1410,15 +1457,26 @@
conf->mirrors[d].rdev));
mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua;
+ mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb, cb);
+ else
+ plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!mddev_check_plugged(mddev))
+ if (!plug)
md_wakeup_thread(mddev->thread);
if (!r10_bio->devs[i].repl_bio)
@@ -1439,7 +1497,7 @@
conf->mirrors[d].replacement));
mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua;
+ mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1638,7 +1696,7 @@
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
- sysfs_notify_dirent(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -1725,6 +1783,9 @@
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
print_conf(conf);
return err;
}
@@ -1952,7 +2013,7 @@
break;
if (j == vcnt)
continue;
- mddev->resync_mismatches += r10_bio->sectors;
+ atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
/* Don't fix anything. */
continue;
@@ -2673,8 +2734,9 @@
}
}
-static void raid10d(struct mddev *mddev)
+static void raid10d(struct md_thread *thread)
{
+ struct mddev *mddev = thread->mddev;
struct r10bio *r10_bio;
unsigned long flags;
struct r10conf *conf = mddev->private;
@@ -3158,7 +3220,7 @@
else {
bad_sectors -= (sector - first_bad);
if (max_sync > bad_sectors)
- max_sync = max_sync;
+ max_sync = bad_sectors;
continue;
}
}
@@ -3482,6 +3544,7 @@
sector_t size;
sector_t min_offset_diff = 0;
int first = 1;
+ bool discard_supported = false;
if (mddev->private == NULL) {
conf = setup_conf(mddev);
@@ -3498,6 +3561,8 @@
chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
+ blk_queue_max_discard_sectors(mddev->queue,
+ mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -3543,8 +3608,16 @@
rdev->data_offset << 9);
disk->head_position = 0;
+
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
}
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0689173..c5439dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -551,6 +551,8 @@
rw = WRITE_FUA;
else
rw = WRITE;
+ if (test_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else if (test_and_clear_bit(R5_WantReplace,
@@ -1174,8 +1176,11 @@
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ if (wbi->bi_rw & REQ_DISCARD)
+ set_bit(R5_Discard, &dev->flags);
+ else
+ tx = async_copy_data(1, wbi, dev->page,
+ dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1191,7 +1196,7 @@
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
int i;
- bool fua = false, sync = false;
+ bool fua = false, sync = false, discard = false;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1199,13 +1204,15 @@
for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+ discard |= test_bit(R5_Discard, &sh->dev[i].flags);
}
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- set_bit(R5_UPTODATE, &dev->flags);
+ if (!discard)
+ set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
if (sync)
@@ -1241,6 +1248,18 @@
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
@@ -1285,10 +1304,24 @@
{
struct async_submit_ctl submit;
struct page **blocks = percpu->scribble;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
@@ -2408,11 +2441,11 @@
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
- spin_unlock_irq(&sh->stripe_lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)(*bip)->bi_sector,
(unsigned long long)sh->sector, dd_idx);
+ spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap && firstwrite) {
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2479,10 +2512,8 @@
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
spin_unlock_irq(&sh->stripe_lock);
- if (bi) {
- s->to_write--;
+ if (bi)
bitmap_end = 1;
- }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -2524,11 +2555,12 @@
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
+ spin_lock_irq(&sh->stripe_lock);
bi = sh->dev[i].toread;
sh->dev[i].toread = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (bi) s->to_read--;
while (bi && bi->bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
@@ -2741,7 +2773,8 @@
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags)) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_and_clear_bit(R5_Discard, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
@@ -2775,12 +2808,25 @@
int disks)
{
int rmw = 0, rcw = 0, i;
- if (conf->max_degraded == 2) {
- /* RAID6 requires 'rcw' in current implementation
- * Calculate the real rcw later - for now fake it
+ sector_t recovery_cp = conf->mddev->recovery_cp;
+
+ /* RAID6 requires 'rcw' in current implementation.
+ * Otherwise, check whether resync is now happening or should start.
+ * If yes, then the array is dirty (after unclean shutdown or
+ * initial creation), so parity in some stripes might be inconsistent.
+ * In this case, we need to always do reconstruct-write, to ensure
+ * that in case of drive failure or read-error correction, we
+ * generate correct data from the parity.
+ */
+ if (conf->max_degraded == 2 ||
+ (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+ /* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
rcw = 1; rmw = 2;
+ pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
+ conf->max_degraded, (unsigned long long)recovery_cp,
+ (unsigned long long)sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
@@ -2932,7 +2978,7 @@
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
- conf->mddev->resync_mismatches += STRIPE_SECTORS;
+ atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -3084,7 +3130,7 @@
*/
}
} else {
- conf->mddev->resync_mismatches += STRIPE_SECTORS;
+ atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -3459,10 +3505,12 @@
if (s.written &&
(s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
&& !test_bit(R5_LOCKED, &pdev->flags)
- && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+ && (test_bit(R5_UPTODATE, &pdev->flags) ||
+ test_bit(R5_Discard, &pdev->flags))))) &&
(s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
- && test_bit(R5_UPTODATE, &qdev->flags)))))
+ && (test_bit(R5_UPTODATE, &qdev->flags) ||
+ test_bit(R5_Discard, &qdev->flags))))))
handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
/* Now we might consider reading some blocks, either to check/generate
@@ -3489,9 +3537,11 @@
/* All the 'written' buffers and the parity block are ready to
* be written back to disk
*/
- BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
BUG_ON(sh->qd_idx >= 0 &&
- !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+ !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -4072,6 +4122,88 @@
release_stripe(sh);
}
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+ int remaining;
+ int stripe_sectors;
+
+ if (mddev->reshape_position != MaxSector)
+ /* Skip discard while reshape is happening */
+ return;
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= conf->chunk_sectors;
+ last_sector *= conf->chunk_sectors;
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS) {
+ DEFINE_WAIT(w);
+ int d;
+ again:
+ sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock_irq(&sh->stripe_lock);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ if (sh->dev[d].towrite || sh->dev[d].toread) {
+ set_bit(R5_Overlap, &sh->dev[d].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ sh->dev[d].towrite = bi;
+ set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+ raid5_inc_bi_active_stripes(bi);
+ }
+ spin_unlock_irq(&sh->stripe_lock);
+ if (conf->mddev->bitmap) {
+ for (d = 0;
+ d < conf->raid_disks - conf->max_degraded;
+ d++)
+ bitmap_startwrite(mddev->bitmap,
+ sh->sector,
+ STRIPE_SECTORS,
+ 0);
+ sh->bm_seq = conf->seq_flush + 1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe_plug(mddev, sh);
+ }
+
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+ md_write_end(mddev);
+ bio_endio(bi, 0);
+ }
+}
+
static void make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
@@ -4094,6 +4226,11 @@
chunk_aligned_read(mddev,bi))
return;
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ make_discard_request(mddev, bi);
+ return;
+ }
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
@@ -4630,8 +4767,9 @@
* During the scan, completed stripes are saved for us by the interrupt
* handler, so that they will not have to wait for our next wakeup.
*/
-static void raid5d(struct mddev *mddev)
+static void raid5d(struct md_thread *thread)
{
+ struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
int handled;
struct blk_plug plug;
@@ -5366,6 +5504,7 @@
if (mddev->queue) {
int chunk_size;
+ bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -5385,13 +5524,48 @@
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * unaligned part of discard request will be ignored, so can't
+ * guarantee discard_zerors_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->
+ limits.discard_zeroes_data)
+ discard_supported = false;
}
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
}
return 0;
@@ -5702,7 +5876,8 @@
if (!check_stripe_cache(mddev))
return -ENOSPC;
- return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
+ return resize_stripes(conf, (conf->previous_raid_disks
+ + mddev->delta_disks));
}
static int raid5_start_reshape(struct mddev *mddev)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a9fc249..18b2c4a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -298,6 +298,7 @@
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/
+ R5_Discard, /* Discard the stripe */
};
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e9ebb47..81e407d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,8 +2952,8 @@
struct btrfs_inode_item *item,
struct inode *inode, int log_inode_only)
{
- btrfs_set_inode_uid(leaf, item, inode->i_uid);
- btrfs_set_inode_gid(leaf, item, inode->i_gid);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
btrfs_set_inode_mode(leaf, item, inode->i_mode);
btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 11efd83..9fbea87 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -45,7 +45,7 @@
break;
case ACL_GROUP:
gid = make_kgid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kuid(to, uid));
+ entry->e_id = cpu_to_le32(from_kgid(to, gid));
break;
default:
break;
diff --git a/include/linux/spi/Kbuild b/include/linux/spi/Kbuild
index d375a08..e69de29 100644
--- a/include/linux/spi/Kbuild
+++ b/include/linux/spi/Kbuild
@@ -1 +0,0 @@
-header-y += spidev.h
diff --git a/include/uapi/linux/spi/Kbuild b/include/uapi/linux/spi/Kbuild
index aafaa5aa5..0cc747e 100644
--- a/include/uapi/linux/spi/Kbuild
+++ b/include/uapi/linux/spi/Kbuild
@@ -1 +1,2 @@
# UAPI Header export list
+header-y += spidev.h
diff --git a/include/linux/spi/spidev.h b/include/uapi/linux/spi/spidev.h
similarity index 100%
rename from include/linux/spi/spidev.h
rename to include/uapi/linux/spi/spidev.h
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 9d49ee6..ba033f0 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -591,7 +591,7 @@
atomic_read(&sk->sk_refcnt),
sk_rmem_alloc_get(sk),
sk_wmem_alloc_get(sk),
- sock_i_uid(sk),
+ from_kuid(seq_user_ns(seq), sock_i_uid(sk)),
sock_i_ino(sk),
&src_baswapped,
&dst_baswapped,