Blame - fs/ocfs2/cluster/quorum.c - linux

blob: 189c111bc37139b66b38707f470233806ee33096 [file] [log] [blame]

Thomas Gleixner	328970d	2019-05-24 12:04:05 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-or-later
Masahiro Yamada	fa60ce2	2021-05-06 18:06:44 -0700	[diff] [blame]	2	/*
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	3	*
				4	* Copyright (C) 2005 Oracle. All rights reserved.
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	5	*/
				6
				7	/* This quorum hack is only here until we transition to some more rational
				8	* approach that is driven from userspace. Honest. No foolin'.
				9	*
				10	* Imagine two nodes lose network connectivity to each other but they're still
				11	* up and operating in every other way. Presumably a network timeout indicates
				12	* that a node is broken and should be recovered. They can't both recover each
				13	* other and both carry on without serialising their access to the file system.
				14	* They need to decide who is authoritative. Now extend that problem to
				15	* arbitrary groups of nodes losing connectivity between each other.
				16	*
				17	* So we declare that a node which has given up on connecting to a majority
				18	* of nodes who are still heartbeating will fence itself.
				19	*
				20	* There are huge opportunities for races here. After we give up on a node's
				21	* connection we need to wait long enough to give heartbeat an opportunity
				22	* to declare the node as truly dead. We also need to be careful with the
				23	* race between when we see a node start heartbeating and when we connect
				24	* to it.
				25	*
				26	* So nodes that are in this transition put a hold on the quorum decision
				27	* with a counter. As they fall out of this transition they drop the count
				28	* and if they're the last, they fire off the decision.
				29	*/
				30	#include <linux/kernel.h>
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	31	#include <linux/workqueue.h>
Sunil Mushran	bebe6f1	2007-04-17 13:53:38 -0700	[diff] [blame]	32	#include <linux/reboot.h>
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	33
				34	#include "heartbeat.h"
				35	#include "nodemanager.h"
				36	#define MLOG_MASK_PREFIX ML_QUORUM
				37	#include "masklog.h"
				38	#include "quorum.h"
				39
				40	static struct o2quo_state {
				41	spinlock_t qs_lock;
				42	struct work_struct qs_work;
				43	int qs_pending;
				44	int qs_heartbeating;
				45	unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
				46	int qs_connected;
				47	unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
				48	int qs_holds;
				49	unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
				50	} o2quo_state;
				51
				52	/* this is horribly heavy-handed. It should instead flip the file
				53	* system RO and call some userspace script. */
				54	static void o2quo_fence_self(void)
				55	{
				56	/* panic spins with interrupts enabled. with preempt
				57	* threads can still schedule, etc, etc */
				58	o2hb_stop_all_regions();
Sunil Mushran	bebe6f1	2007-04-17 13:53:38 -0700	[diff] [blame]	59
Sunil Mushran	f6656d2	2009-11-17 16:29:19 -0800	[diff] [blame]	60	switch (o2nm_single_cluster->cl_fence_method) {
				61	case O2NM_FENCE_PANIC:
				62	panic("*** ocfs2 is very sorry to be fencing this system by "
				63	"panicing ***\n");
				64	break;
				65	default:
				66	WARN_ON(o2nm_single_cluster->cl_fence_method >=
				67	O2NM_FENCE_METHODS);
Gustavo A. R. Silva	df561f66	2020-08-23 17:36:59 -0500	[diff] [blame]	68	fallthrough;
Sunil Mushran	f6656d2	2009-11-17 16:29:19 -0800	[diff] [blame]	69	case O2NM_FENCE_RESET:
				70	printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
				71	"system by restarting ***\n");
				72	emergency_restart();
				73	break;
zhengbin	5b43d64	2020-01-30 22:11:36 -0800	[diff] [blame]	74	}
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	75	}
				76
/*
 * Report that a write to a heartbeat region timed out.
 *
 * By that point the other nodes in the cluster may already consider us
 * dead, so we "fence" ourselves to avoid scribbling on the disk after they
 * think they have recovered us. This cannot solve every problem related to
 * writeout after recovery, but the hack at least closes some of those gaps.
 * Once real fencing exists this can go away, because our node would be
 * fenced externally before the other nodes begin recovery.
 */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}
				89
David Howells	c402895	2006-11-22 14:57:56 +0000	[diff] [blame]	90	static void o2quo_make_decision(struct work_struct *work)
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	91	{
				92	int quorum;
				93	int lowest_hb, lowest_reachable = 0, fence = 0;
				94	struct o2quo_state *qs = &o2quo_state;
				95
				96	spin_lock(&qs->qs_lock);
				97
				98	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
				99	if (lowest_hb != O2NM_MAX_NODES)
				100	lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
				101
				102	mlog(0, "heartbeating: %d, connected: %d, "
				103	"lowest: %d (%sreachable)\n", qs->qs_heartbeating,
				104	qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
				105
				106	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) \|\|
				107	qs->qs_heartbeating == 1)
				108	goto out;
				109
				110	if (qs->qs_heartbeating & 1) {
				111	/* the odd numbered cluster case is straight forward --
				112	* if we can't talk to the majority we're hosed */
				113	quorum = (qs->qs_heartbeating + 1)/2;
				114	if (qs->qs_connected < quorum) {
				115	mlog(ML_ERROR, "fencing this node because it is "
				116	"only connected to %u nodes and %u is needed "
				117	"to make a quorum out of %u heartbeating nodes\n",
				118	qs->qs_connected, quorum,
				119	qs->qs_heartbeating);
				120	fence = 1;
				121	}
				122	} else {
				123	/* the even numbered cluster adds the possibility of each half
				124	* of the cluster being able to talk amongst themselves.. in
				125	* that case we're hosed if we can't talk to the group that has
				126	* the lowest numbered node */
				127	quorum = qs->qs_heartbeating / 2;
				128	if (qs->qs_connected < quorum) {
				129	mlog(ML_ERROR, "fencing this node because it is "
				130	"only connected to %u nodes and %u is needed "
				131	"to make a quorum out of %u heartbeating nodes\n",
				132	qs->qs_connected, quorum,
				133	qs->qs_heartbeating);
				134	fence = 1;
				135	}
				136	else if ((qs->qs_connected == quorum) &&
				137	!lowest_reachable) {
				138	mlog(ML_ERROR, "fencing this node because it is "
				139	"connected to a half-quorum of %u out of %u "
				140	"nodes which doesn't include the lowest active "
				141	"node %u\n", quorum, qs->qs_heartbeating,
				142	lowest_hb);
				143	fence = 1;
				144	}
				145	}
				146
				147	out:
Junxiao Bi	8c7b638	2014-08-29 15:19:04 -0700	[diff] [blame]	148	if (fence) {
				149	spin_unlock(&qs->qs_lock);
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	150	o2quo_fence_self();
Junxiao Bi	8c7b638	2014-08-29 15:19:04 -0700	[diff] [blame]	151	} else {
				152	mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
				153	"connected: %d, lowest: %d (%sreachable)\n",
				154	qs->qs_heartbeating, qs->qs_connected, lowest_hb,
				155	lowest_reachable ? "" : "un");
				156	spin_unlock(&qs->qs_lock);
				157
				158	}
				159
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	160	}
				161
				162	static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
				163	{
				164	assert_spin_locked(&qs->qs_lock);
				165
				166	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
				167	qs->qs_holds++;
				168	mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
				169	"node %u\n", node);
				170	mlog(0, "node %u, %d total\n", node, qs->qs_holds);
				171	}
				172	}
				173
				174	static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
				175	{
				176	assert_spin_locked(&qs->qs_lock);
				177
				178	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
				179	mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
				180	if (--qs->qs_holds == 0) {
				181	if (qs->qs_pending) {
				182	qs->qs_pending = 0;
				183	schedule_work(&qs->qs_work);
				184	}
				185	}
				186	mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
				187	node, qs->qs_holds);
				188	}
				189	}
				190
				191	/* as a node comes up we delay the quorum decision until we know the fate of
				192	* the connection. the hold will be droped in conn_up or hb_down. it might be
				193	* perpetuated by con_err until hb_down. if we already have a conn, we might
				194	* be dropping a hold that conn_up got. */
				195	void o2quo_hb_up(u8 node)
				196	{
				197	struct o2quo_state *qs = &o2quo_state;
				198
				199	spin_lock(&qs->qs_lock);
				200
				201	qs->qs_heartbeating++;
				202	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
				203	"node %u\n", node);
				204	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
				205	set_bit(node, qs->qs_hb_bm);
				206
				207	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
				208
				209	if (!test_bit(node, qs->qs_conn_bm))
				210	o2quo_set_hold(qs, node);
				211	else
				212	o2quo_clear_hold(qs, node);
				213
				214	spin_unlock(&qs->qs_lock);
				215	}
				216
				217	/* hb going down releases any holds we might have had due to this node from
				218	* conn_up, conn_err, or hb_up */
				219	void o2quo_hb_down(u8 node)
				220	{
				221	struct o2quo_state *qs = &o2quo_state;
				222
				223	spin_lock(&qs->qs_lock);
				224
				225	qs->qs_heartbeating--;
				226	mlog_bug_on_msg(qs->qs_heartbeating < 0,
				227	"node %u, %d heartbeating\n",
				228	node, qs->qs_heartbeating);
				229	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
				230	clear_bit(node, qs->qs_hb_bm);
				231
				232	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
				233
				234	o2quo_clear_hold(qs, node);
				235
				236	spin_unlock(&qs->qs_lock);
				237	}
				238
				239	/* this tells us that we've decided that the node is still heartbeating
				240	* even though we've lost it's conn. it must only be called after conn_err
				241	* and indicates that we must now make a quorum decision in the future,
				242	* though we might be doing so after waiting for holds to drain. Here
				243	* we'll be dropping the hold from conn_err. */
				244	void o2quo_hb_still_up(u8 node)
				245	{
				246	struct o2quo_state *qs = &o2quo_state;
				247
				248	spin_lock(&qs->qs_lock);
				249
				250	mlog(0, "node %u\n", node);
				251
				252	qs->qs_pending = 1;
				253	o2quo_clear_hold(qs, node);
				254
				255	spin_unlock(&qs->qs_lock);
				256	}
				257
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	258	/* This is analogous to hb_up. as a node's connection comes up we delay the
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	259	* quorum decision until we see it heartbeating. the hold will be droped in
				260	* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
Jie Liu	b4d8ed4	2013-07-03 15:01:07 -0700	[diff] [blame]	261	* it's already heartbeating we might be dropping a hold that conn_up got.
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	262	* */
				263	void o2quo_conn_up(u8 node)
				264	{
				265	struct o2quo_state *qs = &o2quo_state;
				266
				267	spin_lock(&qs->qs_lock);
				268
				269	qs->qs_connected++;
				270	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
				271	"node %u\n", node);
				272	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
				273	set_bit(node, qs->qs_conn_bm);
				274
				275	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
				276
				277	if (!test_bit(node, qs->qs_hb_bm))
				278	o2quo_set_hold(qs, node);
				279	else
				280	o2quo_clear_hold(qs, node);
				281
				282	spin_unlock(&qs->qs_lock);
				283	}
				284
				285	/* we've decided that we won't ever be connecting to the node again. if it's
				286	* still heartbeating we grab a hold that will delay decisions until either the
				287	* node stops heartbeating from hb_down or the caller decides that the node is
				288	* still up and calls still_up */
				289	void o2quo_conn_err(u8 node)
				290	{
				291	struct o2quo_state *qs = &o2quo_state;
				292
				293	spin_lock(&qs->qs_lock);
				294
				295	if (test_bit(node, qs->qs_conn_bm)) {
				296	qs->qs_connected--;
				297	mlog_bug_on_msg(qs->qs_connected < 0,
				298	"node %u, connected %d\n",
				299	node, qs->qs_connected);
				300
				301	clear_bit(node, qs->qs_conn_bm);
Yang Zhang	fc2af28	2018-01-31 16:14:33 -0800	[diff] [blame]	302
				303	if (test_bit(node, qs->qs_hb_bm))
				304	o2quo_set_hold(qs, node);
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	305	}
				306
				307	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
				308
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	309
				310	spin_unlock(&qs->qs_lock);
				311	}
				312
				313	void o2quo_init(void)
				314	{
				315	struct o2quo_state *qs = &o2quo_state;
				316
				317	spin_lock_init(&qs->qs_lock);
David Howells	c402895	2006-11-22 14:57:56 +0000	[diff] [blame]	318	INIT_WORK(&qs->qs_work, o2quo_make_decision);
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	319	}
				320
				321	void o2quo_exit(void)
				322	{
Tejun Heo	9b00a81	2010-12-24 15:59:06 +0100	[diff] [blame]	323	struct o2quo_state *qs = &o2quo_state;
				324
Tejun Heo	4382973	2012-08-20 14:51:24 -0700	[diff] [blame]	325	flush_work(&qs->qs_work);
Zach Brown	98211489	2005-12-15 14:31:23 -0800	[diff] [blame]	326	}