| // SPDX-License-Identifier: GPL-2.0-only |
| /****************************************************************************** |
| ******************************************************************************* |
| ** |
| ** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. |
| ** |
| ** |
| ******************************************************************************* |
| ******************************************************************************/ |
| |
| #include "dlm_internal.h" |
| #include "lockspace.h" |
| #include "member.h" |
| #include "recoverd.h" |
| #include "recover.h" |
| #include "rcom.h" |
| #include "config.h" |
| #include "midcomms.h" |
| #include "lowcomms.h" |
| |
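| /* the low 16 bits of h_version carry the minor protocol version; |
| slot info is only sent by nodes whose minor version is at least |
| DLM_HEADER_SLOTS */ |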
| int dlm_slots_version(const struct dlm_header *h) |
| { |
| if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS) |
| return 0; |
| return 1; |
| } |
| |
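| /* save the slot and generation that a node reported for itself in |
| the rcom_config section of its status reply */ |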
| void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, |
| struct dlm_member *memb) |
| { |
| struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; |
| |
| if (!dlm_slots_version(&rc->rc_header)) |
| return; |
| |
| memb->slot = le16_to_cpu(rf->rf_our_slot); |
| memb->generation = le32_to_cpu(rf->rf_generation); |
| } |
| |
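| /* pack the ls_slots array into the rcom buffer after the rcom_config |
| section; ls_slots is indexed by slot number and may contain unused |
| entries, but the packed rcom_slot array holds only assigned slots */ |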
| void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) |
| { |
| struct dlm_slot *slot; |
| struct rcom_slot *ro; |
| int i; |
| |
| ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); |
| |
| /* ls_slots array is sparse, but not rcom_slots */ |
| |
| for (i = 0; i < ls->ls_slots_size; i++) { |
| slot = &ls->ls_slots[i]; |
| if (!slot->nodeid) |
| continue; |
| ro->ro_nodeid = cpu_to_le32(slot->nodeid); |
| ro->ro_slot = cpu_to_le16(slot->slot); |
| ro++; |
| } |
| } |
| |
| #define SLOT_DEBUG_LINE 128 |
| |
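| /* log one "slot:nodeid" pair per member, e.g. "generation 5 slots 3 |
| 1:1 2:2 3:4", from either the packed rcom_slot form (ro0) or the |
| sparse dlm_slot array; the line is truncated if it would overflow */ |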
| static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, |
| struct rcom_slot *ro0, struct dlm_slot *array, |
| int array_size) |
| { |
| char line[SLOT_DEBUG_LINE]; |
| int len = SLOT_DEBUG_LINE - 1; |
| int pos = 0; |
| int ret, i; |
| |
| memset(line, 0, sizeof(line)); |
| |
| if (array) { |
| for (i = 0; i < array_size; i++) { |
| if (!array[i].nodeid) |
| continue; |
| |
| ret = snprintf(line + pos, len - pos, " %d:%d", |
| array[i].slot, array[i].nodeid); |
| if (ret >= len - pos) |
| break; |
| pos += ret; |
| } |
| } else if (ro0) { |
| for (i = 0; i < num_slots; i++) { |
| ret = snprintf(line + pos, len - pos, " %d:%d", |
| ro0[i].ro_slot, ro0[i].ro_nodeid); |
| if (ret >= len - pos) |
| break; |
| pos += ret; |
| } |
| } |
| |
| log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line); |
| } |
| |
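| /* unpack the slot table received in ls_recover_buf and apply it to |
| our member list, verifying that our own slot has not changed; |
| returns -1 if the sender does not support slots or the table is |
| inconsistent */ |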
| int dlm_slots_copy_in(struct dlm_ls *ls) |
| { |
| struct dlm_member *memb; |
| struct dlm_rcom *rc = ls->ls_recover_buf; |
| struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; |
| struct rcom_slot *ro0, *ro; |
| int our_nodeid = dlm_our_nodeid(); |
| int i, num_slots; |
| uint32_t gen; |
| |
| if (!dlm_slots_version(&rc->rc_header)) |
| return -1; |
| |
| gen = le32_to_cpu(rf->rf_generation); |
| if (gen <= ls->ls_generation) { |
| log_error(ls, "dlm_slots_copy_in gen %u old %u", |
| gen, ls->ls_generation); |
| } |
| ls->ls_generation = gen; |
| |
| num_slots = le16_to_cpu(rf->rf_num_slots); |
| if (!num_slots) |
| return -1; |
| |
| ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); |
| |
| log_slots(ls, gen, num_slots, ro0, NULL, 0); |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| for (i = 0, ro = ro0; i < num_slots; i++, ro++) { |
| if (le32_to_cpu(ro->ro_nodeid) != memb->nodeid) |
| continue; |
| memb->slot = le16_to_cpu(ro->ro_slot); |
| memb->slot_prev = memb->slot; |
| break; |
| } |
| |
| if (memb->nodeid == our_nodeid) { |
| if (ls->ls_slot && ls->ls_slot != memb->slot) { |
| log_error(ls, "dlm_slots_copy_in our slot " |
| "changed %d %d", ls->ls_slot, |
| memb->slot); |
| return -1; |
| } |
| |
| if (!ls->ls_slot) |
| ls->ls_slot = memb->slot; |
| } |
| |
| if (!memb->slot) { |
| log_error(ls, "dlm_slots_copy_in nodeid %d no slot", |
| memb->nodeid); |
| return -1; |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* for any nodes that do not support slots, we will not have set memb->slot |
| in wait_status_all(), so memb->slot will remain -1, and we will not |
| assign slots or set ls_num_slots here */ |
| |
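| /* build the slot table for the new generation: keep the slots that |
| members already hold, assign free slots to new members, and return |
| the table with a bumped generation number */ |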
| int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, |
| struct dlm_slot **slots_out, uint32_t *gen_out) |
| { |
| struct dlm_member *memb; |
| struct dlm_slot *array; |
| int our_nodeid = dlm_our_nodeid(); |
| int array_size, max_slots, i; |
| int need = 0; |
| int max = 0; |
| int num = 0; |
| uint32_t gen = 0; |
| |
| /* our own memb struct starts out with slot -1 gen 0; copy in the |
| slot and generation saved in the lockspace */ |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (memb->nodeid == our_nodeid) { |
| memb->slot = ls->ls_slot; |
| memb->generation = ls->ls_generation; |
| break; |
| } |
| } |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (memb->generation > gen) |
| gen = memb->generation; |
| |
| /* node doesn't support slots */ |
| |
| if (memb->slot == -1) |
| return -1; |
| |
| /* node needs a slot assigned */ |
| |
| if (!memb->slot) |
| need++; |
| |
| /* node has a slot assigned */ |
| |
| num++; |
| |
| if (!max || max < memb->slot) |
| max = memb->slot; |
| |
| /* sanity check, once slot is assigned it shouldn't change */ |
| |
| if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) { |
| log_error(ls, "nodeid %d slot changed %d %d", |
| memb->nodeid, memb->slot_prev, memb->slot); |
| return -1; |
| } |
| memb->slot_prev = memb->slot; |
| } |
| |
| array_size = max + need; |
| array = kcalloc(array_size, sizeof(*array), GFP_NOFS); |
| if (!array) |
| return -ENOMEM; |
| |
| num = 0; |
| |
| /* fill in slots (offsets) that are used */ |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (!memb->slot) |
| continue; |
| |
| if (memb->slot > array_size) { |
| log_error(ls, "invalid slot number %d", memb->slot); |
| kfree(array); |
| return -1; |
| } |
| |
| array[memb->slot - 1].nodeid = memb->nodeid; |
| array[memb->slot - 1].slot = memb->slot; |
| num++; |
| } |
| |
| /* assign new slots from unused offsets */ |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (memb->slot) |
| continue; |
| |
| for (i = 0; i < array_size; i++) { |
| if (array[i].nodeid) |
| continue; |
| |
| memb->slot = i + 1; |
| memb->slot_prev = memb->slot; |
| array[i].nodeid = memb->nodeid; |
| array[i].slot = memb->slot; |
| num++; |
| |
| if (!ls->ls_slot && memb->nodeid == our_nodeid) |
| ls->ls_slot = memb->slot; |
| break; |
| } |
| |
| if (!memb->slot) { |
| log_error(ls, "no free slot found"); |
| kfree(array); |
| return -1; |
| } |
| } |
| |
| gen++; |
| |
| log_slots(ls, gen, num, NULL, array, array_size); |
| |
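| /* an rcom buffer holds a dlm_rcom header and one rcom_config, |
| followed by as many rcom_slot entries as fit */ |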
| max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) - |
| sizeof(struct rcom_config)) / sizeof(struct rcom_slot); |
| |
| if (num > max_slots) { |
| log_error(ls, "num_slots %d exceeds max_slots %d", |
| num, max_slots); |
| kfree(array); |
| return -1; |
| } |
| |
| *gen_out = gen; |
| *slots_out = array; |
| *slots_size = array_size; |
| *num_slots = num; |
| return 0; |
| } |
| |
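| /* insert the new member so that ls_nodes stays sorted by nodeid */ |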
| static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) |
| { |
| struct dlm_member *memb; |
| struct list_head *tmp; |
| struct list_head *newlist = &new->list; |
| struct list_head *head = &ls->ls_nodes; |
| |
| list_for_each(tmp, head) { |
| memb = list_entry(tmp, struct dlm_member, list); |
| if (new->nodeid < memb->nodeid) |
| break; |
| } |
| |
| /* insert before the first member with a larger nodeid; if none was |
| found, tmp is back at head and this appends at the tail */ |
| list_add_tail(newlist, tmp); |
| } |
| |
| static int add_remote_member(int nodeid) |
| { |
| int error; |
| |
| if (nodeid == dlm_our_nodeid()) |
| return 0; |
| |
| error = dlm_lowcomms_connect_node(nodeid); |
| if (error < 0) |
| return error; |
| |
| dlm_midcomms_add_member(nodeid); |
| return 0; |
| } |
| |
| static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) |
| { |
| struct dlm_member *memb; |
| int error; |
| |
| memb = kzalloc(sizeof(*memb), GFP_NOFS); |
| if (!memb) |
| return -ENOMEM; |
| |
| memb->nodeid = node->nodeid; |
| memb->weight = node->weight; |
| memb->comm_seq = node->comm_seq; |
| |
| error = add_remote_member(node->nodeid); |
| if (error < 0) { |
| kfree(memb); |
| return error; |
| } |
| |
| add_ordered_member(ls, memb); |
| ls->ls_num_nodes++; |
| return 0; |
| } |
| |
| static struct dlm_member *find_memb(struct list_head *head, int nodeid) |
| { |
| struct dlm_member *memb; |
| |
| list_for_each_entry(memb, head, list) { |
| if (memb->nodeid == nodeid) |
| return memb; |
| } |
| return NULL; |
| } |
| |
| int dlm_is_member(struct dlm_ls *ls, int nodeid) |
| { |
| if (find_memb(&ls->ls_nodes, nodeid)) |
| return 1; |
| return 0; |
| } |
| |
| int dlm_is_removed(struct dlm_ls *ls, int nodeid) |
| { |
| if (find_memb(&ls->ls_nodes_gone, nodeid)) |
| return 1; |
| return 0; |
| } |
| |
| static void clear_memb_list(struct list_head *head, |
| void (*after_del)(int nodeid)) |
| { |
| struct dlm_member *memb; |
| |
| while (!list_empty(head)) { |
| memb = list_entry(head->next, struct dlm_member, list); |
| list_del(&memb->list); |
| if (after_del) |
| after_del(memb->nodeid); |
| kfree(memb); |
| } |
| } |
| |
| static void remove_remote_member(int nodeid) |
| { |
| if (nodeid == dlm_our_nodeid()) |
| return; |
| |
| dlm_midcomms_remove_member(nodeid); |
| } |
| |
| void dlm_clear_members(struct dlm_ls *ls) |
| { |
| clear_memb_list(&ls->ls_nodes, remove_remote_member); |
| ls->ls_num_nodes = 0; |
| } |
| |
| void dlm_clear_members_gone(struct dlm_ls *ls) |
| { |
| clear_memb_list(&ls->ls_nodes_gone, NULL); |
| } |
| |
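| /* build ls_node_array, in which each nodeid appears once per unit of |
| weight; for example, nodeids 1,2,3 with weights 2,0,1 produce |
| {1,1,3}, and if every weight is 0 all nodes count as weight 1 */ |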
| static void make_member_array(struct dlm_ls *ls) |
| { |
| struct dlm_member *memb; |
| int i, w, x = 0, total = 0, all_zero = 0, *array; |
| |
| kfree(ls->ls_node_array); |
| ls->ls_node_array = NULL; |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (memb->weight) |
| total += memb->weight; |
| } |
| |
| /* all nodes revert to weight of 1 if all have weight 0 */ |
| |
| if (!total) { |
| total = ls->ls_num_nodes; |
| all_zero = 1; |
| } |
| |
| ls->ls_total_weight = total; |
| array = kmalloc_array(total, sizeof(*array), GFP_NOFS); |
| if (!array) |
| return; |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (!all_zero && !memb->weight) |
| continue; |
| |
| if (all_zero) |
| w = 1; |
| else |
| w = memb->weight; |
| |
| DLM_ASSERT(x < total, printk("total %d x %d\n", total, x);); |
| |
| for (i = 0; i < w; i++) |
| array[x++] = memb->nodeid; |
| } |
| |
| ls->ls_node_array = array; |
| } |
| |
| /* send a status request to all members just to establish comms connections */ |
| |
| static int ping_members(struct dlm_ls *ls, uint64_t seq) |
| { |
| struct dlm_member *memb; |
| int error = 0; |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (dlm_recovery_stopped(ls)) { |
| error = -EINTR; |
| break; |
| } |
| error = dlm_rcom_status(ls, memb->nodeid, 0, seq); |
| if (error) |
| break; |
| } |
| if (error) |
| log_rinfo(ls, "ping_members aborted %d last nodeid %d", |
| error, ls->ls_recover_nodeid); |
| return error; |
| } |
| |
| static void dlm_lsop_recover_prep(struct dlm_ls *ls) |
| { |
| if (!ls->ls_ops || !ls->ls_ops->recover_prep) |
| return; |
| ls->ls_ops->recover_prep(ls->ls_ops_arg); |
| } |
| |
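| /* notify the lockspace user, via the recover_slot callback, that the |
| node holding this slot appears to have failed */ |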
| static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) |
| { |
| struct dlm_slot slot; |
| uint32_t seq; |
| int error; |
| |
| if (!ls->ls_ops || !ls->ls_ops->recover_slot) |
| return; |
| |
| /* if there is no comms connection with this node |
| or the present comms connection is newer |
| than the one when this member was added, then |
| we consider the node to have failed (versus |
| being removed due to dlm_release_lockspace) */ |
| |
| error = dlm_comm_seq(memb->nodeid, &seq); |
| |
| if (!error && seq == memb->comm_seq) |
| return; |
| |
| slot.nodeid = memb->nodeid; |
| slot.slot = memb->slot; |
| |
| ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot); |
| } |
| |
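| /* pass the recovery results (slot table, our slot, new generation) |
| up to the lockspace user via the recover_done callback */ |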
| void dlm_lsop_recover_done(struct dlm_ls *ls) |
| { |
| struct dlm_member *memb; |
| struct dlm_slot *slots; |
| int i, num; |
| |
| if (!ls->ls_ops || !ls->ls_ops->recover_done) |
| return; |
| |
| num = ls->ls_num_nodes; |
| slots = kcalloc(num, sizeof(*slots), GFP_KERNEL); |
| if (!slots) |
| return; |
| |
| i = 0; |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (i == num) { |
| log_error(ls, "dlm_lsop_recover_done bad num %d", num); |
| goto out; |
| } |
| slots[i].nodeid = memb->nodeid; |
| slots[i].slot = memb->slot; |
| i++; |
| } |
| |
| ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num, |
| ls->ls_slot, ls->ls_generation); |
| out: |
| kfree(slots); |
| } |
| |
| static struct dlm_config_node *find_config_node(struct dlm_recover *rv, |
| int nodeid) |
| { |
| int i; |
| |
| for (i = 0; i < rv->nodes_count; i++) { |
| if (rv->nodes[i].nodeid == nodeid) |
| return &rv->nodes[i]; |
| } |
| return NULL; |
| } |
| |
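| /* reconcile ls_nodes with the new config from userspace: move |
| departed members to ls_nodes_gone, add new members, and return the |
| number of departed members in *neg_out (a nonzero "neg" triggers the |
| extra recovery steps for departed nodes) */ |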
| int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) |
| { |
| struct dlm_member *memb, *safe; |
| struct dlm_config_node *node; |
| int i, error, neg = 0, low = -1; |
| |
| /* previously removed members that we've not finished removing need to |
| * count as a negative change so the "neg" recovery steps will happen |
| * |
| * This function must report all member changes to the lsops and |
| * midcomms layers; it must never return before doing so. |
| */ |
| |
| list_for_each_entry(memb, &ls->ls_nodes_gone, list) { |
| log_rinfo(ls, "prev removed member %d", memb->nodeid); |
| neg++; |
| } |
| |
| /* move departed members from ls_nodes to ls_nodes_gone */ |
| |
| list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { |
| node = find_config_node(rv, memb->nodeid); |
| if (node && !node->new) |
| continue; |
| |
| if (!node) { |
| log_rinfo(ls, "remove member %d", memb->nodeid); |
| } else { |
| /* removed and re-added */ |
| log_rinfo(ls, "remove member %d comm_seq %u %u", |
| memb->nodeid, memb->comm_seq, node->comm_seq); |
| } |
| |
| neg++; |
| list_move(&memb->list, &ls->ls_nodes_gone); |
| remove_remote_member(memb->nodeid); |
| ls->ls_num_nodes--; |
| dlm_lsop_recover_slot(ls, memb); |
| } |
| |
| /* add new members to ls_nodes */ |
| |
| for (i = 0; i < rv->nodes_count; i++) { |
| node = &rv->nodes[i]; |
| if (dlm_is_member(ls, node->nodeid)) |
| continue; |
| error = dlm_add_member(ls, node); |
| if (error) |
| return error; |
| |
| log_rinfo(ls, "add member %d", node->nodeid); |
| } |
| |
| list_for_each_entry(memb, &ls->ls_nodes, list) { |
| if (low == -1 || memb->nodeid < low) |
| low = memb->nodeid; |
| } |
| ls->ls_low_nodeid = low; |
| |
| make_member_array(ls); |
| *neg_out = neg; |
| |
| error = ping_members(ls, rv->seq); |
| log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); |
| return error; |
| } |
| |
| /* Userspace guarantees that dlm_ls_stop() has completed on all nodes before |
| dlm_ls_start() is called on any of them to start the new recovery. */ |
| |
| int dlm_ls_stop(struct dlm_ls *ls) |
| { |
| int new; |
| |
| /* |
| * Prevent dlm_recv from being in the middle of something when we do |
| * the stop. This includes ensuring dlm_recv isn't processing a |
| * recovery message (rcom), while dlm_recoverd is aborting and |
| * resetting things from an in-progress recovery. i.e. we want |
| * dlm_recoverd to abort its recovery without worrying about dlm_recv |
| * processing an rcom at the same time. Stopping dlm_recv also makes |
| * it easy for dlm_receive_message() to check locking stopped and add a |
| * message to the requestqueue without races. |
| */ |
| |
| write_lock_bh(&ls->ls_recv_active); |
| |
| /* |
| * Abort any recovery that's in progress (see RECOVER_STOP, |
| * dlm_recovery_stopped()) and tell any other threads running in the |
| * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). |
| */ |
| |
| spin_lock_bh(&ls->ls_recover_lock); |
| set_bit(LSFL_RECOVER_STOP, &ls->ls_flags); |
| new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); |
| if (new) |
| timer_delete_sync(&ls->ls_scan_timer); |
| ls->ls_recover_seq++; |
| |
| /* activate requestqueue and stop processing */ |
| write_lock_bh(&ls->ls_requestqueue_lock); |
| set_bit(LSFL_RECV_MSG_BLOCKED, &ls->ls_flags); |
| write_unlock_bh(&ls->ls_requestqueue_lock); |
| spin_unlock_bh(&ls->ls_recover_lock); |
| |
| /* |
| * Let dlm_recv run again, now any normal messages will be saved on the |
| * requestqueue for later. |
| */ |
| |
| write_unlock_bh(&ls->ls_recv_active); |
| |
| /* |
| * This in_recovery lock does two things: |
| * 1) Keeps this function from returning until all threads are out |
| * of locking routines and locking is truly stopped. |
| * 2) Keeps any new requests from being processed until it's unlocked |
| * when recovery is complete. |
| */ |
| |
| if (new) { |
| set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags); |
| wake_up_process(ls->ls_recoverd_task); |
| wait_event(ls->ls_recover_lock_wait, |
| test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); |
| } |
| |
| /* |
| * The recoverd suspend/resume makes sure that dlm_recoverd (if |
| * running) has noticed RECOVER_STOP above and quit processing the |
| * previous recovery. |
| */ |
| |
| dlm_recoverd_suspend(ls); |
| |
| spin_lock_bh(&ls->ls_recover_lock); |
| kfree(ls->ls_slots); |
| ls->ls_slots = NULL; |
| ls->ls_num_slots = 0; |
| ls->ls_slots_size = 0; |
| ls->ls_recover_status = 0; |
| spin_unlock_bh(&ls->ls_recover_lock); |
| |
| dlm_recoverd_resume(ls); |
| |
| if (!ls->ls_recover_begin) |
| ls->ls_recover_begin = jiffies; |
| |
| /* call recover_prep ops only once, not repeatedly for each |
| * dlm_ls_stop() call made while recovery is already stopped. |
| * |
| * If we successfully cleared the LSFL_RUNNING bit above (i.e. it was |
| * set), we know this is the first dlm_ls_stop() call. |
| */ |
| if (new) |
| dlm_lsop_recover_prep(ls); |
| |
| return 0; |
| } |
| |
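| /* queue a new recovery for dlm_recoverd using the current node |
| config from userspace; the lockspace must have been stopped first */ |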
| int dlm_ls_start(struct dlm_ls *ls) |
| { |
| struct dlm_recover *rv, *rv_old; |
| struct dlm_config_node *nodes = NULL; |
| int error, count; |
| |
| rv = kzalloc(sizeof(*rv), GFP_NOFS); |
| if (!rv) |
| return -ENOMEM; |
| |
| error = dlm_config_nodes(ls->ls_name, &nodes, &count); |
| if (error < 0) |
| goto fail_rv; |
| |
| spin_lock_bh(&ls->ls_recover_lock); |
| |
| /* the lockspace needs to be stopped before it can be started */ |
| |
| if (!dlm_locking_stopped(ls)) { |
| spin_unlock_bh(&ls->ls_recover_lock); |
| log_error(ls, "start ignored: lockspace running"); |
| error = -EINVAL; |
| goto fail; |
| } |
| |
| rv->nodes = nodes; |
| rv->nodes_count = count; |
| rv->seq = ++ls->ls_recover_seq; |
| rv_old = ls->ls_recover_args; |
| ls->ls_recover_args = rv; |
| spin_unlock_bh(&ls->ls_recover_lock); |
| |
| if (rv_old) { |
| log_error(ls, "unused recovery %llx %d", |
| (unsigned long long)rv_old->seq, rv_old->nodes_count); |
| kfree(rv_old->nodes); |
| kfree(rv_old); |
| } |
| |
| set_bit(LSFL_RECOVER_WORK, &ls->ls_flags); |
| wake_up_process(ls->ls_recoverd_task); |
| return 0; |
| |
| fail: |
| kfree(nodes); |
| fail_rv: |
| kfree(rv); |
| return error; |
| } |
| |