diff --git a/configure.ac b/configure.ac index 2650f19..db3c944 100644 --- a/configure.ac +++ b/configure.ac @@ -640,7 +640,7 @@ AC_MSG_RESULT(using $GLIBCONFIG) AC_CHECK_LIB(glib-2.0, g_hash_table_get_values) if test "x$ac_cv_lib_glib_2_0_g_hash_table_get_values" != x""yes; then - AC_MSG_WARN(Your version of Glib is too old, you should have at least 2.14) + AC_DEFINE_UNQUOTED(NEED_G_HASH_ITER, 1, glib-2.0 has no hashtable iterators) fi AC_CHECK_LIB(glib-2.0, g_list_free_full) diff --git a/crmd/election.c b/crmd/election.c index 6c874a9..17b6643 100644 --- a/crmd/election.c +++ b/crmd/election.c @@ -166,7 +166,7 @@ struct election_data_s { }; static void -log_member_uname(gpointer key, gpointer value, gpointer user_data) +log_member_name(gpointer key, gpointer value, gpointer user_data) { const crm_node_t *node = value; @@ -208,7 +208,7 @@ do_election_check(long long action, char *data = NULL; data = strdup("member"); - g_hash_table_foreach(crm_peer_cache, log_member_uname, data); + g_hash_table_foreach(crm_peer_cache, log_member_name, data); free(data); data = strdup("voted"); diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt b/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt index 3ff3697..77cac5f 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt @@ -457,13 +457,21 @@ tell the cluster where it can be located (only on the DRBD Primary) and when it is allowed to start (after the Primary was promoted). ifdef::pcs[] +We are going to take a shortcut when creating the resource this time though. +Instead of explicitly saying we want the 'ocf:heartbeat:Filesystem' script, we +are only going to ask for 'Filesystem'. We can do this because we know there is only +one resource script named 'Filesystem' available to pacemaker, and that pcs is smart +enough to fill in the 'ocf:heartbeat' portion for us correctly in the configuration. 
+If there were multiple 'Filesystem' scripts from different ocf providers, we would need +to specify the exact one we wanted to use. + Once again we will queue up our changes to a file and then push the new configuration to the cluster as the final step. [source,Bash] ---- # pcs cluster cib fs_cfg -# pcs -f fs_cfg resource create WebFS ocf:heartbeat:Filesystem \ +# pcs -f fs_cfg resource create WebFS Filesystem \ device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" \ fstype="ext4" # pcs -f fs_cfg constraint colocation add WebFS WebDataClone INFINITY with-rsc-role=Master diff --git a/fencing/commands.c b/fencing/commands.c index 2bd9440..aac1c9a 100644 --- a/fencing/commands.c +++ b/fencing/commands.c @@ -47,8 +47,6 @@ GHashTable *device_list = NULL; GHashTable *topology = NULL; GList *cmd_list = NULL; -extern GHashTable *remote_op_list; - static int active_children = 0; static gboolean stonith_device_dispatch(gpointer user_data); static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data); @@ -66,7 +64,7 @@ typedef struct async_command_s { char *origin; char *client; char *client_name; - char *remote; + char *remote_op_id; char *victim; char *action; @@ -120,7 +118,7 @@ static void free_async_command(async_command_t *cmd) free(cmd->device); free(cmd->action); free(cmd->victim); - free(cmd->remote); + free(cmd->remote_op_id); free(cmd->client); free(cmd->client_name); free(cmd->origin); @@ -144,7 +142,7 @@ static async_command_t *create_async_command(xmlNode *msg) cmd->timeout = cmd->default_timeout; cmd->origin = crm_element_value_copy(msg, F_ORIG); - cmd->remote = crm_element_value_copy(msg, F_STONITH_REMOTE); + cmd->remote_op_id = crm_element_value_copy(msg, F_STONITH_REMOTE_OP_ID); cmd->client = crm_element_value_copy(msg, F_STONITH_CLIENTID); cmd->client_name = crm_element_value_copy(msg, F_STONITH_CLIENTNAME); cmd->op = crm_element_value_copy(msg, F_STONITH_OPERATION); @@ -170,7 +168,7 @@ static int stonith_manual_ack(xmlNode 
*msg, remote_fencing_op_t *op) } cmd->device = strdup("manual_ack"); - cmd->remote = strdup(op->id); + cmd->remote_op_id = strdup(op->id); crm_notice("Injecting manual confirmation that %s is safely off/down", crm_element_value(dev, F_STONITH_TARGET)); @@ -245,8 +243,21 @@ static void schedule_stonith_command(async_command_t *cmd, stonith_device_t *dev cmd->device = strdup(device->id); cmd->timeout = get_action_timeout(device, cmd->action, cmd->default_timeout); - crm_debug("Scheduling %s on %s for %s (timeout=%dms)", cmd->action, device->id, - cmd->remote?cmd->remote:cmd->client, cmd->timeout); + if (cmd->remote_op_id) { + crm_debug("Scheduling %s on %s for remote peer %s with op id (%s) (timeout=%dms)", + cmd->action, + device->id, + cmd->origin, + cmd->remote_op_id, + cmd->timeout); + } else { + crm_debug("Scheduling %s on %s for %s (timeout=%dms)", + cmd->action, + device->id, + cmd->client, + cmd->timeout); + } + device->pending_ops = g_list_append(device->pending_ops, cmd); mainloop_set_trigger(device->work); } @@ -955,10 +966,10 @@ stonith_send_async_reply(async_command_t *cmd, const char *output, int rc, GPid crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); crm_xml_add(notify_data, F_STONITH_DELEGATE, cmd->device); - crm_xml_add(notify_data, F_STONITH_REMOTE, cmd->remote); + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); - do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data, NULL); + do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); } free_xml(reply); @@ -1158,7 +1169,7 @@ xmlNode *stonith_construct_reply(xmlNode *request, char *output, xmlNode *data, F_STONITH_CALLID, F_STONITH_CLIENTID, F_STONITH_CLIENTNAME, - F_STONITH_REMOTE, + F_STONITH_REMOTE_OP_ID, F_STONITH_CALLOPTS }; @@ -1197,7 +1208,7 @@ stonith_construct_async_reply(async_command_t *cmd, const char *output, xmlNode crm_xml_add(reply, 
F_STONITH_OPERATION, cmd->op); crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); - crm_xml_add(reply, F_STONITH_REMOTE, cmd->remote); + crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(reply, F_STONITH_CLIENTID, cmd->client); crm_xml_add(reply, F_STONITH_CLIENTNAME, cmd->client_name); crm_xml_add(reply, F_STONITH_TARGET, cmd->victim); @@ -1218,13 +1229,53 @@ stonith_construct_async_reply(async_command_t *cmd, const char *output, xmlNode return reply; } -void -stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode *request, const char *remote) +/*! + * \internal + * \brief Determine if we need to use an alternate node to + * fence the target. If so return that node's uname + * + * \retval NULL, no alternate host + * \retval uname, uname of alternate host to use + */ +static const char * +check_alternate_host(const char *target) +{ + const char *alternate_host = NULL; + + if(g_hash_table_lookup(topology, target) && safe_str_eq(target, stonith_our_uname)) { + GHashTableIter gIter; + crm_node_t *entry = NULL; + int membership = crm_proc_plugin | crm_proc_heartbeat | crm_proc_cpg; + + g_hash_table_iter_init(&gIter, crm_peer_cache); + while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { + crm_trace("Checking for %s.%d != %s", + entry->uname, entry->id, target); + if(entry->uname + && (entry->processes & membership) + && safe_str_neq(entry->uname, target)) { + alternate_host = entry->uname; + break; + } + } + if(alternate_host == NULL) { + crm_err("No alternate host available to handle complex self fencing request"); + g_hash_table_iter_init(&gIter, crm_peer_cache); + while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { + crm_notice("Peer[%d] %s", entry->id, entry->uname); + } + } + } + + return alternate_host; +} + +static int +handle_request(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode *request, const char *remote_peer) { int call_options = 0; int rc = -EOPNOTSUPP; - gboolean 
is_reply = FALSE; gboolean always_reply = FALSE; xmlNode *reply = NULL; @@ -1233,19 +1284,13 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * char *output = NULL; const char *op = crm_element_value(request, F_STONITH_OPERATION); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); - - crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); - if(get_xpath_object("//"T_STONITH_REPLY, request, LOG_DEBUG_3)) { - is_reply = TRUE; - } - crm_debug("Processing %s%s from %s (%16x)", op, is_reply?" reply":"", - client?client->name:remote, call_options); + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); if(is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } - + if(crm_str_eq(op, CRM_OP_REGISTER, TRUE)) { xmlNode *reply = create_xml_node(NULL, "reply"); @@ -1255,7 +1300,7 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_ipcs_send(client->channel, id, reply, FALSE); client->request_id = 0; free_xml(reply); - return; + return 0; } else if(crm_str_eq(op, STONITH_OP_EXEC, TRUE)) { rc = stonith_device_action(request, &output); @@ -1267,31 +1312,18 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); do_stonith_async_timeout_update(client_id, call_id, op_timeout); - return; - - } else if(is_reply && crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { - process_remote_stonith_query(request); - return; + return 0; } else if(crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { - if (remote) { + if (remote_peer) { create_remote_stonith_op(client_id, request, TRUE); /* Record it for the future notification */ } rc = stonith_query(request, &data); always_reply = TRUE; if(!data) { - return; + return 0; } - } else if(is_reply && crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { - process_remote_stonith_exec(request); - return; - - } else if(is_reply && 
crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { - /* Reply to a complex fencing op */ - process_remote_stonith_exec(request); - return; - } else if(crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { const char *flag_name = NULL; @@ -1314,17 +1346,13 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_ipcs_send_ack(client->channel, id, "ack", __FUNCTION__, __LINE__); client->request_id = 0; } - return; - - /* } else if(is_reply && crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { */ - /* process_remote_stonith_exec(request); */ - /* return; */ + return 0; - } else if(is_reply == FALSE && crm_str_eq(op, STONITH_OP_RELAY, TRUE)) { + } else if(crm_str_eq(op, STONITH_OP_RELAY, TRUE)) { xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, request, LOG_TRACE); crm_notice("Peer %s has received a forwarded fencing request from %s to fence (%s) peer %s", stonith_our_uname, - client ? client->name : remote, + client ? client->name : remote_peer, crm_element_value(dev, F_STONITH_ACTION), crm_element_value(dev, F_STONITH_TARGET)); @@ -1332,9 +1360,9 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * rc = -EINPROGRESS; } - } else if(is_reply == FALSE && crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { + } else if(crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { - if(remote || stand_alone) { + if(remote_peer || stand_alone) { rc = stonith_fence(request); } else if(call_options & st_opt_manual_ack) { @@ -1351,66 +1379,22 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * if(client) { int tolerance = 0; - crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); crm_notice("Client %s.%.8s wants to fence (%s) '%s' with device '%s'", client->name, client->id, action, target, device?device:"(any)"); - crm_trace("tolerance=%d, remote_op_list=%p", tolerance, remote_op_list); - if(tolerance > 0 && remote_op_list) { - GHashTableIter iter; - time_t now = time(NULL); - remote_fencing_op_t *rop = NULL; - - 
g_hash_table_iter_init(&iter, remote_op_list); - while(g_hash_table_iter_next(&iter, NULL, (void**)&rop)) { - if (target == NULL || action == NULL) { - continue; - } else if(strcmp(rop->target, target) != 0) { - continue; - } else if(rop->state != st_done) { - continue; - } else if(strcmp(rop->action, action) != 0) { - continue; - } else if((rop->completed + tolerance) < now) { - continue; - } - - crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s", - target, action, tolerance, rop->delegate, rop->originator); - rc = 0; - goto done; - } + crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); + + if(stonith_check_fence_tolerance(tolerance, target, action)) { + rc = 0; + goto done; } } else { crm_notice("Peer %s wants to fence (%s) '%s' with device '%s'", - remote, action, target, device?device:"(any)"); + remote_peer, action, target, device?device:"(any)"); } - if(g_hash_table_lookup(topology, target) && safe_str_eq(target, stonith_our_uname)) { - GHashTableIter gIter; - crm_node_t *entry = NULL; - int membership = crm_proc_plugin | crm_proc_heartbeat | crm_proc_cpg; - - g_hash_table_iter_init(&gIter, crm_peer_cache); - while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { - crm_trace("Checking for %s.%d != %s", - entry->uname, entry->id, target); - if(entry->uname - && (entry->processes & membership) - && safe_str_neq(entry->uname, target)) { - alternate_host = entry->uname; - break; - } - } - if(alternate_host == NULL) { - crm_err("No alternate host available to handle complex self fencing request"); - g_hash_table_iter_init(&gIter, crm_peer_cache); - while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { - crm_notice("Peer[%d] %s", entry->id, entry->uname); - } - } - } + alternate_host = check_alternate_host(target); if(alternate_host) { crm_notice("Forwarding complex self fencing request to peer %s", alternate_host); @@ -1428,9 +1412,6 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, 
xmlNode * rc = stonith_fence_history(request, &data); always_reply = TRUE; - } else if(crm_str_eq(op, CRM_OP_REGISTER, TRUE)) { - return; - } else if(crm_str_eq(op, STONITH_OP_DEVICE_ADD, TRUE)) { const char *id = NULL; xmlNode *notify_data = create_xml_node(NULL, op); @@ -1439,7 +1420,7 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list)); - do_stonith_notify(call_options, op, rc, notify_data, NULL); + do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if(crm_str_eq(op, STONITH_OP_DEVICE_DEL, TRUE)) { @@ -1450,7 +1431,7 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list)); - do_stonith_notify(call_options, op, rc, notify_data, NULL); + do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if(crm_str_eq(op, STONITH_OP_LEVEL_ADD, TRUE)) { @@ -1461,7 +1442,7 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology)); - do_stonith_notify(call_options, op, rc, notify_data, NULL); + do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if(crm_str_eq(op, STONITH_OP_LEVEL_DEL, TRUE)) { @@ -1472,7 +1453,7 @@ stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology)); - do_stonith_notify(call_options, op, rc, notify_data, NULL); + do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if(crm_str_eq(op, STONITH_OP_CONFIRM, TRUE)) { @@ -1487,29 +1468,76 @@ 
stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode * free_xml(reply); } else { - crm_err("Unknown %s%s from %s", op, is_reply?" reply":"", - client?client->name:remote); + crm_err("Unknown %s from %s", op, client ? client->name : remote_peer); crm_log_xml_warn(request, "UnknownOp"); } done: - do_crm_log_unlikely(rc>0?LOG_DEBUG:LOG_INFO,"Processed %s%s from %s: %s (%d)", op, is_reply?" reply":"", - client?client->name:remote, rc>0?"":pcmk_strerror(rc), rc); - - if(is_reply || rc == -EINPROGRESS) { + if (rc == -EINPROGRESS) { /* Nothing (yet) */ - } else if(remote) { + } else if(remote_peer) { reply = stonith_construct_reply(request, output, data, rc); - send_cluster_message(crm_get_peer(0, remote), crm_msg_stonith_ng, reply, FALSE); + send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); free_xml(reply); } else if(rc <= pcmk_ok || always_reply) { reply = stonith_construct_reply(request, output, data, rc); - do_local_reply(reply, client_id, call_options & st_opt_sync_call, remote!=NULL); + do_local_reply(reply, client_id, call_options & st_opt_sync_call, remote_peer!=NULL); free_xml(reply); } free(output); free_xml(data); + + return rc; +} + +static void +handle_reply(stonith_client_t *client, xmlNode *request, const char *remote_peer) +{ + const char *op = crm_element_value(request, F_STONITH_OPERATION); + + if(crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { + process_remote_stonith_query(request); + } else if(crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { + process_remote_stonith_exec(request); + } else if(crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { + /* Reply to a complex fencing op */ + process_remote_stonith_exec(request); + } else { + crm_err("Unknown %s reply from %s", op, client ? 
client->name : remote_peer); + crm_log_xml_warn(request, "UnknownOp"); + } +} + +void +stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode *request, const char *remote_peer) +{ + int call_options = 0; + int rc = 0; + gboolean is_reply = FALSE; + const char *op = crm_element_value(request, F_STONITH_OPERATION); + + if(get_xpath_object("//"T_STONITH_REPLY, request, LOG_DEBUG_3)) { + is_reply = TRUE; + } + + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); + crm_debug("Processing %s%s from %s (%16x)", op, is_reply?" reply":"", + client?client->name:remote_peer, call_options); + + if(is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); + } + + if (is_reply) { + handle_reply(client, request, remote_peer); + } else { + rc = handle_request(client, id, flags, request, remote_peer); + } + + do_crm_log_unlikely(rc>0?LOG_DEBUG:LOG_INFO,"Processed %s%s from %s: %s (%d)", op, is_reply?" reply":"", + client?client->name:remote_peer, rc>0?"":pcmk_strerror(rc), rc); + } diff --git a/fencing/internal.h b/fencing/internal.h index 3c67ac0..a06c0cb 100644 --- a/fencing/internal.h +++ b/fencing/internal.h @@ -1,5 +1,19 @@ #include <crm/common/mainloop.h> + +/*! + * \internal + * \brief Check to see if target was fenced in the last few seconds. + * \param tolerance, The number of seconds to look back in time + * \param target, The node to search for + * \param action, The action we want to match. + * + * \retval FALSE, not match + * \retval TRUE, fencing operation took place in the last 'tolerance' number of seconds. + */ +gboolean +stonith_check_fence_tolerance(int tolerance, const char *target, const char *action); + typedef struct stonith_device_s { char *id; char *agent; @@ -33,32 +47,66 @@ typedef struct stonith_client_s { } stonith_client_t; typedef struct remote_fencing_op_s { + /* The unique id associated with this operation */ char *id; + /*! 
The node this operation will fence */ char *target; + /*! The fencing action to perform on the target. (reboot, on, off) */ char *action; - guint replies; - gint op_timer_total; - gint op_timer_one; - gint query_timer; + /*! Marks if the final notifications have been sent to local stonith clients. */ + gboolean notify_sent; + /*! The number of query replies received */ + guint replies; + /*! Does this node own control of this operation */ + gboolean owner; + /*! After the query is complete, this is the high-level timer that expires the entire operation */ + guint op_timer_total; + /*! This timer expires the current fencing request. Many fencing + * requests may exist in a single operation */ + guint op_timer_one; + /*! This timer expires the query request sent out to determine + * which nodes contain which devices, and who those devices can fence */ + guint query_timer; + /*! This is the default timeout to use for each fencing device if no + * custom timeout is received in the query. */ gint base_timeout; + /*! This is the calculated total timeout an operation can take before + * expiring. This is calculated by adding together all the timeout + * values associated with the devices this fencing operation may call */ gint total_timeout; + /*! Delegate is the node being asked to perform a fencing action + * on behalf of the node that owns the remote operation. Some operations + * will involve multiple delegates. This value represents the final delegate + * that is used. */ char *delegate; + /*! The point at which the remote operation completed */ time_t completed; + /*! The stonith_call_options associated with this remote operation */ long long call_options; + /*! The current state of the remote operation. This indicates + * what phase the op is in, query, exec, done, duplicate, failed. */ enum op_state state; + /*! The node that owns the remote operation */ char *originator; + /*! The local client id that initiated the fencing request */ char *client_id; + /*! 
The name of client that initiated the fencing request */ char *client_name; + /*! List of the received query results for all the nodes in the cpg group */ GListPtr query_results; + /*! The original request that initiated the remote stonith operation */ xmlNode *request; + /*! The current topology level being executed */ guint level; - int topology_device_number; - + /*! The device list of all the devices at the current executing topology level. */ GListPtr devices; + + /*! List of duplicate operations attached to this operation. Once this operation + * completes, the duplicate operations will be closed out as well. */ GListPtr duplicates; } remote_fencing_op_t; @@ -71,7 +119,7 @@ typedef struct stonith_topology_s { extern long long get_stonith_flag(const char *name); -extern void stonith_command(stonith_client_t * client, uint32_t id, uint32_t flags, xmlNode * op_request, const char *remote); +extern void stonith_command(stonith_client_t * client, uint32_t id, uint32_t flags, xmlNode * op_request, const char *remote_peer); extern int stonith_device_register(xmlNode * msg, const char **desc); @@ -87,7 +135,7 @@ extern xmlNode *stonith_construct_reply(xmlNode * request, char *output, xmlNode void do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); -extern void do_stonith_notify(int options, const char *type, int result, xmlNode * data, const char *remote); +extern void do_stonith_notify(int options, const char *type, int result, xmlNode * data); extern remote_fencing_op_t *initiate_remote_stonith_op(stonith_client_t * client, xmlNode * request, gboolean manual_ack); diff --git a/fencing/main.c b/fencing/main.c index f83653b..a2c30f6 100644 --- a/fencing/main.c +++ b/fencing/main.c @@ -213,7 +213,7 @@ st_ipc_destroy(qb_ipcs_connection_t *c) static void stonith_peer_callback(xmlNode * msg, void* private_data) { - const char *remote = crm_element_value(msg, F_ORIG); + const char *remote_peer = crm_element_value(msg, F_ORIG); const char 
*op = crm_element_value(msg, F_STONITH_OPERATION); if(crm_str_eq(op, "poke", TRUE)) { @@ -221,7 +221,7 @@ stonith_peer_callback(xmlNode * msg, void* private_data) } crm_log_xml_trace(msg, "Peer[inbound]"); - stonith_command(NULL, 0, 0, msg, remote); + stonith_command(NULL, 0, 0, msg, remote_peer); } #if SUPPORT_HEARTBEAT @@ -402,9 +402,7 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int } void -do_stonith_notify( - int options, const char *type, int result, xmlNode *data, - const char *remote) +do_stonith_notify(int options, const char *type, int result, xmlNode *data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); @@ -470,7 +468,7 @@ static void topology_remove_helper(const char *node, int level) crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology)); - do_stonith_notify(0, STONITH_OP_LEVEL_DEL, rc, notify_data, NULL); + do_stonith_notify(0, STONITH_OP_LEVEL_DEL, rc, notify_data); free_xml(notify_data); free_xml(data); @@ -489,7 +487,7 @@ static void topology_register_helper(const char *node, int level, stonith_key_va crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology)); - do_stonith_notify(0, STONITH_OP_LEVEL_ADD, rc, notify_data, NULL); + do_stonith_notify(0, STONITH_OP_LEVEL_ADD, rc, notify_data); free_xml(notify_data); free_xml(data); diff --git a/fencing/regression.py.in b/fencing/regression.py.in index 7b9a1bc..44147de 100644 --- a/fencing/regression.py.in +++ b/fencing/regression.py.in @@ -68,25 +68,6 @@ class Test: } ) - def start_corosync(self): - if self.enable_corosync == 0: - return - - if self.verbose: - print "Starting corosync" - - test = subprocess.Popen("corosync", stdout=subprocess.PIPE) - test.wait() - time.sleep(10) - - def stop_corosync(self): - if self.enable_corosync == 0: - return - - cmd = shlex.split("killall 
-9 -q corosync") - test = subprocess.Popen(cmd, stdout=subprocess.PIPE) - test.wait() - def stop_pacemaker(self): cmd = shlex.split("killall -9 -q pacemakerd") test = subprocess.Popen(cmd, stdout=subprocess.PIPE) @@ -95,13 +76,11 @@ class Test: def start_environment(self): ### make sure we are in full control here ### self.stop_pacemaker() - self.stop_corosync() cmd = shlex.split("killall -9 -q stonithd") test = subprocess.Popen(cmd, stdout=subprocess.PIPE) test.wait() - self.start_corosync() if self.verbose: print "Starting stonithd with %s" % self.stonith_options @@ -122,15 +101,6 @@ class Test: if self.verbose: print self.stonith_output - if os.path.exists('/var/log/corosync.log'): - print "Daemon output" - f = open('/var/log/corosync.log', 'r') - for line in f.readlines(): - print line.strip() - os.remove('/var/log/corosync.log') - - self.stop_corosync() - def add_stonith_log_pattern(self, pattern): self.stonith_patterns.append(pattern) @@ -215,6 +185,7 @@ class Test: cur = cur + 1 if line.count(pats[cur]): del pats[cur] + break if len(pats) > 0 or negative_matches: if self.verbose: @@ -278,6 +249,20 @@ class Tests: print "%35s - %s" % (test.name, test.description) print "==== END OF LIST ====\n" + + def start_corosync(self): + if self.verbose: + print "Starting corosync" + + test = subprocess.Popen("corosync", stdout=subprocess.PIPE) + test.wait() + time.sleep(10) + + def stop_corosync(self): + cmd = shlex.split("killall -9 -q corosync") + test = subprocess.Popen(cmd, stdout=subprocess.PIPE) + test.wait() + def run_single(self, name): for test in self.tests: if test.name == name: @@ -367,6 +352,84 @@ class Tests: # timeout is 2+1+4000 = 4003 test.add_stonith_log_pattern("remote op timeout set to 4003") + def build_fence_merge_tests(self): + + ### Simple test that overlapping fencing operations get merged + test = self.new_test("cpg_custom_merge_single", + "Verify overlapping identical fencing operations are merged, no fencing levels used.", 1) + 
test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") + test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") + test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd("stonith_admin", "-F node3 -t 10") + ### one merger will happen + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + ### the pattern below signifies that both the original and duplicate operation completed + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + + ### Test that multiple mergers occur + test = self.new_test("cpg_custom_merge_multiple", + "Verify multiple overlapping identical fencing operations are merged", 1) + test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") + test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") + test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd("stonith_admin", "-F node3 -t 10") + ### 4 mergers should occur + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + ### the pattern below signifies that both the original and duplicate operation completed + test.add_stonith_log_pattern("Operation off of node3 
by") + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + + ### Test that multiple mergers occur with topologies used + test = self.new_test("cpg_custom_merge_with_topology", + "Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1) + test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") + test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") + test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") + test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") + test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") + test.add_cmd("stonith_admin", "-F node3 -t 10") + ### 4 mergers should occur + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") + ### the pattern below signifies that both the original and duplicate operation completed + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation off of node3 
by") + + + test = self.new_test("cpg_custom_no_merge", + "Verify differing fencing operations are not merged", 1) + test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3 node2\"") + test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3 node2\" ") + test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3 node2\"") + test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") + test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") + test.add_cmd_no_wait("stonith_admin", "-F node2 -t 10") + test.add_cmd("stonith_admin", "-F node3 -t 10") + test.add_stonith_negative_log_pattern("Merging stonith action off for node node3 originating from client") + def build_standalone_tests(self): test_types = [ { @@ -469,6 +532,28 @@ class Tests: test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") + + # test what happens when the first fencing level had devices that no one has registered + for test_type in test_types: + if test_type["use_cpg"] == 0: + continue + + test = self.new_test("%s_topology_missing_devices" % test_type["prefix"], + "Verify topology can continue with missing devices.", test_type["use_cpg"]) + test.add_cmd("stonith_admin", "-R true2 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "-R true3 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "-R true4 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") + + test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") + test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") + test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") + 
test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") + test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") + + test.add_cmd("stonith_admin", "-F node3 -t 2") + # Test what happens if multiple fencing levels are defined, and then the first one is removed. for test_type in test_types: if test_type["use_cpg"] == 0: @@ -563,8 +648,8 @@ class Tests: test.add_cmd_check_stdout("stonith_admin", "-H node3", "was able to turn off node node3", "") - def setup_environment(self): - if self.autogen_corosync_cfg: + def setup_environment(self, use_corosync): + if self.autogen_corosync_cfg and use_corosync: corosync_conf = (""" totem { version: 2 @@ -598,10 +683,26 @@ logging { os.system("cat <<-END >>/etc/corosync/corosync.conf\n%s\nEND" % (corosync_conf)) + + if use_corosync: + ### make sure we are in control ### + self.stop_corosync() + self.start_corosync() + os.system("cp /usr/share/pacemaker/tests/cts/fence_false /usr/sbin/fence_false") os.system("cp /usr/share/pacemaker/tests/cts/fence_true /usr/sbin/fence_true") - def cleanup_environment(self): + def cleanup_environment(self, use_corosync): + if use_corosync: + self.stop_corosync() + + if self.verbose and os.path.exists('/var/log/corosync.log'): + print "Daemon output" + f = open('/var/log/corosync.log', 'r') + for line in f.readlines(): + print line.strip() + os.remove('/var/log/corosync.log') + if self.autogen_corosync_cfg: os.system("rm -f /etc/corosync/corosync.conf") @@ -662,20 +763,29 @@ def main(argv): o = TestOptions() o.build_options(argv) + use_corosync = 1 + tests = Tests(o.options['verbose']) tests.build_standalone_tests() tests.build_custom_timeout_tests() tests.build_api_sanity_tests() - - print "Starting ..." - - tests.setup_environment() + tests.build_fence_merge_tests() if o.options['list-tests']: tests.print_list() + sys.exit(0) elif o.options['show-usage']: o.show_usage() - elif o.options['run-only-pattern'] != "": + sys.exit(0) + + print "Starting ..." 
+ + if o.options['no-cpg']: + use_corosync = 0 + + tests.setup_environment(use_corosync) + + if o.options['run-only-pattern'] != "": tests.run_tests_matching(o.options['run-only-pattern']) tests.print_results() elif o.options['run-only'] != "": @@ -691,7 +801,7 @@ def main(argv): tests.run_tests() tests.print_results() - tests.cleanup_environment() + tests.cleanup_environment(use_corosync) tests.exit() if __name__=="__main__": main(sys.argv) diff --git a/fencing/remote.c b/fencing/remote.c index d564a90..92dc007 100644 --- a/fencing/remote.c +++ b/fencing/remote.c @@ -43,6 +43,7 @@ #include <crm/common/util.h> #include <internal.h> +#define TIMEOUT_MULTIPLY_FACTOR 1.2 typedef struct st_query_result_s { @@ -55,6 +56,7 @@ typedef struct st_query_result_s GHashTable *remote_op_list = NULL; void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer); +static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int dup); extern xmlNode *stonith_create_op( int call_id, const char *token, const char *op, xmlNode *data, int call_options); @@ -72,6 +74,23 @@ static void free_remote_query(gpointer data) } } +static void +clear_remote_op_timers(remote_fencing_op_t *op) +{ + if(op->query_timer) { + g_source_remove(op->query_timer); + op->query_timer = 0; + } + if(op->op_timer_total) { + g_source_remove(op->op_timer_total); + op->op_timer_total = 0; + } + if(op->op_timer_one) { + g_source_remove(op->op_timer_one); + op->op_timer_one = 0; + } +} + static void free_remote_op(gpointer data) { remote_fencing_op_t *op = data; @@ -79,6 +98,8 @@ static void free_remote_op(gpointer data) crm_trace("Free'ing op %s for %s", op->id, op->target); crm_log_xml_debug(op->request, "Destroying"); + clear_remote_op_timers(op); + free(op->id); free(op->action); free(op->target); @@ -86,15 +107,6 @@ static void free_remote_op(gpointer data) free(op->client_name); free(op->originator); - if(op->query_timer > 0) { - g_source_remove(op->query_timer); - } - 
if(op->op_timer_total > 0) { - g_source_remove(op->op_timer_total); - } - if(op->op_timer_one > 0) { - g_source_remove(op->op_timer_one); - } if(op->query_results) { g_list_free_full(op->query_results, free_remote_query); } @@ -105,105 +117,80 @@ static void free_remote_op(gpointer data) free(op); } -static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int dup) +static xmlNode * +create_op_done_notify(remote_fencing_op_t *op, int rc) { - GListPtr iter = NULL; - const char *subt = NULL; - - xmlNode *local_data = NULL; - xmlNode *notify_data = NULL; - - op->completed = time(NULL); - - if(op->query_timer > 0) { - g_source_remove(op->query_timer); - op->query_timer = 0; - } - if(op->op_timer_total > 0) { - g_source_remove(op->op_timer_total); - op->op_timer_total = 0; - } - if(op->op_timer_one > 0) { - g_source_remove(op->op_timer_one); - op->op_timer_one = 0; - } - - if(op->request == NULL) { - crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s", - op->action, op->target, op->delegate?op->delegate:"<no-one>", - op->client_name, op->originator, op->id, op->state, pcmk_strerror(rc)); - return; - } - - if(data == NULL) { - data = create_xml_node(NULL, "remote-op"); - local_data = data; - - } else if(op->delegate == NULL) { - op->delegate = crm_element_value_copy(data, F_ORIG); - } - - /* Do notification with a clean data object */ - notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); crm_xml_add_int(notify_data, "state", op->state); crm_xml_add_int(notify_data, F_STONITH_RC, rc); crm_xml_add(notify_data, F_STONITH_TARGET, op->target); crm_xml_add(notify_data, F_STONITH_ACTION, op->action); crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); - crm_xml_add(notify_data, F_STONITH_REMOTE, op->id); + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator); 
crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); crm_xml_add(notify_data, F_STONITH_CLIENTNAME,op->client_name); - subt = crm_element_value(data, F_SUBTYPE); - if(dup == FALSE && safe_str_neq(subt, "broadcast")) { - static int count = 0; - xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); - - count++; - crm_trace("Broadcasting result to peers"); - crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY); - crm_xml_add(bcast, F_SUBTYPE, "broadcast"); - crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY); - crm_xml_add_int(bcast, "count", count); - add_message_xml(bcast, F_STONITH_CALLDATA, notify_data); - send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE); - free_xml(notify_data); - free_xml(local_data); - free_xml(bcast); + return notify_data; +} - /* Defer notification until the bcast message arrives */ +static void +bcast_result_to_peers(remote_fencing_op_t *op, int rc) +{ + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); + xmlNode *notify_data = create_op_done_notify(op, rc); + + count++; + crm_trace("Broadcasting result to peers"); + crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY); + crm_xml_add(bcast, F_SUBTYPE, "broadcast"); + crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY); + crm_xml_add_int(bcast, "count", count); + add_message_xml(bcast, F_STONITH_CALLDATA, notify_data); + send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE); + free_xml(notify_data); + free_xml(bcast); + + return; +} + +static void +handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, int rc) +{ + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; + + if (op->notify_sent == TRUE) { + /* nothing to do */ return; } - - { - int level = LOG_ERR; - xmlNode *reply = NULL; - crm_xml_add_int(data, "state", op->state); - crm_xml_add(data, F_STONITH_TARGET, op->target); - crm_xml_add(data, F_STONITH_OPERATION, op->action); + /* Do notification with a clean data object */ + notify_data = 
create_op_done_notify(op, rc); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); - reply = stonith_construct_reply(op->request, NULL, data, rc); - crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + reply = stonith_construct_reply(op->request, NULL, data, rc); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); - if(rc == pcmk_ok || dup) { - level = LOG_NOTICE; - } else if(safe_str_neq(op->originator, stonith_our_uname)) { - level = LOG_NOTICE; - } - - do_crm_log(level, - "Operation %s of %s by %s for %s@%s.%.8s: %s", - op->action, op->target, op->delegate?op->delegate:"<no-one>", - op->client_name, op->originator, op->id, pcmk_strerror(rc)); + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - free_xml(reply); - } + /* bcast to all local clients that the fencing operation happend */ + do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); - do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data, NULL); + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; + free_xml(reply); + free_xml(notify_data); +} +static void +handle_duplicates(remote_fencing_op_t *op, xmlNode *data, int rc) +{ + GListPtr iter = NULL; for(iter = op->duplicates; iter != NULL; iter = iter->next) { remote_fencing_op_t *other = iter->data; @@ -217,9 +204,85 @@ static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int d crm_err("Skipping duplicate notification for %s@%s - %d", other->client_name, other->originator, other->state); } } - - free_xml(notify_data); - free_xml(local_data); +} + +/*! + * \internal + * \brief Finalize a remote operation. + * + * \description This function has two code paths. + * + * Path 1. 
This node is the owner of the operation and needs + * to notify the cpg group via a broadcast as to the operation's + * results. + * + * Path 2. The cpg broadcast is received. All nodes notify their local + * stonith clients the operation results. + * + * So, The owner of the operation first notifies the cluster of the result, + * and once that cpg notify is received back it notifies all the local clients. + * + * Nodes that are passive watchers of the operation will receive the + * broadcast and only need to notify their local clients the operation finished. + * + * \param op, The fencing operation to finalize + * \param data, The xml msg reply (if present) of the last delegated fencing + * operation. + * \param dup, Is this operation a duplicate, if so treat it a little differently + * making sure the broadcast is not sent out. + */ +static void +remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int dup) +{ + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + + op->completed = time(NULL); + clear_remote_op_timers(op); + + if(op->notify_sent == TRUE) { + crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s", + op->action, op->target, op->delegate?op->delegate:"<no-one>", + op->client_name, op->originator, op->id, op->state, pcmk_strerror(rc)); + goto remote_op_done_cleanup; + } + + if(!op->delegate && data) { + op->delegate = crm_element_value_copy(data, F_ORIG); + } + + if(data == NULL) { + data = create_xml_node(NULL, "remote-op"); + local_data = data; + } + + /* Tell everyone the operation is done, we will continue + * with doing the local notifications once we receive + * the broadcast back. 
*/ + subt = crm_element_value(data, F_SUBTYPE); + if(dup == FALSE && safe_str_neq(subt, "broadcast")) { + /* Defer notification until the bcast message arrives */ + bcast_result_to_peers(op, rc); + goto remote_op_done_cleanup; + } + + if(rc == pcmk_ok || dup) { + level = LOG_NOTICE; + } else if(safe_str_neq(op->originator, stonith_our_uname)) { + level = LOG_NOTICE; + } + + do_crm_log(level, + "Operation %s of %s by %s for %s@%s.%.8s: %s", + op->action, op->target, op->delegate?op->delegate:"<no-one>", + op->client_name, op->originator, op->id, pcmk_strerror(rc)); + + handle_local_reply_and_notify(op, data, rc); + + if (dup == FALSE) { + handle_duplicates(op, data, rc); + } /* Free non-essential parts of the record * Keep the record around so we can query the history @@ -234,6 +297,8 @@ static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int d op->request = NULL; } +remote_op_done_cleanup: + free_xml(local_data); } static gboolean remote_op_timeout_one(gpointer userdata) @@ -321,6 +386,60 @@ static int stonith_topology_next(remote_fencing_op_t *op) return -EINVAL; } +/*! + * \brief Check to see if this operation is a duplicate of another in flight + * operation. If so merge this operation into the inflight operation, and mark + * it as a duplicate. + */ +static void +merge_duplicates(remote_fencing_op_t *op) +{ + GHashTableIter iter; + remote_fencing_op_t *other = NULL; + g_hash_table_iter_init(&iter, remote_op_list); + while(g_hash_table_iter_next(&iter, NULL, (void**)&other)) { + if(other->state > st_exec) { + /* Must be in-progress */ + continue; + } else if (safe_str_neq(op->target, other->target)) { + /* Must be for the same node */ + continue; + } else if(safe_str_neq(op->action, other->action)) { + crm_trace("Must be for the same action: %s vs. 
", op->action, other->action); + continue; + } else if(safe_str_eq(op->client_name, other->client_name)) { + crm_trace("Must be for different clients: %s", op->client_name); + continue; + } else if(safe_str_eq(other->target, other->originator)) { + crm_trace("Can't be a suicide operation: %s", other->target); + continue; + } + + /* There is another in-flight request to fence the same host + * Piggyback on that instead. If it fails, so do we. + */ + other->duplicates = g_list_append(other->duplicates, op); + if(other->total_timeout == 0) { + crm_trace("Making a best-guess as to the timeout used"); + other->total_timeout = op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL, op->base_timeout); + } + crm_notice("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)", + op->action, op->target, op->client_name, op->id, other->client_name, other->originator, other->id, other->total_timeout); + report_timeout_period(op, other->total_timeout); + op->state = st_duplicate; + } +} + +/*! + * \internal + * \brief Create a new remote stonith op + * \param client, The local stonith client id that initiated the operation + * \param request, The request from the client that started the operation + * \param peer, Is this operation owned by another stonith peer? Operations + * owned by other peers are stored on all the stonith nodes, but only the + * owner executes the operation. All the nodes get the results of the operation + * once the owner finishes executing it. + */ void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer) { remote_fencing_op_t *op = NULL; @@ -331,26 +450,25 @@ void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean pe crm_str_hash, g_str_equal, NULL, free_remote_op); } + /* If this operation is owned by another node, check to make + * sure we haven't already created this operation. 
*/ if(peer && dev) { - const char *peer_id = crm_element_value(dev, F_STONITH_REMOTE); - CRM_CHECK(peer_id != NULL, return NULL); + const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); + CRM_CHECK(op_id != NULL, return NULL); - op = g_hash_table_lookup(remote_op_list, peer_id); + op = g_hash_table_lookup(remote_op_list, op_id); if(op) { - crm_debug("%s already exists", peer_id); + crm_debug("%s already exists", op_id); return op; } } op = calloc(1, sizeof(remote_fencing_op_t)); - op->op_timer_total = -1; - op->query_timer = -1; - crm_element_value_int(request, F_STONITH_TIMEOUT, (int*)&(op->base_timeout)); if(peer && dev) { - op->id = crm_element_value_copy(dev, F_STONITH_REMOTE); + op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID); } else { op->id = crm_generate_uuid(); } @@ -363,7 +481,7 @@ void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean pe op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN); if(op->originator == NULL) { - /* Local request */ + /* Local or relayed request */ op->originator = strdup(stonith_our_uname); } @@ -395,43 +513,8 @@ void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean pe } } - { - GHashTableIter iter; - remote_fencing_op_t *other = NULL; - g_hash_table_iter_init(&iter, remote_op_list); - while(g_hash_table_iter_next(&iter, NULL, (void**)&other)) { - if(other->state > st_exec) { - /* Must be in-progress */ - continue; - } else if (safe_str_neq(op->target, other->target)) { - /* Must be for the same node */ - continue; - } else if(safe_str_neq(op->action, other->action)) { - crm_trace("Must be for the same action: %s vs. 
", op->action, other->action); - continue; - } else if(safe_str_eq(op->client_name, other->client_name)) { - crm_trace("Must be for different clients: %s", op->client_name); - continue; - } else if(safe_str_eq(other->target, other->originator)) { - crm_trace("Can't be a suicide operation: %s", other->target); - continue; - } - - /* There is another in-flight request to fence the same host - * Piggyback on that instead. If it fails, so do we. - */ - other->duplicates = g_list_append(other->duplicates, op); - if(other->total_timeout == 0) { - crm_trace("Making a best-guess as to the timeout used"); - other->total_timeout = op->total_timeout = 1.2 * get_op_total_timeout(op, NULL, op->base_timeout); - } - crm_notice("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)", - op->action, op->target, op->client_name, op->id, other->client_name, other->originator, other->id, other->total_timeout); - report_timeout_period(op, other->total_timeout); - op->state = st_duplicate; - return op; - } - } + /* check to see if this is a duplicate operation of another in-flight operation */ + merge_duplicates(op); return op; } @@ -449,7 +532,8 @@ remote_fencing_op_t *initiate_remote_stonith_op(stonith_client_t *client, xmlNod } CRM_LOG_ASSERT(client_id != NULL); - op = create_remote_stonith_op(client_id, request, FALSE); + op = create_remote_stonith_op(client_id, request, FALSE); + op->owner = TRUE; CRM_CHECK(op->action, return NULL); @@ -460,6 +544,7 @@ remote_fencing_op_t *initiate_remote_stonith_op(stonith_client_t *client, xmlNod switch(op->state) { case st_failed: crm_warn("Initiation of remote operation %s for %s: failed (%s)", op->action, op->target, op->id); + remote_op_done(op, NULL, -EINVAL, FALSE); return op; case st_duplicate: @@ -479,7 +564,7 @@ remote_fencing_op_t *initiate_remote_stonith_op(stonith_client_t *client, xmlNod crm_xml_add(query, F_STONITH_DEVICE, "manual_ack"); } - crm_xml_add(query, 
F_STONITH_REMOTE, op->id); + crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(query, F_STONITH_TARGET, op->target); crm_xml_add(query, F_STONITH_ACTION, op->action); crm_xml_add(query, F_STONITH_ORIGIN, op->originator); @@ -533,9 +618,9 @@ static st_query_result_t *stonith_choose_peer(remote_fencing_op_t *op) && stonith_topology_next(op) == pcmk_ok); if(op->devices) { - crm_trace("Couldn't find anyone to fence %s with %s", op->target, (char*)op->devices->data); + crm_debug("Couldn't find anyone to fence %s with %s", op->target, (char*)op->devices->data); } else { - crm_trace("Couldn't find anyone to fence %s", op->target); + crm_debug("Couldn't find anyone to fence %s", op->target); } return NULL; @@ -635,7 +720,7 @@ report_timeout_period(remote_fencing_op_t *op, int op_timeout) /* The client is connected to another node, relay this update to them */ update = stonith_create_op(0, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0); - crm_xml_add(update, F_STONITH_REMOTE, op->id); + crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(update, F_STONITH_CLIENTID, client_id); crm_xml_add(update, F_STONITH_CALLID, call_id); crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout); @@ -661,9 +746,9 @@ void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer) peer = stonith_choose_peer(op); } - if(op->op_timer_total <= 0) { + if(!op->op_timer_total) { int t = get_op_total_timeout(op, peer, op->base_timeout); - op->total_timeout = 1.2 * t; + op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * t; op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op); report_timeout_period(op, op->total_timeout); crm_info("Total remote op timeout set to %d for fencing of node %s for %s.%.8s", @@ -681,10 +766,10 @@ void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer) } if(peer) { - int t = 1.2 * get_device_timeout(peer, device, op->base_timeout); + int t = TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(peer, 
device, op->base_timeout); xmlNode *query = stonith_create_op(0, op->id, STONITH_OP_FENCE, NULL, 0); - crm_xml_add(query, F_STONITH_REMOTE, op->id); + crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(query, F_STONITH_TARGET, op->target); crm_xml_add(query, F_STONITH_ACTION, op->action); crm_xml_add(query, F_STONITH_ORIGIN, op->originator); @@ -705,7 +790,7 @@ void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer) } op->state = st_exec; - if(op->op_timer_one > 0) { + if(op->op_timer_one) { g_source_remove(op->op_timer_one); } op->op_timer_one = g_timeout_add((1000 * t), remote_op_timeout_one, op); @@ -714,10 +799,10 @@ void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer) free_xml(query); return; - } else if(op->query_timer < 0) { + } else if(!op->owner) { crm_err("The termination of %s for %s is not ours to control", op->target, op->client_name); - } else if(op->query_timer == 0) { + } else if(!op->query_timer) { CRM_LOG_ASSERT(op->state < st_done); /* We've exhausted all available peers */ @@ -748,6 +833,37 @@ static gint sort_peers(gconstpointer a, gconstpointer b) return 0; } +static gboolean +all_topology_devices_found(remote_fencing_op_t *op) +{ + GListPtr device = NULL; + GListPtr iter = NULL; + GListPtr match = NULL; + stonith_topology_t *tp = NULL; + int i; + + tp = g_hash_table_lookup(topology, op->target); + + if (!tp) { + return FALSE; + } + + for (i = 0; i < ST_LEVEL_MAX; i++) { + for (device = tp->levels[i]; device; device = device->next) { + match = NULL; + for(iter = op->query_results; iter != NULL && match == NULL; iter = iter->next) { /* stop at the first peer whose device_list has this device, so a later peer cannot overwrite the hit */ + st_query_result_t *peer = iter->data; + match = g_list_find_custom(peer->device_list, device->data, sort_strings); + } + if (!match) { + return FALSE; + } + } + } + + return TRUE; +} + int process_remote_stonith_query(xmlNode *msg) { int devices = 0; @@ -755,12 +871,12 @@ int process_remote_stonith_query(xmlNode *msg) const char *host = NULL; remote_fencing_op_t *op = NULL; 
st_query_result_t *result = NULL; - xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR); + xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); xmlNode *child = NULL; CRM_CHECK(dev != NULL, return -EPROTO); - id = crm_element_value(dev, F_STONITH_REMOTE); + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); CRM_CHECK(id != NULL, return -EPROTO); dev = get_xpath_object("//@st-available-devices", msg, LOG_ERR); @@ -817,7 +933,16 @@ int process_remote_stonith_query(xmlNode *msg) op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); - if(op->state == st_query && is_set(op->call_options, st_opt_all_replies) == FALSE) { + /* All the query results are in for the topology, start the fencing ops. */ + if(is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, + * it is possible fencing levels will be skipped because of the missing + * query results. */ + if (op->state == st_query && all_topology_devices_found(op)) { + call_remote_stonith(op, result); + } + /* We have a result for a non-topology fencing op, start fencing */ + } else if(op->state == st_query) { call_remote_stonith(op, result); } else if(op->state == st_done) { @@ -834,11 +959,11 @@ int process_remote_stonith_exec(xmlNode *msg) const char *id = NULL; const char *device = NULL; remote_fencing_op_t *op = NULL; - xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR); + xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); CRM_CHECK(dev != NULL, return -EPROTO); - id = crm_element_value(dev, F_STONITH_REMOTE); + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); CRM_CHECK(id != NULL, return -EPROTO); dev = get_xpath_object("//@"F_STONITH_RC, msg, LOG_ERR); @@ -979,3 +1104,36 @@ int stonith_fence_history(xmlNode *msg, xmlNode **output) return rc; } +gboolean +stonith_check_fence_tolerance(int tolerance, + const char *target, + const char *action) +{ + 
GHashTableIter iter; + time_t now = time(NULL); + remote_fencing_op_t *rop = NULL; + + crm_trace("tolerance=%d, remote_op_list=%p", tolerance, remote_op_list); + + if (tolerance <= 0 || !remote_op_list || target == NULL || action == NULL) { + return FALSE; + } + + g_hash_table_iter_init(&iter, remote_op_list); + while(g_hash_table_iter_next(&iter, NULL, (void**)&rop)) { + if(strcmp(rop->target, target) != 0) { + continue; + } else if(rop->state != st_done) { + continue; + } else if(strcmp(rop->action, action) != 0) { + continue; + } else if((rop->completed + tolerance) < now) { + continue; + } + + crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s", + target, action, tolerance, rop->delegate, rop->originator); + return TRUE; + } + return FALSE; +} diff --git a/include/crm/cib.h b/include/crm/cib.h index 5e55774..5e1c30a 100644 --- a/include/crm/cib.h +++ b/include/crm/cib.h @@ -65,7 +65,7 @@ enum cib_call_options { }; #define cib_default_options = cib_none -#define CIB_OP_NOTIFY "cib_notify" +#define T_CIB_DIFF_NOTIFY "cib_diff_notify" /* *INDENT-ON* */ diff --git a/include/crm/cib/internal.h b/include/crm/cib/internal.h index 3f2a83e..10d4d49 100644 --- a/include/crm/cib/internal.h +++ b/include/crm/cib/internal.h @@ -36,6 +36,7 @@ # define CIB_OP_APPLY_DIFF "cib_apply_diff" # define CIB_OP_UPGRADE "cib_upgrade" # define CIB_OP_DELETE_ALT "cib_delete_alt" +# define CIB_OP_NOTIFY "cib_notify" # define F_CIB_CLIENTID "cib_clientid" # define F_CIB_CALLOPTS "cib_callopt" @@ -69,7 +70,6 @@ # define T_CIB_PRE_NOTIFY "cib_pre_notify" # define T_CIB_POST_NOTIFY "cib_post_notify" # define T_CIB_UPDATE_CONFIRM "cib_update_confirmation" -# define T_CIB_DIFF_NOTIFY "cib_diff_notify" # define T_CIB_REPLACE_NOTIFY "cib_refresh_notify" # define cib_channel_ro "cib_ro" diff --git a/include/crm/cib/util.h b/include/crm/cib/util.h index e917792..87d96de 100644 --- a/include/crm/cib/util.h +++ b/include/crm/cib/util.h @@ -71,7 +71,6 @@ int 
set_standby(cib_t * the_cib, const char *uuid, const char *scope, const char xmlNode *get_cib_copy(cib_t * cib); xmlNode *cib_get_generation(cib_t * cib); -gboolean determine_host(cib_t * cib_conn, char **node_uname, char **node_uuid); void cib_metadata(void); const char *cib_pref(GHashTable * options, const char *name); diff --git a/include/crm/cluster.h b/include/crm/cluster.h index 389fde3..386436b 100644 --- a/include/crm/cluster.h +++ b/include/crm/cluster.h @@ -170,4 +170,7 @@ gboolean is_openais_cluster(void); gboolean is_classic_ais_cluster(void); gboolean is_heartbeat_cluster(void); +char *get_local_node_name(void); +char *get_node_name(uint32_t nodeid); + #endif diff --git a/include/crm/cluster/internal.h b/include/crm/cluster/internal.h index 9640b87..5909d1e 100644 --- a/include/crm/cluster/internal.h +++ b/include/crm/cluster/internal.h @@ -25,6 +25,10 @@ # define AIS_IPC_MESSAGE_SIZE 8192*128 # define CRM_MESSAGE_IPC_ACK 0 +#ifndef INTERFACE_MAX +# define INTERFACE_MAX 2 /* from the private coroapi.h header */ +#endif + typedef struct crm_ais_host_s AIS_Host; typedef struct crm_ais_msg_s AIS_Message; @@ -344,6 +348,12 @@ gboolean heartbeat_initialize_nodelist(void *cluster, gboolean force_member, xml # if SUPPORT_COROSYNC +# if SUPPORT_PLUGIN +char *classic_node_name(uint32_t nodeid); +# else +char *corosync_node_name(uint64_t /*cmap_handle_t*/ cmap_handle, uint32_t nodeid); +# endif + gboolean corosync_initialize_nodelist(void *cluster, gboolean force_member, xmlNode *xml_parent); gboolean send_ais_message(xmlNode * msg, gboolean local, @@ -356,6 +366,10 @@ gboolean init_cs_connection(crm_cluster_t *cluster); gboolean init_cs_connection_once(crm_cluster_t *cluster); # endif +#ifdef SUPPORT_CMAN +char *cman_node_name(uint32_t nodeid); +#endif + enum crm_quorum_source { crm_quorum_cman, crm_quorum_corosync, @@ -381,4 +395,7 @@ gboolean init_quorum_connection( gboolean(*dispatch) (unsigned long long, gboolean), void (*destroy) (gpointer)); void 
set_node_uuid(const char *uname, const char *uuid); + +gboolean node_name_is_valid(const char *key, const char *name); + #endif diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h index cb8abff..09e6ca6 100644 --- a/include/crm/fencing/internal.h +++ b/include/crm/fencing/internal.h @@ -58,7 +58,7 @@ xmlNode *create_device_registration_xml(const char *id, const char *namespace, c #define F_STONITH_CALLDATA "st_calldata" #define F_STONITH_OPERATION "st_op" #define F_STONITH_TARGET "st_target" -#define F_STONITH_REMOTE "st_remote_op" +#define F_STONITH_REMOTE_OP_ID "st_remote_op" #define F_STONITH_RC "st_rc" /*! Timeout period per a device execution */ #define F_STONITH_TIMEOUT "st_timeout" diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h index bafea57..75e51dc 100644 --- a/include/crm/pengine/internal.h +++ b/include/crm/pengine/internal.h @@ -231,4 +231,6 @@ extern ticket_t *ticket_new(const char * ticket_id, pe_working_set_t * data_set) char *clone_strip(const char *last_rsc_id); char *clone_zero(const char *last_rsc_id); +gint sort_node_uname(gconstpointer a, gconstpointer b); + #endif diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h index a4d5930..164ec65 100644 --- a/include/crm/stonith-ng.h +++ b/include/crm/stonith-ng.h @@ -41,7 +41,7 @@ enum stonith_call_options { st_opt_manual_ack = 0x00000008, st_opt_discard_reply = 0x00000010, - st_opt_all_replies = 0x00000020, +/* st_opt_all_replies = 0x00000020, */ st_opt_topology = 0x00000040, st_opt_scope_local = 0x00000100, st_opt_cs_nodeid = 0x00000200, @@ -54,7 +54,7 @@ enum stonith_call_options { }; #define stonith_default_options = stonith_none - +/*! 
Order matters here, do not change values */ enum op_state { st_query, diff --git a/include/crm_internal.h b/include/crm_internal.h index c5a9abd..4c9c3a8 100644 --- a/include/crm_internal.h +++ b/include/crm_internal.h @@ -201,6 +201,10 @@ char *generate_hash_key(const char *crm_msg_reference, const char *sys); xmlNode *crm_recv_remote_msg(void *session, gboolean encrypted); void crm_send_remote_msg(void *session, xmlNode * msg, gboolean encrypted); +const char *daemon_option(const char *option); +void set_daemon_option(const char *option, const char *value); +gboolean daemon_option_enabled(const char *daemon, const char *option); + # define crm_config_err(fmt...) { crm_config_error = TRUE; crm_err(fmt); } # define crm_config_warn(fmt...) { crm_config_warning = TRUE; crm_warn(fmt); } diff --git a/include/portability.h b/include/portability.h index 814c5d2..741a17a 100644 --- a/include/portability.h +++ b/include/portability.h @@ -74,8 +74,9 @@ char *strndup(const char *str, size_t len); # define USE_GNU # endif -# if !HAVE_LIBGLIB_2_0 +# if NEED_G_HASH_ITER +# include <glib.h> typedef struct fake_ghi { GHashTable *hash; int nth; /* current index over the iteration */ @@ -92,6 +93,7 @@ g_hash_prepend_value(gpointer key, gpointer value, gpointer user_data) *values = g_list_prepend(*values, value); } +/* Since: 2.14 */ static inline GList * g_hash_table_get_values(GHashTable * hash_table) { @@ -114,6 +116,7 @@ g_hash_table_nth_data(gpointer key, gpointer value, gpointer user_data) return FALSE; } +/* Since: 2.16 */ static inline void g_hash_table_iter_init(GHashTableIter * iter, GHashTable * hash_table) { @@ -143,10 +146,31 @@ g_hash_table_iter_next(GHashTableIter * iter, gpointer * key, gpointer * value) return found; } +/* Since: 2.16 */ +static inline void +g_hash_table_iter_remove (GHashTableIter *iter) +{ + g_hash_table_remove(iter->hash, iter->key); + iter->nth--; /* Or zero to be safe? 
*/ +} + +/* Since: 2.16 */ +static inline int +g_strcmp0 (const char *str1, + const char *str2) +{ + if (!str1) + return -(str1 != str2); + if (!str2) + return str1 != str2; + return strcmp (str1, str2); +} # endif /* !HAVE_LIBGLIB_2_0 */ #ifdef NEED_G_LIST_FREE_FULL # include <glib.h> +# include <string.h> +/* Since: 2.28 */ static inline void g_list_free_full(GList *list, GDestroyNotify free_func) { g_list_foreach(list, (GFunc) free_func, NULL); diff --git a/lib/cib/cib_attrs.c b/lib/cib/cib_attrs.c index 1edbc02..abef1c2 100644 --- a/lib/cib/cib_attrs.c +++ b/lib/cib/cib_attrs.c @@ -451,6 +451,13 @@ query_node_uuid(cib_t * the_cib, const char *uname, char **uuid) } } } + + if (rc != pcmk_ok) { + crm_debug("Could not map name=%s to a UUID: %s\n", uname, pcmk_strerror(rc)); + } else { + crm_info("Mapped %s to %s", uname, *uuid); + } + free_xml(fragment); return rc; } diff --git a/lib/cib/cib_utils.c b/lib/cib/cib_utils.c index 30381d9..f3d7a32 100644 --- a/lib/cib/cib_utils.c +++ b/lib/cib/cib_utils.c @@ -674,35 +674,6 @@ cib_native_notify(gpointer data, gpointer user_data) crm_trace("Callback invoked..."); } -gboolean -determine_host(cib_t * cib_conn, char **node_uname, char **node_uuid) -{ - CRM_CHECK(node_uname != NULL, return FALSE); - - if (*node_uname == NULL) { - struct utsname name; - - if (uname(&name) < 0) { - crm_perror(LOG_ERR, "uname(2) call failed"); - return FALSE; - } - *node_uname = strdup(name.nodename); - crm_info("Detected uname: %s", *node_uname); - } - - if (cib_conn && *node_uname != NULL && node_uuid != NULL && *node_uuid == NULL) { - int rc = query_node_uuid(cib_conn, *node_uname, node_uuid); - - if (rc != pcmk_ok) { - fprintf(stderr, "Could not map uname=%s to a UUID: %s\n", - *node_uname, pcmk_strerror(rc)); - return FALSE; - } - crm_info("Mapped %s to %s", *node_uname, crm_str(*node_uuid)); - } - return TRUE; -} - pe_cluster_option cib_opts[] = { /* name, old-name, validate, default, description */ {"enable-acl", NULL, "boolean", NULL, 
"false", &check_boolean, diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c index 28e072a..4cd1633 100644 --- a/lib/cluster/cluster.c +++ b/lib/cluster/cluster.c @@ -19,13 +19,14 @@ #include <crm_internal.h> #include <dlfcn.h> -#include <sys/param.h> #include <stdio.h> -#include <sys/types.h> #include <unistd.h> #include <string.h> #include <stdlib.h> #include <time.h> +#include <sys/param.h> +#include <sys/types.h> +#include <sys/utsname.h> #include <crm/crm.h> #include <crm/msg_xml.h> @@ -344,10 +345,89 @@ get_uuid(const char *uname) return get_node_uuid(0, uname); } +char * +get_local_node_name(void) +{ + int rc; + char *name = NULL; + struct utsname res; + enum cluster_type_e stack = get_cluster_type(); + + switch(stack) { + +#if SUPPORT_CMAN + case pcmk_cluster_cman: + name = cman_node_name(0 /* AKA. CMAN_NODEID_US */); + break; +#endif + +#if SUPPORT_COROSYNC +# if !SUPPORT_PLUGIN + case pcmk_cluster_corosync: + name = corosync_node_name(0, 0); + break; +# endif +#endif + case pcmk_cluster_heartbeat: + case pcmk_cluster_classic_ais: + rc = uname(&res); + if(rc == 0) { + name = strdup(res.nodename); + } + break; + default: + crm_err("Unknown cluster type: %s (%d)", name_for_cluster_type(stack), stack); + } + + if(name == NULL) { + crm_err("Could not obtain the local %s node name", name_for_cluster_type(stack)); + exit(100); + } + return name; +} + +char * +get_node_name(uint32_t nodeid) +{ + char *name = NULL; + enum cluster_type_e stack = get_cluster_type(); + + switch (stack) { + case pcmk_cluster_heartbeat: + break; + +#if SUPPORT_PLUGIN + case pcmk_cluster_classic_ais: + name = classic_node_name(nodeid); + break; +#else + case pcmk_cluster_corosync: + name = corosync_node_name(0, nodeid); + break; +#endif + +#if SUPPORT_CMAN + case pcmk_cluster_cman: + name = cman_node_name(nodeid); + break; +#endif + + default: + crm_err("Unknown cluster type: %s (%d)", name_for_cluster_type(stack), stack); + } + + if(name == NULL) { + crm_notice("Could not obtain 
a node name for %s nodeid %u", + name_for_cluster_type(stack), nodeid); + } + return name; +} + +/* Only used by update_failcount() in te_utils.c */ const char * get_uname(const char *uuid) { - const char *uname = NULL; + char *uname = NULL; if (crm_uname_cache == NULL) { crm_uname_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, @@ -362,48 +442,48 @@ get_uname(const char *uuid) crm_trace("%s = %s (cached)", uuid, uname); return uname; } + #if SUPPORT_COROSYNC if (is_openais_cluster()) { if (!uname_is_uuid() && is_corosync_cluster()) { uint32_t id = crm_int_helper(uuid, NULL); crm_node_t *node = g_hash_table_lookup(crm_peer_id_cache, GUINT_TO_POINTER(id)); - uname = node ? node->uname : NULL; - } else { - uname = uuid; - } + if(node && node->uname) { + uname = strdup(node->uname); + } - if (uname) { - crm_trace("Storing %s = %s", uuid, uname); - g_hash_table_insert(crm_uname_cache, strdup(uuid), strdup(uname)); + } else { + uname = strdup(uuid); } } #endif #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { - if (heartbeat_cluster != NULL && uuid != NULL) { + if (heartbeat_cluster != NULL) { cl_uuid_t uuid_raw; - char *hb_uname = NULL; char *uuid_copy = strdup(uuid); cl_uuid_parse(uuid_copy, &uuid_raw); - hb_uname = malloc( MAX_NAME); + uname = malloc( MAX_NAME); - if (heartbeat_cluster->llc_ops->get_name_by_uuid(heartbeat_cluster, &uuid_raw, hb_uname, - MAX_NAME) == HA_FAIL) { + if (heartbeat_cluster->llc_ops->get_name_by_uuid( + heartbeat_cluster, &uuid_raw, uname, MAX_NAME) == HA_FAIL) { crm_err("Could not calculate uname for %s", uuid); free(uuid_copy); - free(hb_uname); - - } else { - crm_trace("Storing %s = %s", uuid, uname); - g_hash_table_insert(crm_uname_cache, uuid_copy, hb_uname); + free(uname); + uname = NULL; } } } #endif - return g_hash_table_lookup(crm_uname_cache, uuid); + + if (uname) { + crm_trace("Storing %s = %s", uuid, uname); + g_hash_table_insert(crm_uname_cache, strdup(uuid), uname); + } + return uname; } void @@ -563,3 +643,24 @@ 
is_heartbeat_cluster(void) { return get_cluster_type() == pcmk_cluster_heartbeat; } + +gboolean +node_name_is_valid(const char *key, const char *name) +{ + int octet; + + if(name == NULL) { + crm_trace("%s is empty", key); + return FALSE; + + } else if(sscanf(name, "%d.%d.%d.%d", &octet, &octet, &octet, &octet) == 4) { + crm_trace("%s contains an ipv4 address, ignoring: %s", key, name); + return FALSE; + + } else if(strstr(name, ":") != NULL) { + crm_trace("%s contains an ipv6 address, ignoring: %s", key, name); + return FALSE; + } + crm_trace("%s is valid", key); + return TRUE; +} diff --git a/lib/cluster/corosync.c b/lib/cluster/corosync.c index 7ea1f3e..a838638 100644 --- a/lib/cluster/corosync.c +++ b/lib/cluster/corosync.c @@ -66,36 +66,11 @@ static uint32_t pcmk_nodeid = 0; } \ } while(counter < max) -#ifndef INTERFACE_MAX -# define INTERFACE_MAX 2 /* from the private coroapi.h header */ -#endif - -static gboolean -corosync_name_is_valid(const char *key, const char *name) -{ - int octet; - - if(name == NULL) { - crm_trace("%s is empty", key); - return FALSE; - - } else if(sscanf(name, "%d.%d.%d.%d", &octet, &octet, &octet, &octet) == 4) { - crm_trace("%s contains an ipv4 address, ignoring: %s", key, name); - return FALSE; - - } else if(strstr(name, ":") != NULL) { - crm_trace("%s contains an ipv4 address, ignoring: %s", key, name); - return FALSE; - } - crm_trace("%s is valid", key); - return TRUE; -} - /* * CFG functionality stolen from node_name() in corosync-quorumtool.c * This resolves the first address assigned to a node and returns the name or IP address. 
*/ -static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) +char *corosync_node_name(uint64_t /*cmap_handle_t*/ cmap_handle, uint32_t nodeid) { int lpc = 0; int rc = CS_OK; @@ -106,6 +81,29 @@ static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) corosync_cfg_handle_t cfg_handle = 0; static corosync_cfg_callbacks_t cfg_callbacks = {}; + /* nodeid == 0 == CMAN_NODEID_US */ + if(nodeid == 0 && pcmk_nodeid) { + nodeid = pcmk_nodeid; + + } else if(nodeid == 0) { + /* Look it up */ + int rc = -1; + int retries = 0; + cpg_handle_t handle = 0; + cpg_callbacks_t cb = {}; + + cs_repeat(retries, 5, rc = cpg_initialize(&handle, &cb)); + if (rc == CS_OK) { + retries = 0; + cs_repeat(retries, 5, rc = cpg_local_get(handle, &pcmk_nodeid)); + } + + if (rc != CS_OK) { + crm_err("Could not get local node id from the CPG API: %d", rc); + } + cpg_finalize(handle); + } + if(cmap_handle == 0 && local_handle == 0) { retries = 0; crm_trace("Initializing CMAP connection"); @@ -120,7 +118,7 @@ static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) } while(retries < 5 && rc != CS_OK); if (rc != CS_OK) { - crm_warn("Could not connect to Cluster Configuration Database API, error %d", cs_strerror(rc)); + crm_warn("Could not connect to Cluster Configuration Database API, error %s", cs_strerror(rc)); local_handle = 0; } } @@ -149,7 +147,7 @@ static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) rc = cmap_get_string(cmap_handle, key, &name); crm_trace("%s = %s", key, name); - if(corosync_name_is_valid(key, name) == FALSE) { + if(node_name_is_valid(key, name) == FALSE) { free(name); name = NULL; } g_free(key); @@ -160,7 +158,7 @@ static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) rc = cmap_get_string(cmap_handle, key, &name); crm_trace("%s = %s %d", key, name, rc); - if(corosync_name_is_valid(key, name) == FALSE) { + if(node_name_is_valid(key, name) == FALSE) { free(name); name = NULL; } 
g_free(key); @@ -210,14 +208,14 @@ static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) if (getnameinfo((struct sockaddr *)addrs[0].address, addrlen, buf, sizeof(buf), NULL, 0, 0) == 0) { crm_notice("Inferred node name '%s' for nodeid %u from DNS", buf, nodeid); - if(corosync_name_is_valid("DNS", buf)) { + if(node_name_is_valid("DNS", buf)) { name = strdup(buf); } } } else { crm_debug("Unable to get node address for nodeid %u: %s", nodeid, cs_strerror(rc)); } - cmap_finalize(cfg_handle); + corosync_cfg_finalize(cfg_handle); } if(local_handle) { @@ -886,7 +884,6 @@ init_cs_connection(crm_cluster_t *cluster) gboolean init_cs_connection_once(crm_cluster_t *cluster) { - struct utsname res; enum cluster_type_e stack = get_cluster_type(); crm_peer_init(); @@ -899,14 +896,8 @@ init_cs_connection_once(crm_cluster_t *cluster) if (init_cpg_connection(cluster->cs_dispatch, cluster->destroy, &pcmk_nodeid) == FALSE) { return FALSE; - } else if (uname(&res) < 0) { - crm_perror(LOG_ERR, "Could not determin the current host"); - exit(100); - - } else { - pcmk_uname = strdup(res.nodename); } - + pcmk_uname = get_local_node_name(); crm_info("Connection to '%s': established", name_for_cluster_type(stack)); CRM_ASSERT(pcmk_uname != NULL); diff --git a/lib/cluster/legacy.c b/lib/cluster/legacy.c index 9dbdd7f..6bb54ce 100644 --- a/lib/cluster/legacy.c +++ b/lib/cluster/legacy.c @@ -23,11 +23,14 @@ #include <crm/cluster.h> #include <crm/common/mainloop.h> #include <sys/utsname.h> +#include <sys/socket.h> +#include <netdb.h> #if SUPPORT_COROSYNC # include <corosync/confdb.h> # include <corosync/corodefs.h> -# include <corosync/cpg.h> +# include <corosync/cpg.h> +# include <corosync/cfg.h> cpg_handle_t pcmk_cpg_handle = 0; struct cpg_name pcmk_cpg_group = { @@ -1059,37 +1062,95 @@ init_cs_connection(crm_cluster_t *cluster) return FALSE; } -static char * -get_local_node_name(void) +char *classic_node_name(uint32_t nodeid) { + int rc = CS_OK; + int retries = 0; char 
*name = NULL; - struct utsname res; - if (is_cman_cluster()) { -# if SUPPORT_CMAN - cman_node_t us; - cman_handle_t cman; + corosync_cfg_handle_t cfg_handle = 0; + static corosync_cfg_callbacks_t cfg_callbacks = {}; + + if(nodeid == 0 /* AKA. CMAN_NODEID_US */) { + nodeid = pcmk_nodeid; + } + + if(name == NULL) { + retries = 0; + crm_trace("Initializing CFG connection"); + do { + rc = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks); + if(rc != CS_OK) { + retries++; + crm_debug("API connection setup failed: %d. Retrying in %ds", rc, retries); + sleep(retries); + } + + } while(retries < 5 && rc != CS_OK); + + if (rc != CS_OK) { + crm_warn("Could not connect to the Corosync CFG API, error %d", rc); + cfg_handle = 0; + } + } + + if(name == NULL && cfg_handle != 0) { + int numaddrs; + char buf[INET6_ADDRSTRLEN]; + + socklen_t addrlen; + struct sockaddr_storage *ss; + corosync_cfg_node_address_t addrs[INTERFACE_MAX]; + + rc = corosync_cfg_get_node_addrs(cfg_handle, nodeid, INTERFACE_MAX, &numaddrs, addrs); + if (rc == CS_OK) { + ss = (struct sockaddr_storage *)addrs[0].address; + if (ss->ss_family == AF_INET6) { + addrlen = sizeof(struct sockaddr_in6); + } else { + addrlen = sizeof(struct sockaddr_in); + } - cman = cman_init(NULL); - if (cman != NULL && cman_is_active(cman)) { - us.cn_name[0] = 0; - cman_get_node(cman, CMAN_NODEID_US, &us); - name = strdup(us.cn_name); - crm_info("Using CMAN node name: %s", name); + if (getnameinfo((struct sockaddr *)addrs[0].address, addrlen, buf, sizeof(buf), NULL, 0, 0) == 0) { + crm_notice("Inferred node name '%s' for nodeid %u from DNS", buf, nodeid); + if(node_name_is_valid("DNS", buf)) { + name = strdup(buf); + } + } } else { - crm_err("Couldn't determin node name from CMAN"); + crm_debug("Unable to get node address for nodeid %u: %d", nodeid, rc); } + corosync_cfg_finalize(cfg_handle); + } - cman_finish(cman); -# endif + if(name == NULL) { + crm_debug("Unable to get node name for nodeid %u", nodeid); + } + return name; +} - 
} else if (uname(&res) < 0) { - crm_perror(LOG_ERR, "Could not determin the current host"); - exit(100); +char *cman_node_name(uint32_t nodeid) +{ + char *name = NULL; - } else { - name = strdup(res.nodename); +# if SUPPORT_CMAN + cman_node_t us; + cman_handle_t cman; + + cman = cman_init(NULL); + if (cman != NULL && cman_is_active(cman)) { + us.cn_name[0] = 0; + cman_get_node(cman, nodeid, &us); + name = strdup(us.cn_name); + crm_info("Using CMAN node name %s for %u", name, nodeid); + } + + cman_finish(cman); +# endif + + if(name == NULL) { + crm_debug("Unable to get node name for nodeid %u", nodeid); } return name; } @@ -1114,7 +1175,7 @@ init_cs_connection_once(crm_cluster_t *cluster) if (init_cpg_connection(cluster) == FALSE) { return FALSE; } - pcmk_uname = get_local_node_name(); + pcmk_uname = cman_node_name(0 /* CMAN_NODEID_US */); break; case pcmk_cluster_heartbeat: crm_info("Could not find an active corosync based cluster"); diff --git a/lib/common/iso8601.c b/lib/common/iso8601.c index 4dc49c1..47b1ce5 100644 --- a/lib/common/iso8601.c +++ b/lib/common/iso8601.c @@ -285,7 +285,7 @@ crm_time_get_seconds(crm_time_t * dt) return in_seconds; } -#define EPOCH_SECONDS 62135596800 /* Calculated using crm_time_get_seconds() */ +#define EPOCH_SECONDS 62135596800ULL /* Calculated using crm_time_get_seconds() */ unsigned long long crm_time_get_seconds_since_epoch(crm_time_t * dt) { diff --git a/lib/common/logging.c b/lib/common/logging.c index af80a54..3863482 100644 --- a/lib/common/logging.c +++ b/lib/common/logging.c @@ -95,7 +95,7 @@ crm_trigger_blackbox(int nsig) crm_write_blackbox(nsig, NULL); } -static const char * +const char * daemon_option(const char *option) { char env_name[NAME_MAX]; @@ -104,19 +104,46 @@ daemon_option(const char *option) snprintf(env_name, NAME_MAX, "PCMK_%s", option); value = getenv(env_name); if (value != NULL) { + crm_trace("Found %s = %s", env_name, value); return value; } snprintf(env_name, NAME_MAX, "HA_%s", option); value = 
getenv(env_name); if (value != NULL) { + crm_trace("Found %s = %s", env_name, value); return value; } + crm_trace("Nothing found for %s", option); return NULL; } -static gboolean +void +set_daemon_option(const char *option, const char *value) +{ + char env_name[NAME_MAX]; + + snprintf(env_name, NAME_MAX, "PCMK_%s", option); + if(value) { + crm_trace("Setting %s to %s", env_name, value); + setenv(env_name, value, 1); + } else { + crm_trace("Unsetting %s", env_name); + unsetenv(env_name); + } + + snprintf(env_name, NAME_MAX, "HA_%s", option); + if(value) { + crm_trace("Setting %s to %s", env_name, value); + setenv(env_name, value, 1); + } else { + crm_trace("Unsetting %s", env_name); + unsetenv(env_name); + } +} + +gboolean daemon_option_enabled(const char *daemon, const char *option) { const char *value = daemon_option(option); @@ -512,12 +539,12 @@ crm_update_callsites(void) static gboolean log = TRUE; if(log) { log = FALSE; - crm_info("Enabling callsites based on priority=%d, files=%s, functions=%s, formats=%s, tags=%s", - crm_log_level, - getenv("PCMK_trace_files"), - getenv("PCMK_trace_functions"), - getenv("PCMK_trace_formats"), - getenv("PCMK_trace_tags")); + crm_debug("Enabling callsites based on priority=%d, files=%s, functions=%s, formats=%s, tags=%s", + crm_log_level, + getenv("PCMK_trace_files"), + getenv("PCMK_trace_functions"), + getenv("PCMK_trace_formats"), + getenv("PCMK_trace_tags")); } qb_log_filter_fn_set(crm_log_filter); } @@ -540,6 +567,7 @@ crm_log_init(const char *entity, int level, gboolean daemon, gboolean to_stderr, int lpc = 0; const char *logfile = daemon_option("debugfile"); const char *facility = daemon_option("logfacility"); + const char *f_copy = facility; /* Redirect messages from glib functions to our handler */ #ifdef HAVE_G_LOG_SET_DEFAULT_HANDLER @@ -550,8 +578,11 @@ crm_log_init(const char *entity, int level, gboolean daemon, gboolean to_stderr, g_log_set_always_fatal((GLogLevelFlags) 0); /*value out of range */ if (facility == 
NULL) { - /* Set a default */ facility = "daemon"; + + } else if(safe_str_eq(facility, "none")) { + facility = "daemon"; + quiet = TRUE; } if (entity) { @@ -590,25 +621,28 @@ crm_log_init(const char *entity, int level, gboolean daemon, gboolean to_stderr, set_format_string(lpc, crm_system_name); } - if (quiet) { - /* Nuke any syslog activity */ - unsetenv("HA_logfacility"); - unsetenv("PCMK_logfacility"); - qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_FALSE); + crm_enable_stderr(to_stderr); - } else if(daemon) { - setenv("HA_logfacility", facility, TRUE); - setenv("PCMK_logfacility", facility, TRUE); + if(logfile) { + crm_add_logfile(logfile); } if (daemon_option_enabled(crm_system_name, "blackbox")) { crm_enable_blackbox(0); } + + crm_trace("Quiet: %d, facility %s", quiet, f_copy); + daemon_option("debugfile"); + daemon_option("logfacility"); - crm_enable_stderr(to_stderr); + if (quiet) { + /* Nuke any syslog activity */ + facility = NULL; + qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_FALSE); + } - if(logfile) { - crm_add_logfile(logfile); + if(daemon) { + set_daemon_option("logfacility", facility); } if(daemon diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c index 11ff24a..1f5360e 100644 --- a/lib/common/mainloop.c +++ b/lib/common/mainloop.c @@ -500,7 +500,7 @@ qb_ipcs_service_t *mainloop_add_ipc_server( rc = qb_ipcs_run(server); if (rc < 0) { - crm_err("Could not start %s IPC server: %s (%d)", name, strerror(rc), rc); + crm_err("Could not start %s IPC server: %s (%d)", name, pcmk_strerror(rc), rc); return NULL; } diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index ce60494..68d0753 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -1800,7 +1800,7 @@ xml_to_event(xmlNode *msg) event->action = crm_element_value_copy(data, F_STONITH_ACTION); event->target = crm_element_value_copy(data, F_STONITH_TARGET); event->executioner = crm_element_value_copy(data, F_STONITH_DELEGATE); - event->id = 
crm_element_value_copy(data, F_STONITH_REMOTE); + event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); event->client_origin = crm_element_value_copy(data, F_STONITH_CLIENTNAME); } else { crm_err("No data for %s event", ntype); diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c index 155928c..7ce2467 100644 --- a/lib/pengine/clone.c +++ b/lib/pengine/clone.c @@ -364,13 +364,6 @@ clone_print_xml(resource_t * rsc, const char *pre_text, long options, void *prin free(child_text); } -static gint sort_node_uname(gconstpointer a, gconstpointer b) -{ - node_t *node_a = (node_t*)a; - node_t *node_b = (node_t*)b; - return strcmp(node_a->details->uname, node_b->details->uname); -} - void clone_print(resource_t * rsc, const char *pre_text, long options, void *print_data) { diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 1c8dcf0..37728c3 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -190,15 +190,6 @@ unpack_config(xmlNode * config, pe_working_set_t * data_set) return TRUE; } -static gint -sort_nodes_uname(gconstpointer a, gconstpointer b) -{ - const node_t *na = a; - const node_t *nb = b; - - return strcmp(na->details->uname, nb->details->uname); -} - gboolean unpack_nodes(xmlNode * xml_nodes, pe_working_set_t * data_set) { @@ -290,7 +281,7 @@ unpack_nodes(xmlNode * xml_nodes, pe_working_set_t * data_set) unpack_instance_attributes(data_set->input, xml_obj, XML_TAG_UTILIZATION, NULL, new_node->details->utilization, NULL, FALSE, data_set->now); - data_set->nodes = g_list_insert_sorted(data_set->nodes, new_node, sort_nodes_uname); + data_set->nodes = g_list_insert_sorted(data_set->nodes, new_node, sort_node_uname); crm_trace("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME)); } } diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index e843969..70eec16 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -131,7 +131,7 @@ node_list_dup(GListPtr list1, gboolean reset, gboolean filter) return result; } 
-static gint +gint sort_node_uname(gconstpointer a, gconstpointer b) { const node_t *node_a = a; diff --git a/mcp/corosync.c b/mcp/corosync.c index bec24f6..7f83d2d 100644 --- a/mcp/corosync.c +++ b/mcp/corosync.c @@ -48,7 +48,7 @@ static struct cpg_name cpg_group = { .value[0] = 0, }; -gboolean use_cman = FALSE; +enum cluster_type_e stack = pcmk_cluster_unknown; static cpg_handle_t cpg_handle; static corosync_cfg_handle_t cfg_handle; @@ -403,7 +403,7 @@ config_find_next(confdb_handle_t config, const char *name, confdb_handle_t top_h } #else static int -get_config_opt(cmap_handle_t object_handle, const char *key, char **value, const char *fallback) +get_config_opt(uint64_t unused, cmap_handle_t object_handle, const char *key, char **value, const char *fallback) { int rc = 0, retries = 0; @@ -422,42 +422,12 @@ get_config_opt(cmap_handle_t object_handle, const char *key, char **value, const #endif -char * -get_local_node_name(void) -{ - char *name = NULL; - struct utsname res; - - if (use_cman) { -#if SUPPORT_CMAN - cman_node_t us; - cman_handle_t cman; - - cman = cman_init(NULL); - if (cman != NULL && cman_is_active(cman)) { - us.cn_name[0] = 0; - cman_get_node(cman, CMAN_NODEID_US, &us); - name = strdup(us.cn_name); - crm_info("Using CMAN node name: %s", name); - - } else { - crm_err("Couldn't determin node name from CMAN"); - } - - cman_finish(cman); +#if HAVE_CONFDB +# define KEY_PREFIX "" +#elif HAVE_CMAP +# define KEY_PREFIX "logging." 
#endif - } else if (uname(&res) < 0) { - crm_perror(LOG_ERR, "Could not determin the current host"); - exit(100); - - } else { - name = strdup(res.nodename); - } - return name; -} - - gboolean read_config(void) { @@ -465,13 +435,13 @@ read_config(void) int retries = 0; gboolean have_log = FALSE; + const char *const_value = NULL; + char *logging_debug = NULL; char *logging_logfile = NULL; char *logging_to_logfile = NULL; char *logging_to_syslog = NULL; - char *logging_syslog_facility = NULL; - - enum cluster_type_e stack = pcmk_cluster_unknown; + char *logging_syslog_facility = NULL; #if HAVE_CONFDB char *value = NULL; @@ -494,6 +464,7 @@ read_config(void) } while(retries < 5); #elif HAVE_CMAP cmap_handle_t local_handle; + uint64_t config = 0; /* There can be only one (possibility if confdb isn't around) */ do { @@ -524,19 +495,18 @@ read_config(void) /* =::=::= Should we be here =::=::= */ if (stack == pcmk_cluster_corosync) { - setenv("HA_cluster_type", "corosync", 1); - setenv("HA_quorum_type", "corosync", 1); + set_daemon_option("cluster_type", "corosync"); + set_daemon_option("quorum_type", "corosync"); #if HAVE_CONFDB } else if (stack == pcmk_cluster_cman) { - setenv("HA_cluster_type", "cman", 1); - setenv("HA_quorum_type", "cman", 1); + set_daemon_option("cluster_type", "cman"); + set_daemon_option("quorum_type", "cman"); enable_crmd_as_root(TRUE); - use_cman = TRUE; } else if (stack == pcmk_cluster_classic_ais) { - setenv("HA_cluster_type", "openais", 1); - setenv("HA_quorum_type", "pcmk", 1); + set_daemon_option("cluster_type", "openais"); + set_daemon_option("quorum_type", "pcmk"); /* Look for a service block to indicate our plugin is loaded */ top_handle = config_find_init(config); @@ -548,8 +518,8 @@ read_config(void) get_config_opt(config, local_handle, "ver", &value, "0"); if (safe_str_eq(value, "1")) { get_config_opt(config, local_handle, "use_logd", &value, "no"); - setenv("HA_use_logd", value, 1); - setenv("HA_LOGD", value, 1); + 
set_daemon_option("use_logd", value); + set_daemon_option("LOGD", value); get_config_opt(config, local_handle, "use_mgmtd", &value, "no"); enable_mgmtd(crm_is_true(value)); @@ -574,67 +544,85 @@ read_config(void) #if HAVE_CONFDB top_handle = config_find_init(config); local_handle = config_find_next(config, "logging", top_handle); +#endif + + /* =::=::= Logging =::=::= */ + get_config_opt(config, local_handle, KEY_PREFIX"debug", &logging_debug, "off"); + + const_value = daemon_option("debugfile"); + if(const_value) { + logging_to_logfile = strdup("on"); + logging_logfile = strdup(const_value); + crm_trace("Using debugfile setting from the environment: %s", logging_logfile); + + } else { + get_config_opt(config, local_handle, KEY_PREFIX"to_logfile", &logging_to_logfile, "off"); + get_config_opt(config, local_handle, KEY_PREFIX"logfile", &logging_logfile, "/var/log/pacemaker"); + } - get_config_opt(config, local_handle, "debug", &logging_debug, "off"); - get_config_opt(config, local_handle, "logfile", &logging_logfile, "/var/log/pacemaker"); - get_config_opt(config, local_handle, "to_logfile", &logging_to_logfile, "off"); - get_config_opt(config, local_handle, "to_syslog", &logging_to_syslog, "on"); - get_config_opt(config, local_handle, "syslog_facility", &logging_syslog_facility, "daemon"); + const_value = daemon_option("logfacility"); + if(const_value) { + logging_syslog_facility = strdup(const_value); + crm_trace("Using logfacility setting from the environment: %s", logging_syslog_facility); + + if(safe_str_eq(logging_syslog_facility, "none")) { + logging_to_syslog = strdup("off"); + } else { + logging_to_syslog = strdup("on"); + } + } else { + get_config_opt(config, local_handle, KEY_PREFIX"to_syslog", &logging_to_syslog, "on"); + get_config_opt(config, local_handle, KEY_PREFIX"syslog_facility", &logging_syslog_facility, "daemon"); + } + +#if HAVE_CONFDB confdb_finalize(config); #elif HAVE_CMAP - /* =::=::= Logging =::=::= */ - get_config_opt(local_handle, 
"logging.debug", &logging_debug, "off"); - get_config_opt(local_handle, "logging.logfile", &logging_logfile, "/var/log/pacemaker"); - get_config_opt(local_handle, "logging.to_logfile", &logging_to_logfile, "off"); - get_config_opt(local_handle, "logging.to_syslog", &logging_to_syslog, "on"); - get_config_opt(local_handle, "logging.syslog_facility", &logging_syslog_facility, "daemon"); - cmap_finalize(local_handle); #endif - - if (crm_is_true(logging_debug)) { - setenv("HA_debug", "1", 1); + + if(daemon_option("debug")) { + crm_trace("Using debug setting from the environment: %s", daemon_option("debug")); + if(get_crm_log_level() < LOG_DEBUG && daemon_option_enabled("pacemakerd", "debug")) { + set_crm_log_level(LOG_DEBUG); + } + + } else if (crm_is_true(logging_debug)) { + set_daemon_option("debug", "1"); if(get_crm_log_level() < LOG_DEBUG) { set_crm_log_level(LOG_DEBUG); } } else { - setenv("HA_debug", "0", 1); + set_daemon_option("debug", "0"); } if (crm_is_true(logging_to_logfile)) { if(crm_add_logfile(logging_logfile)) { - setenv("HA_debugfile", logging_logfile, 1); - setenv("HA_DEBUGLOG", logging_logfile, 1); - setenv("HA_LOGFILE", logging_logfile, 1); + /* What a cluster fsck, eventually we need to mandate /one/ */ + set_daemon_option("debugfile", logging_logfile); + set_daemon_option("DEBUGLOG", logging_logfile); + set_daemon_option("LOGFILE", logging_logfile); have_log = TRUE; } else { crm_err("Couldn't create logfile: %s", logging_logfile); } } - + if (have_log && crm_is_true(logging_to_syslog) == FALSE) { - crm_info("User configured file based logging and explicitly disabled syslog."); + qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_FALSE); free(logging_syslog_facility); - logging_syslog_facility = NULL; + logging_syslog_facility = strdup("none"); + crm_info("User configured file based logging and explicitly disabled syslog."); - } else { - if (crm_is_true(logging_to_syslog) == FALSE) { - crm_err - ("Please enable some sort of logging, either 
'to_logfile: on' or 'to_syslog: on'."); - crm_err("If you use file logging, be sure to also define a value for 'logfile'"); - } + } else if (crm_is_true(logging_to_syslog) == FALSE) { + crm_err("Please enable some sort of logging, either 'to_logfile: on' or 'to_syslog: on'."); + crm_err("If you use file logging, be sure to also define a value for 'logfile'"); } - if(logging_syslog_facility) { - setenv("HA_logfacility", logging_syslog_facility, 1); - setenv("HA_LOGFACILITY", logging_syslog_facility, 1); - } else { - unsetenv("HA_logfacility"); - unsetenv("HA_LOGFACILITY"); - } + set_daemon_option("logfacility", logging_syslog_facility); free(logging_debug); free(logging_logfile); diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c index 0bcb5c6..cfa762e 100644 --- a/mcp/pacemaker.c +++ b/mcp/pacemaker.c @@ -28,6 +28,7 @@ #include <crm/msg_xml.h> #include <crm/common/ipc.h> #include <crm/common/mainloop.h> +#include <crm/cluster.h> gboolean fatal_error = FALSE; GMainLoop *mainloop = NULL; @@ -574,11 +575,20 @@ update_node_processes(uint32_t id, const char *uname, uint32_t procs) if (uname != NULL) { if (node->uname == NULL || safe_str_eq(node->uname, uname) == FALSE) { + int lpc, len = strlen(uname); + crm_notice("%p Node %u now known as %s%s%s", node, id, uname, node->uname?node->uname:", was: ", node->uname?node->uname:""); free(node->uname); node->uname = strdup(uname); changed = TRUE; + + for(lpc = 0; lpc < len; lpc++) { + if(uname[lpc] >= 'A' && uname[lpc] <= 'Z') { + crm_warn("Node names with capitals are discouraged, consider changing '%s' to something else", uname); + break; + } + } } } else { @@ -639,20 +649,17 @@ main(int argc, char **argv) struct rlimit cores; crm_ipc_t *old_instance = NULL; qb_ipcs_service_t *ipcs = NULL; + const char *facility = daemon_option("logfacility"); -/* *INDENT-OFF* */ - /* =::=::= Default Environment =::=::= */ - setenv("HA_mcp", "true", 1); - setenv("HA_COMPRESSION", "bz2", 1); - setenv("HA_debug", "0", 1); - 
setenv("HA_logfacility", "daemon", 1); - setenv("HA_LOGFACILITY", "daemon", 1); - setenv("HA_use_logd", "off", 1); -/* *INDENT-ON* */ + set_daemon_option("mcp", "true"); + set_daemon_option("use_logd", "off"); crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); crm_set_options(NULL, "mode [options]", long_options, "Start/Stop Pacemaker\n"); + /* Restore the original facility so that read_config() does the right thing */ + set_daemon_option("logfacility", facility); + while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) diff --git a/mcp/pacemaker.h b/mcp/pacemaker.h index fdccdda..224df93 100644 --- a/mcp/pacemaker.h +++ b/mcp/pacemaker.h @@ -54,7 +54,6 @@ extern void update_process_clients(void); extern void update_process_peers(void); extern gboolean update_node_processes(uint32_t node, const char *uname, uint32_t procs); -extern char *get_local_node_name(void); extern void enable_mgmtd(gboolean enable); extern void enable_crmd_as_root(gboolean enable); diff --git a/tools/Makefile.am b/tools/Makefile.am index cdc0445..0e7b1a9 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -108,7 +108,7 @@ crm_verify_LDADD = $(top_builddir)/lib/pengine/libpe_status.la \ $(COMMONLIBS) crm_attribute_SOURCES = crm_attribute.c -crm_attribute_LDADD = $(COMMONLIBS) +crm_attribute_LDADD = $(top_builddir)/lib/cluster/libcrmcluster.la $(COMMONLIBS) crm_resource_SOURCES = crm_resource.c crm_resource_LDADD = $(top_builddir)/lib/pengine/libpe_rules.la \ diff --git a/tools/crm_attribute.c b/tools/crm_attribute.c index 658fe0b..0077c31 100644 --- a/tools/crm_attribute.c +++ b/tools/crm_attribute.c @@ -35,6 +35,7 @@ #include <crm/common/xml.h> #include <crm/common/ipc.h> #include <crm/common/util.h> +#include <crm/cluster.h> #include <crm/cib.h> #include <crm/attrd.h> @@ -230,7 +231,12 @@ main(int argc, char **argv) type = XML_CIB_TAG_CRMCONFIG; } else if (safe_str_neq(type, XML_CIB_TAG_TICKETS)) { - determine_host(the_cib, &dest_uname, &dest_node); + 
if(dest_uname == NULL) { + dest_uname = get_local_node_name(); + } + if (pcmk_ok != query_node_uuid(the_cib, dest_uname, &dest_node)) { + fprintf(stderr, "Could not map name=%s to a UUID\n", dest_uname); + } } if ((command == 'v' || command == 'D') diff --git a/tools/crm_master b/tools/crm_master index 4d85315..deefa5e 100755 --- a/tools/crm_master +++ b/tools/crm_master @@ -50,4 +50,4 @@ if [ -z "$OCF_RESOURCE_INSTANCE" ]; then exit 1 fi -crm_attribute -N `uname -n` -n master-$OCF_RESOURCE_INSTANCE $options +crm_attribute -N `crm_node -n` -n master-$OCF_RESOURCE_INSTANCE $options diff --git a/tools/crm_node.c b/tools/crm_node.c index b968abd..5c40a34 100644 --- a/tools/crm_node.c +++ b/tools/crm_node.c @@ -64,6 +64,8 @@ static struct crm_option long_options[] = { #endif {"-spacer-", 1, 0, '-', "\nCommands:"}, + {"name", 0, 0, 'n', "\tDisplay the name used by the cluster for this node"}, + {"name-for-id", 1, 0, 'N', "\tDisplay the name used by the cluster for the node with the specified id"}, {"epoch", 0, 0, 'e', "\tDisplay the epoch during which this node joined the cluster"}, {"quorum", 0, 0, 'q', "\tDisplay a 1 if our partition has quorum, 0 if not"}, {"list", 0, 0, 'l', "\tDisplay all known members (past and present) of this cluster (Not available for heartbeat clusters)"}, @@ -673,6 +675,7 @@ main(int argc, char **argv) { int flag = 0; int argerr = 0; + uint32_t nodeid = 0; gboolean force_flag = FALSE; gboolean dangerous_cmd = FALSE; enum cluster_type_e try_stack = pcmk_cluster_unknown; @@ -719,11 +722,16 @@ main(int argc, char **argv) command = flag; target_uname = optarg; break; + case 'N': + command = flag; + nodeid = crm_parse_int(optarg, NULL); + break; case 'p': case 'e': case 'q': case 'i': case 'l': + case 'n': command = flag; break; default: @@ -740,6 +748,15 @@ main(int argc, char **argv) crm_help('?', EX_USAGE); } + if(command == 'n') { + fprintf(stdout, "%s\n", get_local_node_name()); + exit(0); + + } else if(command == 'N') { + fprintf(stdout, 
"%s\n", get_node_name(nodeid)); + exit(0); + } + if (dangerous_cmd && force_flag == FALSE) { fprintf(stderr, "The supplied command is considered dangerous." " To prevent accidental destruction of the cluster," diff --git a/tools/crm_simulate.c b/tools/crm_simulate.c index 272e7bf..278ec00 100644 --- a/tools/crm_simulate.c +++ b/tools/crm_simulate.c @@ -105,7 +105,7 @@ create_node_entry(cib_t * cib_conn, char *node) cib_conn->cmds->create(cib_conn, XML_CIB_TAG_NODES, cib_object, cib_sync_call | cib_scope_local); /* Not bothering with subsequent query to see if it exists, - we'll bomb out later in the call to determine_host... */ + we'll bomb out later in the call to query_node_uuid()... */ free_xml(cib_object); } @@ -138,8 +138,9 @@ inject_node_state(cib_t * cib_conn, char *node) if (rc == -ENXIO) { char *uuid = NULL; + query_node_uuid(cib_conn, node, &uuid); + cib_object = create_xml_node(NULL, XML_CIB_TAG_STATE); - determine_host(cib_conn, &node, &uuid); crm_xml_add(cib_object, XML_ATTR_UUID, uuid); crm_xml_add(cib_object, XML_ATTR_UNAME, node); cib_conn->cmds->create(cib_conn, XML_CIB_TAG_STATUS, cib_object,