diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c
index 88046f41..25257b71 100644
--- a/src/watchdog/watchdog.c
+++ b/src/watchdog/watchdog.c
@@ -378,7 +378,8 @@ typedef struct wd_cluster
 	int			network_monitor_sock;
 	bool		clusterInitialized;
 	bool		ipc_auth_needed;
-	int			current_failover_id;
+	int			current_failover_id;
+	int			failover_command_timeout;
 	struct timeval last_bcast_srv_msg_time;	/* timestamp when last packet was
 											 * broadcasted by the local node */
 	char		last_bcast_srv_msg;
@@ -608,7 +609,7 @@ static int	revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBE
 static int	restore_cluster_membership_of_node(WatchdogNode* wdNode);
 static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
 static void wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt);
-
+static void update_failover_timeout(WatchdogNode * wdNode, POOL_CONFIG *pool_config);
 /* global variables */
 wd_cluster	g_cluster;
 struct timeval g_tm_set_time;
@@ -2393,7 +2394,7 @@ service_expired_failovers(void)
 
 		if (failoverObj)
 		{
-			if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= FAILOVER_COMMAND_FINISH_TIMEOUT)
+			if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= g_cluster.failover_command_timeout)
 			{
 				failovers_to_del = lappend(failovers_to_del, failoverObj);
 				ereport(DEBUG1,
@@ -4243,6 +4244,7 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
 				if (standby_config)
 				{
 					verify_pool_configurations(wdNode, standby_config);
+					update_failover_timeout(wdNode, standby_config);
 				}
 			}
 		}
@@ -6019,6 +6021,8 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 			send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
 			set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
 			update_missed_beacon_count(NULL,true);
+			update_failover_timeout(g_cluster.localNode, pool_config);
+
 			ereport(LOG,
 					(errmsg("I am announcing my self as leader/coordinator watchdog node")));
 			if (message_level_is_interesting(DEBUG2))
@@ -8162,6 +8166,43 @@ static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear)
 	}
 }
 
+static void
+update_failover_timeout(WatchdogNode * wdNode, POOL_CONFIG *pool_config)
+{
+	int			failover_command_timeout;
+	if (get_local_node_state() != WD_COORDINATOR)
+		return;
+
+	failover_command_timeout = pool_config->health_check_period + (pool_config->health_check_retry_delay * pool_config->health_check_max_retries);
+
+	if (pool_config->health_check_params)
+	{
+		int			i;
+		for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+		{
+			int			pn_failover_command_timeout = pool_config->health_check_params[i].health_check_period +
+				(pool_config->health_check_params[i].health_check_retry_delay *
+				 pool_config->health_check_params[i].health_check_max_retries);
+
+			if (failover_command_timeout < pn_failover_command_timeout)
+				failover_command_timeout = pn_failover_command_timeout;
+		}
+	}
+
+	if (g_cluster.localNode == wdNode)
+	{
+		/* Reset */
+		g_cluster.failover_command_timeout = failover_command_timeout;
+	}
+	else if (g_cluster.failover_command_timeout < failover_command_timeout)
+		g_cluster.failover_command_timeout = failover_command_timeout;
+
+	if (g_cluster.failover_command_timeout < FAILOVER_COMMAND_FINISH_TIMEOUT)
+		g_cluster.failover_command_timeout = FAILOVER_COMMAND_FINISH_TIMEOUT;
+
+	ereport(DEBUG1, (errmsg("setting failover command timeout to %d", g_cluster.failover_command_timeout)));
+}
+
 #ifdef WATCHDOG_DEBUG
 /*
  * Node down request file. In the file, each line consists of watchdog
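
A note on the arithmetic above: the worst-case detection time for a single
backend is one full health check period plus one retry delay for each allowed
retry. update_failover_timeout() keeps the maximum of that value across the
global settings, the per-node health check overrides, and the configuration
reported by each standby, and never lets the result drop below
FAILOVER_COMMAND_FINISH_TIMEOUT. The standalone sketch below reproduces just
that merge with hypothetical numbers; the 15-second floor is assumed to match
the existing constant, and nothing in it is part of the patch:

    /* sketch: effective failover command timeout (illustration only) */
    #include <stdio.h>

    #define FAILOVER_COMMAND_FINISH_TIMEOUT 15	/* assumed floor, in seconds */

    typedef struct
    {
        int		period;			/* health_check_period */
        int		max_retries;	/* health_check_max_retries */
        int		retry_delay;	/* health_check_retry_delay */
    } hc_params;

    /* worst case for one backend: a full period plus every retry and its delay */
    static int
    worst_case(hc_params p)
    {
        return p.period + (p.retry_delay * p.max_retries);
    }

    int
    main(void)
    {
        /* hypothetical per-backend overrides on the coordinator */
        hc_params	local[] = {{30, 3, 10}, {10, 20, 5}};
        int			remote = 60;	/* hypothetical timeout derived from a standby's config */
        int			timeout = 0;
        int			i;

        for (i = 0; i < 2; i++)
            if (timeout < worst_case(local[i]))
                timeout = worst_case(local[i]);

        if (timeout < remote)	/* standby configurations can only raise the timeout */
            timeout = remote;
        if (timeout < FAILOVER_COMMAND_FINISH_TIMEOUT)
            timeout = FAILOVER_COMMAND_FINISH_TIMEOUT;

        printf("effective failover command timeout: %d secs\n", timeout);	/* 110 */
        return 0;
    }
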
diff --git a/src/watchdog/wd_json_data.c b/src/watchdog/wd_json_data.c
index 83842a75..1e2fe8a9 100644
--- a/src/watchdog/wd_json_data.c
+++ b/src/watchdog/wd_json_data.c
@@ -141,6 +141,43 @@ get_pool_config_from_json(char *json_data, int data_len)
 		strncpy(config->backend_desc->backend_info[i].backend_hostname, ptr, sizeof(config->backend_desc->backend_info[i].backend_hostname) - 1);
 	}
 
+	value = json_get_value_for_key(root, "health_check_params");
+	/* We don't get separate health check params from older versions,
+	 * so be kind if the JSON does not contain one
+	 */
+	if (value != NULL && value->type == json_array)
+	{
+		int			health_check_params_count = value->u.array.length;
+		if (health_check_params_count != config->backend_desc->num_backends)
+		{
+			ereport(LOG,
+					(errmsg("unexpected number of health check parameters received"),
+					 errdetail("expected:%d got %d", config->backend_desc->num_backends, health_check_params_count)));
+		}
+		config->health_check_params = palloc0(sizeof(HealthCheckParams) * config->backend_desc->num_backends);
+
+		if (health_check_params_count > config->backend_desc->num_backends)
+			health_check_params_count = config->backend_desc->num_backends;
+
+		for (i = 0; i < health_check_params_count; i++)
+		{
+			json_value *arr_value = value->u.array.values[i];
+
+			if (json_get_int_value_for_key(arr_value, "health_check_timeout", &config->health_check_params[i].health_check_timeout))
+				config->health_check_params[i].health_check_timeout = 0;
+			if (json_get_int_value_for_key(arr_value, "health_check_period", &config->health_check_params[i].health_check_period))
+				config->health_check_params[i].health_check_period = 0;
+			if (json_get_int_value_for_key(arr_value, "health_check_max_retries", &config->health_check_params[i].health_check_max_retries))
+				config->health_check_params[i].health_check_max_retries = 0;
+			if (json_get_int_value_for_key(arr_value, "health_check_retry_delay", &config->health_check_params[i].health_check_retry_delay))
+				config->health_check_params[i].health_check_retry_delay = 0;
+			if (json_get_int_value_for_key(arr_value, "connect_timeout", &config->health_check_params[i].connect_timeout))
+				config->health_check_params[i].connect_timeout = 0;
+		}
+	}
+	else
+		config->health_check_params = NULL;
+
 	/* wd_nodes array */
 	value = json_get_value_for_key(root, "wd_nodes");
 	if (value == NULL || value->type != json_array)
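
The parsing above is deliberately forgiving. An older peer sends no
health_check_params key at all, in which case the pointer stays NULL and
update_failover_timeout() falls back to the global health check settings. A
count mismatch is only logged: the incoming length is clamped to num_backends
before the copy loop, and any tail entries keep the zeroes provided by
palloc0. A minimal standalone sketch of that clamp-and-zero-fill pattern,
with plain calloc standing in for palloc0 and hypothetical counts:

    /* sketch: tolerate a peer that sent fewer entries than we have backends */
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct
    {
        int		health_check_period;
    } hc_params;

    int
    main(void)
    {
        int			num_backends = 4;	/* what the local node expects */
        int			received = 2;		/* what the peer actually sent */
        hc_params  *p = calloc(num_backends, sizeof(hc_params));
        int			i;

        if (received != num_backends)
            fprintf(stderr, "expected:%d got %d\n", num_backends, received);
        if (received > num_backends)
            received = num_backends;	/* never write past the allocation */

        for (i = 0; i < received; i++)
            p[i].health_check_period = 30;	/* stand-in for the parsed JSON value */

        /* backends 2 and 3 keep the zeroed defaults from calloc */
        for (i = 0; i < num_backends; i++)
            printf("backend %d: health_check_period=%d\n", i, p[i].health_check_period);

        free(p);
        return 0;
    }
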
@@ -220,6 +257,23 @@ get_pool_config_json(void)
 	jw_put_bool(jNode, "failover_require_consensus", pool_config->failover_require_consensus);
 	jw_put_bool(jNode, "allow_multiple_failover_requests_from_node", pool_config->allow_multiple_failover_requests_from_node);
 
+	/* Array of health_check params
+	 * We transport num_backends at max
+	 */
+	jw_start_array(jNode, "health_check_params");
+	for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+	{
+		jw_start_object(jNode, "HealthCheckParams");
+
+		jw_put_int(jNode, "health_check_timeout", pool_config->health_check_params[i].health_check_timeout);
+		jw_put_int(jNode, "health_check_period", pool_config->health_check_params[i].health_check_period);
+		jw_put_int(jNode, "health_check_max_retries", pool_config->health_check_params[i].health_check_max_retries);
+		jw_put_int(jNode, "health_check_retry_delay", pool_config->health_check_params[i].health_check_retry_delay);
+		jw_put_int(jNode, "connect_timeout", pool_config->health_check_params[i].connect_timeout);
+		jw_end_element(jNode);
+	}
+	jw_end_element(jNode);		/* health_check_params array end */
+
 	/* Array of backends */
 	jw_start_array(jNode, "backend_desc");
 	for (i = 0; i < pool_config->backend_desc->num_backends; i++)
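
For reference, the writer above should put something of the following shape on
the wire; the layout is inferred from the jw_* calls, and the key order,
whitespace, and values are illustrative only:

    "health_check_params": [
        {"health_check_timeout": 20, "health_check_period": 30,
         "health_check_max_retries": 3, "health_check_retry_delay": 10,
         "connect_timeout": 10000},
        {"health_check_timeout": 20, "health_check_period": 30,
         "health_check_max_retries": 3, "health_check_retry_delay": 10,
         "connect_timeout": 10000}
    ]

A sender running this version always emits exactly num_backends entries, so
the count clamp in get_pool_config_from_json() only comes into play when the
two nodes disagree on the number of configured backends.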