diff --git a/src/include/pool.h b/src/include/pool.h index 2950d290..d4ac575e 100644 --- a/src/include/pool.h +++ b/src/include/pool.h @@ -426,7 +426,7 @@ typedef enum #define REQ_DETAIL_WATCHDOG 0x00000002 /* failover req from watchdog */ #define REQ_DETAIL_CONFIRMED 0x00000004 /* failover req that does not * require majority vote */ -#define REQ_DETAIL_UPDATE 0x00000008 /* failover req is just and update +#define REQ_DETAIL_UPDATE 0x00000008 /* failover req is just an update * node status request */ typedef struct @@ -452,6 +452,16 @@ typedef struct /* greater than 0 if follow primary command or detach_false_primary in * execution */ bool follow_primary_count; + bool follow_primary_lock_pending; /* watchdog process can't wait + * for follow_primary lock acquisition + * in case it is held at the time of + * request. + * This flag indicates that lock was requested + * by watchdog coordinator and next contender should + * wait for the coordinator to release the lock + */ + bool follow_primary_lock_held_remotely; /* true when lock is held by + watchdog coordinator*/ bool follow_primary_ongoing; /* true if follow primary command is ongoing */ } POOL_REQUEST_INFO; @@ -631,8 +641,8 @@ extern POOL_NODE_STATUS * verify_backend_node_status(POOL_CONNECTION_POOL_SLOT * extern POOL_NODE_STATUS * pool_get_node_status(void); extern void pool_set_backend_status_changed_time(int backend_id); extern int get_next_main_node(void); -extern bool pool_acquire_follow_primary_lock(bool block); -extern void pool_release_follow_primary_lock(void); +extern bool pool_acquire_follow_primary_lock(bool block, bool remote_request); +extern void pool_release_follow_primary_lock(bool remote_request); /* strlcpy.c */ #ifndef HAVE_STRLCPY diff --git a/src/include/watchdog/wd_internal_commands.h b/src/include/watchdog/wd_internal_commands.h index 4f0b37a2..5093eecc 100644 --- a/src/include/watchdog/wd_internal_commands.h +++ b/src/include/watchdog/wd_internal_commands.h @@ -31,6 +31,19 @@ #include 
"watchdog/wd_ipc_conn.h" #include "watchdog/wd_commands.h" +/* + * These locks can only be acquired by + * coordinator watchdog node on standby + * watchdog node. + */ +typedef enum WD_LOCK_STANDBY_TYPE +{ + WD_INVALID_LOCK, + /* currently we have only one lock */ + WD_FOLLOW_PRIMARY_LOCK +}WD_LOCK_STANDBY_TYPE; + + extern WdCommandResult wd_start_recovery(void); extern WdCommandResult wd_end_recovery(void); extern WDFailoverCMDResults wd_send_failback_request(int node_id, unsigned char flags); @@ -60,4 +73,8 @@ extern void set_watchdog_node_escalated(void); extern void reset_watchdog_node_escalated(void); extern bool get_watchdog_node_escalation_state(void); extern size_t wd_ipc_get_shared_mem_size(void); + +extern WdCommandResult wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type); +extern WdCommandResult wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type); + #endif /* WD_INTERNAL_COMMANDS_H */ diff --git a/src/include/watchdog/wd_ipc_defines.h b/src/include/watchdog/wd_ipc_defines.h index e66c4551..16ff5a41 100644 --- a/src/include/watchdog/wd_ipc_defines.h +++ b/src/include/watchdog/wd_ipc_defines.h @@ -75,6 +75,7 @@ typedef enum WDValueDataType #define WD_COMMAND_REELECT_LEADER "REELECT_LEADER" #define WD_COMMAND_SHUTDOWN_CLUSTER "SHUTDOWN_CLUSTER" #define WD_COMMAND_RELOAD_CONFIG_CLUSTER "RELOAD_CONFIG_CLUSTER" +#define WD_COMMAND_LOCK_ON_STANDBY "APPLY_LOCK_ON_STANDBY" #define WD_FUNCTION_START_RECOVERY "START_RECOVERY" diff --git a/src/main/main.c b/src/main/main.c index 2702bf28..575bee4a 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -77,6 +77,7 @@ int myargc; char **myargv; int assert_enabled = 0; char *pool_key = NULL; + int main(int argc, char **argv) { diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c index d216cb55..869c0a86 100644 --- a/src/main/pgpool_main.c +++ b/src/main/pgpool_main.c @@ -1305,6 +1305,13 @@ sigusr1_interrupt_processor(void) (errmsg("we have joined the watchdog cluster as STANDBY node"), errdetail("syncing the backend 
states from the LEADER watchdog node"))); sync_backend_from_watchdog(); + /* + * we also want to release the follow_primary lock if it was held + * by the remote node. + * because the change of watchdog coordinator would lead to forever stuck + * in the locked state + */ + pool_release_follow_primary_lock(true); } } if (user1SignalSlot->signalFlags[SIG_FAILOVER_INTERRUPT]) @@ -3223,9 +3230,9 @@ find_primary_node(void) pfree(password); /* Verify backend status */ - pool_acquire_follow_primary_lock(true); + pool_acquire_follow_primary_lock(true, false); status = verify_backend_node_status(slots); - pool_release_follow_primary_lock(); + pool_release_follow_primary_lock(false); for (i = 0; i < NUM_BACKENDS; i++) { @@ -3363,7 +3370,16 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary) #endif SetProcessGlobalVariables(PT_FOLLOWCHILD); - pool_acquire_follow_primary_lock(true); + /* + * when the watchdog is enabled, we would come here + * only on the coordinator node. + * so before acquiring the local lock, Lock all the + * standby nodes so that they should stop false primary + * detection until we are finished with the follow primary + * command. + */ + wd_lock_standby(WD_FOLLOW_PRIMARY_LOCK); + pool_acquire_follow_primary_lock(true, false); Req_info->follow_primary_ongoing = true; ereport(LOG, (errmsg("start triggering follow command."))); @@ -3377,7 +3393,9 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary) old_main_node, new_primary, old_primary); } Req_info->follow_primary_ongoing = false; - pool_release_follow_primary_lock(); + pool_release_follow_primary_lock(false); + /* inform standby watchdog nodes to release the lock as well */ + wd_unlock_standby(WD_FOLLOW_PRIMARY_LOCK); exit(0); } else if (pid == -1) @@ -4318,9 +4336,11 @@ pool_set_backend_status_changed_time(int backend_id) * they are conflicting each other. If argument "block" is true, this * function will not return until it succeeds in acquiring the lock. 
This * function returns true if succeeded in acquiring the lock. + * + * first arg:block is ignored when remote_request is set */ bool -pool_acquire_follow_primary_lock(bool block) +pool_acquire_follow_primary_lock(bool block, bool remote_request) { pool_sigset_t oldmask; volatile int follow_primary_count; @@ -4338,6 +4358,29 @@ pool_acquire_follow_primary_lock(bool block) (errmsg("pool_acquire_follow_primary_lock: lock was not held by anyone"))); break; } + else if (follow_primary_count > 0 && remote_request) + { + if (Req_info->follow_primary_lock_held_remotely) + { + /* The lock was already held by remote node and we only + * support one remote lock + */ + ereport(LOG, + (errmsg("pool_acquire_follow_primary_lock: received remote locking request while lock is already held by the remote node"))); + + } + else + { + /* set the flag that watchdog has requested the lock */ + Req_info->follow_primary_lock_pending = true; + } + pool_semaphore_unlock(FOLLOW_PRIMARY_SEM); + POOL_SETMASK(&oldmask); + /* return and inform that the lock was held by someone */ + ereport(DEBUG1, + (errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d", follow_primary_count))); + return false; + } else if (follow_primary_count > 0 && !block) { @@ -4357,6 +4400,8 @@ pool_acquire_follow_primary_lock(bool block) } /* acquire lock */ + Req_info->follow_primary_lock_held_remotely = remote_request; + Req_info->follow_primary_count = 1; pool_semaphore_unlock(FOLLOW_PRIMARY_SEM); POOL_SETMASK(&oldmask); @@ -4371,13 +4416,71 @@ pool_acquire_follow_primary_lock(bool block) * Release lock on follow primary command execution. 
*/ void -pool_release_follow_primary_lock(void) +pool_release_follow_primary_lock(bool remote_request) { pool_sigset_t oldmask; POOL_SETMASK2(&BlockSig, &oldmask); pool_semaphore_lock(FOLLOW_PRIMARY_SEM); - Req_info->follow_primary_count = 0; + if (remote_request) + { + if (Req_info->follow_primary_lock_held_remotely) + { + /* remote request can only release locks held by remote nodes */ + Req_info->follow_primary_count = 0; + Req_info->follow_primary_lock_held_remotely = false; + ereport(DEBUG1, + (errmsg("pool_release_follow_primary_lock released the remote lock"))); + } + else if (Req_info->follow_primary_count) + { + /* + * we have received the release lock request from remote + * but the lock is not held by remote node. + * Just ignore the request + */ + ereport(DEBUG1, + (errmsg("pool_release_follow_primary_lock is not releasing the lock since it was not held by remote node"))); + } + /* + * Silently ignore, if we received the release request from remote while no lock was held. + * Also clear the pending lock request, As we only support single remote lock + */ + Req_info->follow_primary_lock_pending = false; + + } + else /*local request */ + { + /* + * if we have a pending lock request from watchdog + * do not remove the actual lock, Just clear the pending flag + */ + if (Req_info->follow_primary_lock_pending) + { + Req_info->follow_primary_lock_held_remotely = true; + Req_info->follow_primary_count = 1; + /* also clear the pending lock flag */ + Req_info->follow_primary_lock_pending = false; + ereport(DEBUG1, + (errmsg("pool_release_follow_primary_lock is not releasing the lock and shifting it to coordinator watchdog node"))); + } + else + { + if (Req_info->follow_primary_lock_held_remotely) + { + /* + * Ideally this should not happen. + * yet if for some reason our local node is trying to release a lock + * that is held by remote node. 
Just produce a LOG message and release + * the lock + */ + ereport(LOG, + (errmsg("pool_release_follow_primary_lock is releasing the remote lock by local request"))); + } + Req_info->follow_primary_count = 0; + Req_info->follow_primary_lock_held_remotely = false; + } + } pool_semaphore_unlock(FOLLOW_PRIMARY_SEM); POOL_SETMASK(&oldmask); diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c index fb15ceb0..61b94446 100644 --- a/src/streaming_replication/pool_worker_child.c +++ b/src/streaming_replication/pool_worker_child.c @@ -196,7 +196,7 @@ do_worker_child(void) */ follow_primary_lock_acquired = false; - if (pool_acquire_follow_primary_lock(false) == true) + if (pool_acquire_follow_primary_lock(false, false) == true) { follow_primary_lock_acquired = true; @@ -227,17 +227,11 @@ do_worker_child(void) /* * If detach_false_primary is enabled, send * degenerate request to detach invalid node. - * This should only happen on leader watchdog node - * and quorum exists if watchdog is enabled. Other - * nodes will be informed by the leader node later - * on. 
*/ - if ((pool_config->detach_false_primary && !pool_config->use_watchdog) || - (pool_config->detach_false_primary && pool_config->use_watchdog && - wd_internal_get_watchdog_quorum_state() >= 0 && wd_status == WD_COORDINATOR)) + if (pool_config->detach_false_primary) { n = i; - degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED); + degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER); } } } @@ -245,7 +239,7 @@ do_worker_child(void) PG_CATCH(); { discard_persistent_connection(); - pool_release_follow_primary_lock(); + pool_release_follow_primary_lock(false); follow_primary_lock_acquired = false; sleep(pool_config->sr_check_period); PG_RE_THROW(); @@ -256,7 +250,7 @@ do_worker_child(void) discard_persistent_connection(); if (follow_primary_lock_acquired) { - pool_release_follow_primary_lock(); + pool_release_follow_primary_lock(false); follow_primary_lock_acquired = false; } } @@ -656,6 +650,6 @@ static void sr_check_will_die(int code, Datum arg) { if (follow_primary_lock_acquired) - pool_release_follow_primary_lock(); + pool_release_follow_primary_lock(false); } diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c index e167bd41..6adea428 100644 --- a/src/watchdog/watchdog.c +++ b/src/watchdog/watchdog.c @@ -2102,6 +2102,16 @@ process_IPC_execute_cluster_command(WDCommandData * ipcCommand) ereport(LOG, (errmsg("Watchdog has received reload config cluster command from IPC channel"))); } + else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0) + { + ereport(LOG, + (errmsg("Watchdog has received 'LOCK ON STANDBY' command from IPC channel"))); + if (get_local_node_state() != WD_COORDINATOR) + { + ereport(LOG, + (errmsg("'LOCK ON STANDBY' command can only be processed on coordinator node"))); + } + } else { ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext, @@ -3011,7 +3021,6 @@ static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCo errdetail("failed to get failover state 
from json data in command packet"))); res = FAILOVER_RES_INVALID_FUNCTION; } - } else { @@ -4092,6 +4101,73 @@ wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt) (errmsg("processing reload config command from remote node \"%s\"", wdNode->nodeName))); pool_signal_parent(SIGHUP); } + else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0) + { + int i; + int lock_type = -1; + char *operation = NULL; + if (get_local_node_state() == WD_STANDBY && wdNode->state == WD_COORDINATOR) + { + if (nArgs == 2) + { + for ( i =0; i < nArgs; i++) + { + if (strcmp(wdExecCommandArg[i].arg_name, "StandbyLockType") == 0) + { + lock_type = atoi(wdExecCommandArg[i].arg_value); + } + else if (strcmp(wdExecCommandArg[i].arg_name, "LockingOperation") == 0) + { + operation = wdExecCommandArg[i].arg_value; + } + else + ereport(LOG, + (errmsg("unsupported argument \"%s\" in 'LOCK ON STANDBY' from remote node \"%s\"", wdExecCommandArg[i].arg_name, wdNode->nodeName))); + } + if (lock_type < 0 || operation == NULL) + { + ereport(LOG, + (errmsg("missing argument in 'LOCK ON STANDBY' from remote node \"%s\"", wdNode->nodeName), + errdetail("command ignored"))); + } + else if (lock_type == WD_FOLLOW_PRIMARY_LOCK) + { + ereport(LOG, + (errmsg("processing follow primary locking[%s] request from remote node \"%s\"", operation,wdNode->nodeName))); + + if (strcasecmp("acquire", operation) == 0) + pool_acquire_follow_primary_lock(false, true); + else if (strcasecmp("release", operation) == 0) + pool_release_follow_primary_lock(true); + else + ereport(LOG, + (errmsg("invalid locking operation[%s] in 'LOCK ON STANDBY' from remote node \"%s\"", operation, wdNode->nodeName), + errdetail("command ignored"))); + } + else + ereport(LOG, + (errmsg("unsupported lock-type:%d in 'LOCK ON STANDBY' from remote node \"%s\"", lock_type, wdNode->nodeName))); + + } + else + { + ereport(LOG, + (errmsg("invalid arguments in 'LOCK ON STANDBY' command from remote node \"%s\"", 
wdNode->nodeName))); + } + } + else if (get_local_node_state() != WD_STANDBY) + { + ereport(LOG, + (errmsg("invalid node state to execute 'LOCK ON STANDBY' command"))); + + } + else + { + ereport(LOG, + (errmsg("'LOCK ON STANDBY' command can only be accepted from the coordinator watchdog node"), + errdetail("ignoring..."))); + } + } else { ereport(WARNING, diff --git a/src/watchdog/wd_internal_commands.c b/src/watchdog/wd_internal_commands.c index d628ead4..d2eb16fd 100644 --- a/src/watchdog/wd_internal_commands.c +++ b/src/watchdog/wd_internal_commands.c @@ -72,6 +72,8 @@ static WDFailoverCMDResults wd_get_failover_result_from_data(WDIPCCmdResult * re static WDFailoverCMDResults wd_issue_failover_command(char *func_name, int *node_id_set, int count, unsigned char flags); +static WdCommandResult wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type, + bool acquire); void wd_ipc_initialize_data(void) @@ -552,3 +554,37 @@ wd_internal_get_watchdog_local_node_state(void) { return get_watchdog_local_node_state(pool_config->wd_authkey); } + +static WdCommandResult +wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type, bool acquire) +{ + WDExecCommandArg wdExecCommandArg[2]; + + strncpy(wdExecCommandArg[0].arg_name, "StandbyLockType", sizeof(wdExecCommandArg[0].arg_name) - 1); + snprintf(wdExecCommandArg[0].arg_value, sizeof(wdExecCommandArg[0].arg_value) - 1, "%d",lock_type); + + strncpy(wdExecCommandArg[1].arg_name, "LockingOperation", sizeof(wdExecCommandArg[1].arg_name) - 1); + snprintf(wdExecCommandArg[1].arg_value, sizeof(wdExecCommandArg[1].arg_value) - 1, + "%s",acquire?"acquire":"release"); + + ereport(DEBUG1, + (errmsg("sending standby locking request to watchdog"))); + + return wd_execute_cluster_command(WD_COMMAND_LOCK_ON_STANDBY, 2, wdExecCommandArg); +} + +WdCommandResult +wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type) +{ + if (pool_config->use_watchdog) + return wd_send_locking_command(lock_type, true); + return COMMAND_OK; +} + +WdCommandResult 
+wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type) +{ + if (pool_config->use_watchdog) + return wd_send_locking_command(lock_type, false); + return COMMAND_OK; +}