View Issue Details

IDProjectCategoryView StatusLast Update
0000455Pgpool-IIBugpublic2019-01-21 13:03
ReporternagataAssigned Tonagata 
PrioritynormalSeverityminorReproducibilitysometimes
Status closedResolutionfixed 
Product Version4.0.2 
Target VersionFixed in Version 
Summary0000455: watchdog lifecheck process has segfalut in query mode
DescriptionWhen wd_lifecheck_method is 'query', lifecheck process has a segmentation fault. This happens not always, but I can reproduce this about in five minutes from start up of the cluster with the configuration produced by watchdog_setup.

In my analysis, this is due to a new feature of 4.0, AES password support in wd_lifecheck_password. This is performed by get_pgpool_config_user_password(), but this is not multi-thread safe function since this uses pstrdup and pfree. And unfortunately, 'query' mode uses pthread. pstrdup could allocate the same address in different threads, and this led double-free when pfree was called.

To fix, we have to prohibit to use get_pgpool_config_user_password() in query mode, that is, AES password can not be supported in wd_lifecheck_password. Scanning pool_passwd when wd_lifecheck_passwrd is empty can not be supported, either. The patch for wd_lifecheck.c is attached, but additionally some documentation fix will be necessary.

We might be able to fix get_pgpool_config_user_password() to be multi-thread safe, or fix 'query' mode to not use pthread, but I don't think it is worth, because 'query' mode is deprecated. (Maybe, this should have been removed when 4.0 was released....)
Steps To ReproduceThis happens not always, but I can reproduce this about in five minutes from start up of the cluster with the configuration produced by watchdog_setup with wd_lifecheck_method = 'query'.
TagsNo tags attached.

Activities

nagata

2019-01-10 11:17

developer  

wd_lifecheck_segfault.patch (1,337 bytes)
diff --git a/src/watchdog/wd_lifecheck.c b/src/watchdog/wd_lifecheck.c
index a9790d48..1010e907 100644
--- a/src/watchdog/wd_lifecheck.c
+++ b/src/watchdog/wd_lifecheck.c
@@ -900,9 +900,25 @@ create_conn(char *hostname, int port)
 {
 	static char conninfo[1024];
 	PGconn	   *conn;
-	char	   *password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
-														   pool_config->wd_lifecheck_password);
+	char	   *password = pool_config->wd_lifecheck_password;
 
+	/* We can not use get_pgpool_config_user_password here because this is not multi-thread safe. */
+	if (strlen(pool_config->wd_lifecheck_password) != 0)
+	{
+		PasswordType passwordType = get_password_type(pool_config->wd_lifecheck_password);
+
+		if (passwordType == PASSWORD_TYPE_AES)
+		{
+			ereport(WARNING,
+					(errmsg("AES password is not supported wd_lifecheck_password")));
+			return NULL;
+		}
+		else if (passwordType == PASSWORD_TYPE_TEXT_PREFIXED)
+		{
+			if (password)
+				password = (char*)(password + strlen(PASSWORD_TEXT_PREFIX));
+		}
+	}
 
 	if (strlen(pool_config->wd_lifecheck_dbname) == 0)
 	{
@@ -928,9 +944,6 @@ create_conn(char *hostname, int port)
 			 pool_config->wd_interval / 2 + 1);
 	conn = PQconnectdb(conninfo);
 
-	if (password)
-		pfree(password);
-
 	if (PQstatus(conn) != CONNECTION_OK)
 	{
 		ereport(DEBUG1,

pengbo

2019-01-10 14:01

developer   ~0002317

Thank you.

We will confirm this issue and test your patch.

pengbo

2019-01-15 14:48

developer   ~0002319

A patch for this issue is created by Usama.
If possible, could you test this patch?

See more details:
https://www.pgpool.net/pipermail/pgpool-hackers/2019-January/003220.html

crash_fix.diff (4,127 bytes)
diff --git a/src/watchdog/wd_lifecheck.c b/src/watchdog/wd_lifecheck.c
index a9790d48..ac6dff84 100644
--- a/src/watchdog/wd_lifecheck.c
+++ b/src/watchdog/wd_lifecheck.c
@@ -62,7 +62,8 @@ typedef struct
 {
 	LifeCheckNode *lifeCheckNode;
 	int			retry;			/* retry times (not used?) */
-}			WdPgpoolThreadArg;
+	char		*password;
+}WdPgpoolThreadArg;
 
 typedef struct WdUpstreamConnectionData
 {
@@ -80,7 +81,7 @@ static bool wd_ping_all_server(void);
 static WdUpstreamConnectionData * wd_get_server_from_pid(pid_t pid);
 
 static void *thread_ping_pgpool(void *arg);
-static PGconn *create_conn(char *hostname, int port);
+static PGconn *create_conn(char *hostname, int port, char *password);
 
 static pid_t lifecheck_main(void);
 static void check_pgpool_status(void);
@@ -103,7 +104,7 @@ static const char *lifecheck_child_name(pid_t pid);
 static void reaper(void);
 static int	is_wd_lifecheck_ready(void);
 static int	wd_lifecheck(void);
-static int	wd_ping_pgpool(LifeCheckNode * node);
+static int	wd_ping_pgpool(LifeCheckNode * node, char *password);
 static pid_t fork_lifecheck_child(void);
 
 
@@ -644,11 +645,13 @@ is_wd_lifecheck_ready(void)
 	for (i = 0; i < gslifeCheckCluster->nodeCount; i++)
 	{
 		LifeCheckNode *node = &gslifeCheckCluster->lifeCheckNodes[i];
+		char	   *password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
+															   pool_config->wd_lifecheck_password);
 
 		/* query mode */
 		if (pool_config->wd_lifecheck_method == LIFECHECK_BY_QUERY)
 		{
-			if (wd_ping_pgpool(node) == WD_NG)
+			if (wd_ping_pgpool(node, password) == WD_NG)
 			{
 				ereport(DEBUG1,
 						(errmsg("watchdog checking life check is ready"),
@@ -656,6 +659,8 @@ is_wd_lifecheck_ready(void)
 								   i, node->hostName, node->pgpoolPort)));
 				rtn = WD_NG;
 			}
+			if (password)
+				pfree(password);
 		}
 		/* heartbeat mode */
 		else if (pool_config->wd_lifecheck_method == LIFECHECK_BY_HB)
@@ -803,6 +808,8 @@ check_pgpool_status_by_query(void)
 	LifeCheckNode *node;
 	int			rc,
 				i;
+	char	   *password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
+														   pool_config->wd_lifecheck_password);
 
 	/* thread init */
 	pthread_attr_init(&attr);
@@ -813,6 +820,7 @@ check_pgpool_status_by_query(void)
 	{
 		node = &gslifeCheckCluster->lifeCheckNodes[i];
 		thread_arg[i].lifeCheckNode = node;
+		thread_arg[i].password = password;
 		rc = watchdog_thread_create(&thread[i], &attr, thread_ping_pgpool, (void *) &thread_arg[i]);
 	}
 
@@ -874,6 +882,8 @@ check_pgpool_status_by_query(void)
 					inform_node_status(node, "unable to connect to node");
 			}
 		}
+		if (password)
+			pfree(password);
 	}
 }
 
@@ -887,7 +897,7 @@ thread_ping_pgpool(void *arg)
 	uintptr_t	rtn;
 	WdPgpoolThreadArg *thread_arg = (WdPgpoolThreadArg *) arg;
 
-	rtn = (uintptr_t) wd_ping_pgpool(thread_arg->lifeCheckNode);
+	rtn = (uintptr_t) wd_ping_pgpool(thread_arg->lifeCheckNode,thread_arg->password);
 
 	pthread_exit((void *) rtn);
 }
@@ -896,13 +906,10 @@ thread_ping_pgpool(void *arg)
  * Create connection to pgpool
  */
 static PGconn *
-create_conn(char *hostname, int port)
+create_conn(char *hostname, int port, char *password)
 {
 	static char conninfo[1024];
 	PGconn	   *conn;
-	char	   *password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
-														   pool_config->wd_lifecheck_password);
-
 
 	if (strlen(pool_config->wd_lifecheck_dbname) == 0)
 	{
@@ -928,9 +935,6 @@ create_conn(char *hostname, int port)
 			 pool_config->wd_interval / 2 + 1);
 	conn = PQconnectdb(conninfo);
 
-	if (password)
-		pfree(password);
-
 	if (PQstatus(conn) != CONNECTION_OK)
 	{
 		ereport(DEBUG1,
@@ -987,11 +991,11 @@ wd_check_heartbeat(LifeCheckNode * node)
  * Check if pgpool can accept the lifecheck query.
  */
 static int
-wd_ping_pgpool(LifeCheckNode * node)
+wd_ping_pgpool(LifeCheckNode * node, char* password)
 {
 	PGconn	   *conn;
 
-	conn = create_conn(node->hostName, node->pgpoolPort);
+	conn = create_conn(node->hostName, node->pgpoolPort, password);
 	if (conn == NULL)
 		return WD_NG;
 	return ping_pgpool(conn);
crash_fix.diff (4,127 bytes)

nagata

2019-01-16 15:29

developer   ~0002320

Last edited: 2019-01-16 15:36

View 3 revisions

I have tested this patch, but this was broken.

The position where pfree is called in check_pgpool_status_by_query was so wrong that password was freed before all threads exited. This caused another segmentation fault.
Also, there was a memory leak in is_wd_lifecheck_ready though it was a not big issue.

Attached is the fixed patch. This works well in my environment avoiding segfault. Could you please test this?



fixed_crash_fix.diff (3,959 bytes)
diff --git a/src/watchdog/wd_lifecheck.c b/src/watchdog/wd_lifecheck.c
index a9790d48..a24cad85 100644
--- a/src/watchdog/wd_lifecheck.c
+++ b/src/watchdog/wd_lifecheck.c
@@ -62,6 +62,7 @@ typedef struct
 {
 	LifeCheckNode *lifeCheckNode;
 	int			retry;			/* retry times (not used?) */
+	char		*password;
 }			WdPgpoolThreadArg;
 
 typedef struct WdUpstreamConnectionData
@@ -80,7 +81,7 @@ static bool wd_ping_all_server(void);
 static WdUpstreamConnectionData * wd_get_server_from_pid(pid_t pid);
 
 static void *thread_ping_pgpool(void *arg);
-static PGconn *create_conn(char *hostname, int port);
+static PGconn *create_conn(char *hostname, int port, char *password);
 
 static pid_t lifecheck_main(void);
 static void check_pgpool_status(void);
@@ -103,7 +104,7 @@ static const char *lifecheck_child_name(pid_t pid);
 static void reaper(void);
 static int	is_wd_lifecheck_ready(void);
 static int	wd_lifecheck(void);
-static int	wd_ping_pgpool(LifeCheckNode * node);
+static int	wd_ping_pgpool(LifeCheckNode * node, char *password);
 static pid_t fork_lifecheck_child(void);
 
 
@@ -640,6 +641,7 @@ is_wd_lifecheck_ready(void)
 {
 	int			rtn = WD_OK;
 	int			i;
+	char	   *password = NULL;
 
 	for (i = 0; i < gslifeCheckCluster->nodeCount; i++)
 	{
@@ -648,7 +650,10 @@ is_wd_lifecheck_ready(void)
 		/* query mode */
 		if (pool_config->wd_lifecheck_method == LIFECHECK_BY_QUERY)
 		{
-			if (wd_ping_pgpool(node) == WD_NG)
+			if (!password)
+				password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
+														   pool_config->wd_lifecheck_password);
+			if (wd_ping_pgpool(node, password) == WD_NG)
 			{
 				ereport(DEBUG1,
 						(errmsg("watchdog checking life check is ready"),
@@ -682,6 +687,9 @@ is_wd_lifecheck_ready(void)
 		}
 	}
 
+	if (password)
+		pfree(password);
+
 	return rtn;
 }
 
@@ -803,6 +811,8 @@ check_pgpool_status_by_query(void)
 	LifeCheckNode *node;
 	int			rc,
 				i;
+	char	   *password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
+														   pool_config->wd_lifecheck_password);
 
 	/* thread init */
 	pthread_attr_init(&attr);
@@ -813,6 +823,7 @@ check_pgpool_status_by_query(void)
 	{
 		node = &gslifeCheckCluster->lifeCheckNodes[i];
 		thread_arg[i].lifeCheckNode = node;
+		thread_arg[i].password = password;
 		rc = watchdog_thread_create(&thread[i], &attr, thread_ping_pgpool, (void *) &thread_arg[i]);
 	}
 
@@ -875,6 +886,9 @@ check_pgpool_status_by_query(void)
 			}
 		}
 	}
+
+	if (password)
+		pfree(password);
 }
 
 /*
@@ -886,8 +900,7 @@ thread_ping_pgpool(void *arg)
 {
 	uintptr_t	rtn;
 	WdPgpoolThreadArg *thread_arg = (WdPgpoolThreadArg *) arg;
-
-	rtn = (uintptr_t) wd_ping_pgpool(thread_arg->lifeCheckNode);
+	rtn = (uintptr_t) wd_ping_pgpool(thread_arg->lifeCheckNode,thread_arg->password);
 
 	pthread_exit((void *) rtn);
 }
@@ -896,12 +909,10 @@ thread_ping_pgpool(void *arg)
  * Create connection to pgpool
  */
 static PGconn *
-create_conn(char *hostname, int port)
+create_conn(char *hostname, int port, char *password)
 {
 	static char conninfo[1024];
 	PGconn	   *conn;
-	char	   *password = get_pgpool_config_user_password(pool_config->wd_lifecheck_user,
-														   pool_config->wd_lifecheck_password);
 
 
 	if (strlen(pool_config->wd_lifecheck_dbname) == 0)
@@ -928,9 +939,6 @@ create_conn(char *hostname, int port)
 			 pool_config->wd_interval / 2 + 1);
 	conn = PQconnectdb(conninfo);
 
-	if (password)
-		pfree(password);
-
 	if (PQstatus(conn) != CONNECTION_OK)
 	{
 		ereport(DEBUG1,
@@ -987,11 +995,11 @@ wd_check_heartbeat(LifeCheckNode * node)
  * Check if pgpool can accept the lifecheck query.
  */
 static int
-wd_ping_pgpool(LifeCheckNode * node)
+wd_ping_pgpool(LifeCheckNode * node, char* password)
 {
 	PGconn	   *conn;
 
-	conn = create_conn(node->hostName, node->pgpoolPort);
+	conn = create_conn(node->hostName, node->pgpoolPort, password);
 	if (conn == NULL)
 		return WD_NG;
 	return ping_pgpool(conn);
fixed_crash_fix.diff (3,959 bytes)

pengbo

2019-01-21 11:42

developer   ~0002323

Thank you for testing and fixing the patch.

Patch is commited:
https://git.postgresql.org/gitweb?p=pgpool2.git;a=commitdiff;h=3ae4acfa1037ae1ed4101465ea880909b59dd30e

nagata

2019-01-21 11:59

developer   ~0002324

Thanks. The fixed version will be released at Feb. 21st, right?

pengbo

2019-01-21 12:40

developer   ~0002325

Last edited: 2019-01-21 12:41

View 2 revisions

Yes, we plan to do next minor release on Feb. 21st.

nagata

2019-01-21 13:03

developer   ~0002326

Thank you! I'll close this.

Issue History

Date Modified Username Field Change
2019-01-10 11:17 nagata New Issue
2019-01-10 11:17 nagata File Added: wd_lifecheck_segfault.patch
2019-01-10 11:19 nagata Description Updated View Revisions
2019-01-10 12:11 nagata Steps to Reproduce Updated View Revisions
2019-01-10 14:01 pengbo Note Added: 0002317
2019-01-15 14:48 pengbo File Added: crash_fix.diff
2019-01-15 14:48 pengbo Note Added: 0002319
2019-01-16 15:29 nagata File Added: fixed_crash_fix.diff
2019-01-16 15:29 nagata Note Added: 0002320
2019-01-16 15:32 nagata Note Edited: 0002320 View Revisions
2019-01-16 15:36 nagata Note Edited: 0002320 View Revisions
2019-01-21 11:42 pengbo Note Added: 0002323
2019-01-21 11:59 nagata Note Added: 0002324
2019-01-21 12:40 pengbo Note Added: 0002325
2019-01-21 12:41 pengbo Note Edited: 0002325 View Revisions
2019-01-21 13:03 nagata Note Added: 0002326
2019-01-21 13:03 nagata Assigned To => nagata
2019-01-21 13:03 nagata Status new => closed
2019-01-21 13:03 nagata Resolution open => fixed