[pgpool-general-jp: 388] Re: pgpool 3.4.1のhealth checkについて

2008年 2月 26日 (火) 00:07:40 JST

澤田です。

パッチの件、ご検討ありがとうございます。メールを書いてから
よく考えてみたのですが、health_checkは主に親プロセスから呼ばれるので、
child.cからmain.cに移してしまった方が、health_check_timer_expiredを
グローバル変数にする必要がなく、筋が良いかと思いました。いかがでしょうか？
(添付は元の3.4.1のコードに対する差分です)

> > また、health checkなしでも上記状況の場合に縮退させるには
> > どうしたらよいでしょうか？
> 
> 今のところ手段は無いです。単なるロック待ちという可能性もあるので、異常
> かどうかを判断するのが難しいためです。health check を有効にしていただ
> くか、別の監視ツール等で異常な PostgreSQL をシャットダウンさせるような
> 仕組みが必要になります。

こちらもありがとうございます。health checkが良さそうですね。

--澤田 研一
--k ＠ sawada.cc

From: Yoshiyuki Asaba <y-asaba ＠ sraoss.co.jp>
Subject: [pgpool-general-jp: 387] Re: pgpool 3.4.1のhealth checkについて
Date: Mon, 25 Feb 2008 19:15:16 +0900 (JST)

> 浅羽です。
> 
> From: Kenichi Sawada <k ＠ sawada.cc>
> Subject: [pgpool-general-jp: 386] pgpool 3.4.1のhealth checkについて
> Date: Mon, 25 Feb 2008 11:31:25 +0900 (JST)
> 
> > pgpool-3.4.1をhealth checkありで使っているのですが、
> > 「バックエンドのPostgreSQLが何らかの原因で動作しなくなったが
> > portはlistenしており、TCPレベルではconnectするものの、返答が一切ない」
> > という状況で、health checkが動作(タイムアウト)しないようです。
> > これはおそらくhealth_check()においてALARMシグナルがブロックされている
> > ためではないかと思い、添付の通りパッチを作成してみたところ
> > 一応動作するようですが、他部分への影響など何かありますでしょうか？
> 
> パッチありがとうございます。取り込ませていただきます。
> 
> 
> > また、health checkなしでも上記状況の場合に縮退させるには
> > どうしたらよいでしょうか？
> 
> 今のところ手段は無いです。単なるロック待ちという可能性もあるので、異常
> かどうかを判断するのが難しいためです。health check を有効にしていただ
> くか、別の監視ツール等で異常な PostgreSQL をシャットダウンさせるような
> 仕組みが必要になります。
> 
> --
> Yoshiyuki Asaba
> y-asaba ＠ sraoss.co.jp
> _______________________________________________
> pgpool-general-jp mailing list
> pgpool-general-jp ＠ sraoss.jp
> http://www.sraoss.jp/mailman/listinfo/pgpool-general-jp
> 
-------------- next part --------------
*** main.c	2007-08-10 12:57:44.000000000 +0900
--- main.c.new	2008-02-25 23:54:04.000000000 +0900
***************
*** 360,366 ****
  				/*
  				 * set health checker timeout. we want to detect
  				 * commnuication path failure much earlier before
! 				 * TCP/IP statck detects it.
  				 */
  				pool_signal(SIGALRM, health_check_timer_handler);
  				alarm(pool_config.health_check_timeout);
--- 360,366 ----
  				/*
  				 * set health checker timeout. we want to detect
  				 * commnuication path failure much earlier before
! 				 * TCP/IP stack detects it.
  				 */
  				pool_signal(SIGALRM, health_check_timer_handler);
  				alarm(pool_config.health_check_timeout);
***************
*** 371,377 ****
--- 371,379 ----
  			 */
  			errno = 0;
  			health_check_timer_expired = 0;
+ 			POOL_SETMASK(&UnBlockSig);
  			sts = health_check();
+ 			POOL_SETMASK(&BlockSig);
  
  			if (errno != EINTR || (errno == EINTR && health_check_timer_expired))
  			{
***************
*** 1134,1136 ****
--- 1136,1253 ----
  	}
  	POOL_SETMASK(&BlockSig);
  }
+ 
+ /*
+  * check if we can connect to the backend
+  * returns 0 for ok. -1 for master down, -2 for secondary down.
+  */
+ int health_check(void)
+ {
+ 	int fd;
+ 
+ 	/* V2 startup packet */
+ 	typedef struct {
+ 		int len;		/* startup packet length */
+ 		StartupPacket_v2 sp;
+ 	} MySp;
+ 	MySp mysp;
+ 	char kind;
+ 
+ 	memset(&mysp, 0, sizeof(mysp));
+ 	mysp.len = htonl(296);
+ 	mysp.sp.protoVersion = htonl(PROTO_MAJOR_V2 << 16);
+ 	strcpy(mysp.sp.database, "template1");
+  	strncpy(mysp.sp.user, pool_config.health_check_user, sizeof(mysp.sp.user) - 1);
+ 	*mysp.sp.options = '\0';
+ 	*mysp.sp.unused = '\0';
+ 	*mysp.sp.tty = '\0';
+ 
+ 	if (*pool_config.current_backend_host_name == '\0')
+ 		fd = connect_unix_domain_socket(0);
+ 	else
+ 		fd = connect_inet_domain_socket(0);
+ 
+ 	if (fd < 0)
+ 	{
+ 		pool_error("health check failed. master %s at port %d is down",
+ 				   pool_config.current_backend_host_name,
+ 				   pool_config.current_backend_port);
+ 		return -1;
+ 	}
+ 
+ 	if (write(fd, &mysp, sizeof(mysp)) < 0)
+ 	{
+ 		pool_error("health check failed during write. master %s at port %d is down",
+ 				   pool_config.current_backend_host_name,
+ 				   pool_config.current_backend_port);
+ 		close(fd);
+ 		return -1;
+ 	}
+ 
+ 	read(fd, &kind, 1);
+ 	if (health_check_timer_expired == 1) {
+ 		pool_error("health check failed during read. master %s at port %d is open but no response",
+ 				   pool_config.current_backend_host_name,
+ 				   pool_config.current_backend_port);
+ 		close(fd);
+ 		return -1;
+ 	}
+ 
+ 	if (write(fd, "X", 1) < 0)
+ 	{
+ 		pool_error("health check failed during write. master %s at port %d is down",
+ 				   pool_config.current_backend_host_name,
+ 				   pool_config.current_backend_port);
+ 		close(fd);
+ 		return -1;
+ 	}
+ 
+ 	close(fd);
+ 
+ 	if (!DUAL_MODE)
+ 		return 0;
+ 
+ 	if (*pool_config.secondary_backend_host_name == '\0')
+ 		fd = connect_unix_domain_socket(1);
+ 	else
+ 		fd = connect_inet_domain_socket(1);
+ 
+ 	if (fd < 0)
+ 	{
+ 		pool_error("health check failed. secondary %s at port %d is down",
+ 				   pool_config.secondary_backend_host_name,
+ 				   pool_config.secondary_backend_port);
+ 		return -2;
+ 	}
+ 
+ 	if (write(fd, &mysp, sizeof(mysp)) < 0)
+ 	{
+ 		pool_error("health check failed during write. secondary %s at port %d is down",
+ 				   pool_config.secondary_backend_host_name,
+ 				   pool_config.secondary_backend_port);
+ 		close(fd);
+ 		return -2;
+ 	}
+ 
+ 	read(fd, &kind, 1);
+ 	if (health_check_timer_expired == 1) {
+ 		pool_error("health check failed during read. secondary %s at port %d is open but no response",
+ 				   pool_config.secondary_backend_host_name,
+ 				   pool_config.secondary_backend_port);
+ 		close(fd);
+ 		return -2;
+ 	}
+ 
+ 	if (write(fd, "X", 1) < 0)
+ 	{
+ 		pool_error("health check failed during write. secondary %s at port %d is down",
+ 				   pool_config.secondary_backend_host_name,
+ 				   pool_config.secondary_backend_port);
+ 		close(fd);
+ 		return -2;
+ 	}
+ 
+ 	close(fd);
+ 
+ 	return 0;
+ }
-------------- next part --------------
*** child.c	2007-08-01 13:25:40.000000000 +0900
--- child.c.new	2008-02-25 23:52:15.000000000 +0900
***************
*** 1128,1231 ****
  		free(sp);
  	}
  }
- 
- /*
-  * check if we can connect to the backend
-  * returns 0 for ok. -1 for master down, -2 for secondary down.
-  */
- int health_check(void)
- {
- 	int fd;
- 
- 	/* V2 startup packet */
- 	typedef struct {
- 		int len;		/* startup packet length */
- 		StartupPacket_v2 sp;
- 	} MySp;
- 	MySp mysp;
- 	char kind;
- 
- 	memset(&mysp, 0, sizeof(mysp));
- 	mysp.len = htonl(296);
- 	mysp.sp.protoVersion = htonl(PROTO_MAJOR_V2 << 16);
- 	strcpy(mysp.sp.database, "template1");
-  	strncpy(mysp.sp.user, pool_config.health_check_user, sizeof(mysp.sp.user) - 1);
- 	*mysp.sp.options = '\0';
- 	*mysp.sp.unused = '\0';
- 	*mysp.sp.tty = '\0';
- 
- 	if (*pool_config.current_backend_host_name == '\0')
- 		fd = connect_unix_domain_socket(0);
- 	else
- 		fd = connect_inet_domain_socket(0);
- 
- 	if (fd < 0)
- 	{
- 		pool_error("health check failed. master %s at port %d is down",
- 				   pool_config.current_backend_host_name,
- 				   pool_config.current_backend_port);
- 		return -1;
- 	}
- 
- 	if (write(fd, &mysp, sizeof(mysp)) < 0)
- 	{
- 		pool_error("health check failed during write. master %s at port %d is down",
- 				   pool_config.current_backend_host_name,
- 				   pool_config.current_backend_port);
- 		close(fd);
- 		return -1;
- 	}
- 
- 	read(fd, &kind, 1);
- 
- 	if (write(fd, "X", 1) < 0)
- 	{
- 		pool_error("health check failed during write. master %s at port %d is down",
- 				   pool_config.current_backend_host_name,
- 				   pool_config.current_backend_port);
- 		close(fd);
- 		return -1;
- 	}
- 
- 	close(fd);
- 
- 	if (!DUAL_MODE)
- 		return 0;
- 
- 	if (*pool_config.secondary_backend_host_name == '\0')
- 		fd = connect_unix_domain_socket(1);
- 	else
- 		fd = connect_inet_domain_socket(1);
- 
- 	if (fd < 0)
- 	{
- 		pool_error("health check failed. secondary %s at port %d is down",
- 				   pool_config.secondary_backend_host_name,
- 				   pool_config.secondary_backend_port);
- 		return -2;
- 	}
- 
- 	if (write(fd, &mysp, sizeof(mysp)) < 0)
- 	{
- 		pool_error("health check failed during write. secondary %s at port %d is down",
- 				   pool_config.secondary_backend_host_name,
- 				   pool_config.secondary_backend_port);
- 		close(fd);
- 		return -2;
- 	}
- 
- 	read(fd, &kind, 1);
- 
- 	if (write(fd, "X", 1) < 0)
- 	{
- 		pool_error("health check failed during write. secondary %s at port %d is down",
- 				   pool_config.secondary_backend_host_name,
- 				   pool_config.secondary_backend_port);
- 		close(fd);
- 		return -2;
- 	}
- 
- 	close(fd);
- 
- 	return 0;
- }
--- 1128,1130 ----