[pgpool-general: 2595] connection to pgpool2 waits forever after master failover

Fri Feb 28 23:16:50 JST 2014

Hi,

I'm trying to install a simple 2 node pgpool2 cluster with hot standby 
streaming replication and automatic failover.

So far the streaming replication between 2 postresql servers is working 
perfectly. For the failover part I'm using repmgr which also works great.

I can connect to both postgresql backends and also I can connect to both 
postgresql backends through pgpool2.

The automatic failover works. When I stop the master node, I can see 
that the standby becomes new master. I'm able to directly connect to new 
master postgresql node and it is no longer in read only mode. However 
when I want to connect to the cluster (after the failover) through 
pgpool2 (using psql client) the connection waits forever and I'm no 
longer to connect. I've turned on connection logging for postgresql and 
there is no record of an attempted connection. Pgpool2 log also doesn't 
show incomming connection attempt. To be able to connect through pgpool2 
again, I must restart it.

I'm unable to connect only after the master failover. If I stop the 
standby node, the backend is correctly detected as down, but I'm still 
able to connect to pgpool2.

Please help. Thank you.

My setup is:

2x PostgreSQL 9.3.2 on x86_64-unknown-linux-gnu, compiled by gcc 
(Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3, 64-bit
pgpool-II version 3.3.2 (tokakiboshi) (I've also tried the latest master 
at 46c9a71813768fa799580f3ae07599b9eaa14fef)
Linux 3.11.0-12-generic #19-Ubuntu SMP Wed Oct 9 16:20:46 UTC 2013 
x86_64 x86_64 x86_64 GNU/Linux

My pgpool.conf is:

listen_addresses = '*'
port = 9999
socket_dir = '/tmp'
pcp_port = 9898
pcp_socket_dir = '/tmp'
backend_hostname0 = 'dbnode1'
backend_port0 = 5432
backend_weight0 = 1
backend_data_directory0 = '/var/lib/postgresql/9.3/main'
backend_flag0 = 'ALLOW_TO_FAILOVER'
backend_hostname1 = 'dbnode2'
backend_port1 = 5432
backend_weight1 = 1
backend_data_directory1 = '/var/lib/postgresql/9.3/main'
backend_flag1 = 'ALLOW_TO_FAILOVER'
enable_pool_hba = on
pool_passwd = 'pool_passwd'
authentication_timeout = 10
ssl = off
num_init_children = 8
max_pool = 4
child_life_time = 300
child_max_connections = 0
connection_life_time = 0
client_idle_limit = 0
log_destination = 'stderr'
print_timestamp = on
log_connections = on
log_hostname = on
log_statement = off
log_per_node_statement = off
log_standby_delay = 'none'
syslog_facility = 'LOCAL0'
syslog_ident = 'pgpool'
debug_level = 1
pid_file_name = '/var/run/pgpool/pgpool.pid'
logdir = '/tmp'
connection_cache = off
reset_query_list = 'ABORT; DISCARD ALL'
replication_mode = off
replicate_select = off
insert_lock = on
lobj_lock_table = ''
replication_stop_on_mismatch = off
failover_if_affected_tuples_mismatch = off
load_balance_mode = off
ignore_leading_white_space = on
white_function_list = ''
black_function_list = 'nextval,setval'
master_slave_mode = on
master_slave_sub_mode = 'stream'
sr_check_period = 5
sr_check_user = 'repmgr'
sr_check_password = ''
delay_threshold = 0
follow_master_command = ''
parallel_mode = off
pgpool2_hostname = ''
system_db_hostname  = 'localhost'
system_db_port = 5432
system_db_dbname = 'pgpool'
system_db_schema = 'pgpool_catalog'
system_db_user = 'pgpool'
system_db_password = ''
health_check_period = 5
health_check_timeout = 30
health_check_user = 'repmgr'
health_check_password = ''
health_check_max_retries = 3
health_check_retry_delay = 10
failover_command = 'su postgres -c "ssh postgres@%H repmgr -f 
/var/lib/postgresql/9.3/repmgr/repmgr.conf --verbose standby promote"'
failback_command = ''
fail_over_on_backend_error = on
search_primary_node_timeout = 10
recovery_user = 'nobody'
recovery_password = ''
recovery_1st_stage_command = ''
recovery_2nd_stage_command = ''
recovery_timeout = 90
client_idle_limit_in_recovery = 0
use_watchdog = false
trusted_servers = ''
ping_path = '/bin'
wd_hostname = 'dbnode1'
wd_port = 9000
wd_authkey = ''
delegate_IP = '10.5.4.241'
ifconfig_path = '/sbin'
if_up_cmd = 'ifconfig eth0:0 inet $_IP_$ netmask 255.255.255.0'
if_down_cmd = 'ifconfig eth0:0 down'
arping_path = '/usr/sbin'           # arping command path
arping_cmd = 'arping -U $_IP_$ -w 1'
clear_memqcache_on_escalation = on
wd_escalation_command = ''
wd_lifecheck_method = 'heartbeat'
wd_interval = 3
wd_heartbeat_port = 9694
wd_heartbeat_keepalive = 2
wd_heartbeat_deadtime = 30
heartbeat_destination0 = 'dbnode2'
heartbeat_destination_port0 = 9694
heartbeat_device0 = ''
wd_life_point = 3
wd_lifecheck_query = 'SELECT 1'
wd_lifecheck_dbname = 'template1'
wd_lifecheck_user = 'repmgr'
wd_lifecheck_password = ''
other_pgpool_hostname0 = 'dbnode2'
other_pgpool_port0 = 9999
other_wd_port0 = 9000
relcache_expire = 0
relcache_size = 256
check_temp_table = on
memory_cache_enabled = off
memqcache_method = 'shmem'
memqcache_memcached_host = 'localhost'
memqcache_memcached_port = 11211
memqcache_total_size = 67108864
memqcache_max_num_cache = 1000000
memqcache_expire = 0
memqcache_auto_cache_invalidation = on
memqcache_maxcache = 409600
memqcache_cache_block_size = 1048576
memqcache_oiddir = '/var/log/pgpool/oiddir'
white_memqcache_table_list = ''
black_memqcache_table_list = ''

-- 
*Martin*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.sraoss.jp/pipermail/pgpool-general/attachments/20140228/291634d8/attachment.html>