[Pgpool-general] Ongoing issue with losing cluster members

Sean Brown sean.brown at eaglepress.com
Mon Jul 19 18:30:50 UTC 2010


We have a 3-node PgPool cluster that will not hold itself together. After 
syncing up it will run, sometimes for a day, sometimes for a week or two, 
before one node falls out of replication; usually several hours later, though 
sometimes a few days later, a second node falls out.

No errors are logged in either postgresql.log or pgpool.log (as far as I can 
tell; pgpool.log is full of debug output and it is hard to pinpoint the moment 
of failure in it). postgresql.log on the nodes chugs along normally, then 
shows this at the moment the node falls out:

postgres[10747] 2010-07-19 12:47:02 EDT t_id:0 user:[user] database:[database] 
Command:idle in transaction - LOG:  08P01: unexpected EOF on client connection

It shows that message once for every connection that was open. I'm hoping 
someone can give me a better idea of where to look to find the cause. PgPool 
is serving an online ordering system and an internal Redmine service; neither 
sees much activity.
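
For what it's worth, the dropped sessions show up as "idle in transaction" on 
the backends. A check along these lines (using the 8.4 pg_stat_activity 
columns and the backend host/port from the config below) lists them:

  # list backends sitting idle inside an open transaction
  psql -h db1.domain.local -p 5433 -U postgres -d postgres -c \
    "SELECT procpid, usename, client_addr, xact_start
       FROM pg_stat_activity
      WHERE current_query = '<IDLE> in transaction';"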

Any hints from someone who knows more about this than I do would be very much 
appreciated. We're running PostgreSQL 8.4.4 and PgPool 2.3.3.

Let me know what other information I need to provide.

pgpool.conf -

# Host name or IP address to listen on: '*' for all, '' for no TCP/IP
# connections
listen_addresses = '10.11.0.10'

# Port number for pgpool
port = 5432

# Port number for pgpool communication manager
pcp_port = 9898

# Unix domain socket path.  (The Debian package defaults to
# /var/run/postgresql.)
socket_dir = '/tmp'

# Unix domain socket path for pgpool communication manager.
# (Debian package defaults to /var/run/postgresql)
pcp_socket_dir = '/tmp'

# Unix domain socket path for the backend.  (The Debian package defaults to
# /var/run/postgresql.)
backend_socket_dir = '/tmp'

# pgpool communication manager timeout.  0 means no timeout, but that is
# strongly discouraged.
pcp_timeout = 10

# Number of pre-forked child processes
num_init_children = 45

# Number of connection pools allowed for a child process
max_pool = 5

# If idle for this many seconds, child exits.  0 means no timeout.
child_life_time = 300

# If idle for this many seconds, connection to PostgreSQL closes.
# 0 means no timeout.
connection_life_time = 0

# If child_max_connections connections were received, child exits.
# 0 means no exit.
child_max_connections = 0

# If client_idle_limit is n (n > 0), the client is forcibly disconnected
# after n seconds of idle time (even inside an explicit transaction!).
# 0 means no disconnect.
client_idle_limit = 0

# Maximum time in seconds to complete client authentication.
# 0 means no timeout.
authentication_timeout = 120

# Logging directory
logdir = '/var/log/pgpool'

# Replication mode
replication_mode = true

# Load balancing mode, i.e., all SELECTs are load balanced.
# This is ignored if replication_mode is false.
load_balance_mode = true

# If there's a data mismatch between the master and a secondary, start
# degeneration (detach the mismatched node) to stop replication mode.
replication_stop_on_mismatch = true

# If true, replicate SELECT statements when load balancing is disabled.
# If false, they are sent only to the master node.
replicate_select = true

# Semicolon separated list of queries to be issued at the end of a session
reset_query_list = 'ABORT; DISCARD ALL'

# If true, print a timestamp on each log line.
print_timestamp = true

# If true, operate in master/slave mode.
master_slave_mode = false

# If true, cache connection pool.
connection_cache = false

# Health check timeout.  0 means no timeout.
health_check_timeout = 20

# Health check period.  0 means no health check.
health_check_period = 10

# Health check user
health_check_user = 'postgres'
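
# (A rough manual equivalent of the health check, runnable from the pgpool
# host -- the backend host/port are from the backend section below:
#   psql -h db1.domain.local -p 5433 -U postgres -c 'SELECT 1'
# pgpool itself only makes a connection attempt as this user; if that
# attempt fails or stalls past health_check_timeout, the node is
# degenerated.)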

# Command to execute at failover.
# special values:  %d = node id
#                  %h = host name
#                  %p = port number
#                  %D = database cluster path
#                  %m = new master node id
#                  %M = old master node id
#                  %% = '%' character
#
failover_command = ''
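
# (Hypothetical example, not part of this config: pointing failover_command
# at a small site-local script that records its arguments makes it easier
# to see when and why a node was dropped, e.g.
#   failover_command = '/usr/local/bin/log_failover.sh %d %h %m %M')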

# Command to execute at failback.
# special values:  %d = node id
#                  %h = host name
#                  %p = port number
#                  %D = database cluster path
#                  %m = new master node id
#                  %M = old master node id
#                  %% = '%' character
#
failback_command = ''

# If true, automatically lock the table on INSERT statements to keep
# SERIAL data consistent.  An /*INSERT LOCK*/ comment has the same effect;
# a /*NO INSERT LOCK*/ comment disables it.
insert_lock = true
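
# (Per-query usage example, with placeholder table names:
#   /*INSERT LOCK*/ INSERT INTO orders VALUES (...);       -- force the lock
#   /*NO INSERT LOCK*/ INSERT INTO audit_log VALUES (...); -- skip it
# The comment goes in front of the INSERT keyword.)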

# If true, ignore leading white space in each query when pgpool judges
# whether the query is a SELECT so that it can be load balanced.  This is
# useful for certain APIs such as DBI/DBD, which are known to add an
# extra leading white space.
ignore_leading_white_space = true

# If true, print all statements to the log.  Like the log_statement option
# to PostgreSQL, this allows for observing queries without engaging in full
# debugging.
log_statement = true

# If true, incoming connections will be printed to the log.
log_connections = true

# If true, the hostname will be shown in ps status, and in the
# connection log if log_connections = true.
# Be warned that this feature adds the overhead of a hostname lookup.
log_hostname = true

# If true, run in parallel query mode.
parallel_mode = false

# If true, use the query cache.
enable_query_cache = false

# Set the pgpool2 hostname
pgpool2_hostname = 'pgpool.domain.local'

# system DB info
system_db_hostname = 'localhost'
system_db_port = 5432
system_db_dbname = 'pgpool'
system_db_schema = 'pgpool_catalog'
system_db_user = 'pgpool'
system_db_password = ''

# backend_hostname, backend_port, backend_weight
# here are examples
backend_hostname0 = 'db1.domain.local'
backend_port0 = 5433
backend_weight0 = 1
backend_data_directory0 = '/db/pgdb'

backend_hostname1 = 'db2.domain.local'
backend_port1 = 5433
backend_weight1 = 1
backend_data_directory1 = '/db/pgdb'

backend_hostname2 = 'db3.domain.local'
backend_port2 = 5433
backend_weight2 = 1
backend_data_directory2 = '/db/pgdb'
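
# (To check whether pgpool still considers a node attached, the pcp tools
# that ship with pgpool-II can be pointed at pcp_port above, e.g. for
# node 0 -- the user and password come from pcp.conf, not this file:
#   pcp_node_info 10 localhost 9898 <pcp_user> <pcp_password> 0
# This prints the node's host, port, status and weight; a dropped node can
# be reattached with pcp_attach_node using the same arguments.)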

# - HBA -

# If true, use pool_hba.conf for client authentication. In pgpool-II
# 1.1, the default value is false. The default value will be true in
# 1.2.
enable_pool_hba = false

# - online recovery -
# online recovery user
recovery_user = 'postgres'

# online recovery password
recovery_password = 'postgres'

# execute a command in first stage.
recovery_1st_stage_command = 'pgpool_recovery'

# execute a command in second stage.
recovery_2nd_stage_command = 'pgpool_recovery'

# maximum time in seconds to wait for remote start-up. 0 means no wait
recovery_timeout = 90
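
# (Online recovery is triggered through the pcp interface, e.g. to recover
# node 1 -- credentials again from pcp.conf:
#   pcp_recovery_node 90 localhost 9898 <pcp_user> <pcp_password> 1
# This runs the first and second stage commands above against that node.)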

# If client_idle_limit_in_recovery is n (n > 0), the client is forcibly
# disconnected after n seconds of idle time (even inside an explicit
# transaction!).  0 means no disconnect.  This parameter only takes
# effect in the second stage of recovery.
client_idle_limit_in_recovery = 0

# Specify a table name to lock.  This is used when rewriting the lo_creat
# command in replication mode.  The table must exist and be writable by
# public.  If the table name is '', no rewriting occurs.
lobj_lock_table = ''

# If true, enable SSL support for both frontend and backend connections.
# note that you must also set ssl_key and ssl_cert for SSL to work in
# the frontend connections.
ssl = false
# path to the SSL private key file
#ssl_key = './server.key'
# path to the SSL public certificate file
#ssl_cert = './server.cert'

# If either ssl_ca_cert or ssl_ca_cert_dir is set, then certificate
# verification will be performed to establish the authenticity of the
# certificate.  If neither is set to a nonempty string then no such
# verification takes place.  ssl_ca_cert should be a path to a single
# PEM format file containing CA root certificate(s), whereas ssl_ca_cert_dir
# should be a directory containing such files.  These are analogous to the
# -CAfile and -CApath options to openssl verify(1), respectively.
#ssl_ca_cert = ''
#ssl_ca_cert_dir = ''

