[pgpool-general: 2171] Re: read_startup_packet: incorrect packet length What does that mean?

Tue Oct 8 06:08:47 JST 2013

Hi Tatsuo.
Sorry for the delayed answer.
In the meantime I've changed the configuration, but the message 
"read_startup_packet: incorrect packet length" didn't go away. I am attaching 
configuration at the end of the e-mail and also a log file right after I 
started Pgpool.

I think I understand the issue I've described in the last e-mail. I mean the 
psql -c "show pool_nodes;" -d postgres -U postgres -p 9898 -> the port should 
be 9999. Just silly typo.

I would like to summary of the issues I am experiencing now:
1. There are periodic ERROR messages in the Pgpool log file - 
"read_startup_packet: incorrect packet length (-1656006549)".
2. There are periodic messages in the PostgreSQL log file - "incomplete 
startup packet".
3. If I halt or reboot primary node (soft shutdown), the secondary node 
triggers fail-over immediatelly. --> NOT DESIRED BEHAVIOUR
4. If I power off the primary node (unexpectedly), the secondary node applies 
health_check and triggers fail-over after health_check time-out. --> DESIRED 
BEHAVIOUR

I think, the first two issues are related. But don't know what could cause it. 
We are using special services which connects to the Pgpool by using JDBC. Can 
the services be the source of these error messages?

According to issue number 3.. I can see in the log file, what happened: 
"postmaster on DB node 0 was shutdown by administrative command".
How can I force Pgpool to not fail-over after the administrative command was 
issued? I would like to apply same health_check timouts also in the situation 
when the administrative command was issued. It is because in our environment 
one node is represented by more hosts where the Postgresql services are 
migrating from one host to another automatically. It occurs also if the "halt" 
or "reboot" command is issued on that one node. This is not errorneous 
condition for our environment, because if the "halt" is issued on one host, 
PostgreSQL service will be brought up on another node. Virtual IP address is 
in front of it.

THE ENVIRONMENT:
Node 1:
- Installed PostgreSQL 9.2 and Pgpool 3.3.1 - Primary
- Streaming replication Primary
Node 2:
- Installed PostgreSQL 9.2 and Pgpool 3.3.1 - Secondary
- Streaming replication Secondary

CONFIG FILE (same for both nodes):
listen_addresses = '*'
port = 9999
socket_dir = '/tmp'
pcp_port = 9898
pcp_socket_dir = '/tmp'
backend_hostname0 = 'node1'
backend_port0 = 5432
backend_weight0 = 1
backend_data_directory0 = '/var/lib/pgsql/9.2/data'
backend_flag0 = 'ALLOW_TO_FAILOVER'
backend_hostname1 = 'node2'
backend_port1 = 5432
backend_weight1 = 1
backend_data_directory1 = '/var/lib/pgsql/9.2/data'
backend_flag1 = 'ALLOW_TO_FAILOVER'
enable_pool_hba = on
pool_passwd = 'pool_passwd'
authentication_timeout = 60
ssl = off
num_init_children = 32
max_pool = 4
child_life_time = 300
child_max_connections = 0
connection_life_time = 0
client_idle_limit = 0
log_destination = 'stderr'
print_timestamp = on
log_connections = off
log_hostname = off
log_statement = off
log_per_node_statement = off
log_standby_delay = 'none'
syslog_facility = 'LOCAL0'
syslog_ident = 'pgpool'
debug_level = 0
pid_file_name = '/var/run/pgpool/pgpool.pid'
logdir = '/var/log/pgpool'
connection_cache = on
reset_query_list = 'ABORT; DISCARD ALL'
replication_mode = off
replicate_select = off
insert_lock = on
lobj_lock_table = ''
replication_stop_on_mismatch = off
failover_if_affected_tuples_mismatch = off
load_balance_mode = off
ignore_leading_white_space = on
white_function_list = ''
black_function_list = 'nextval,setval'
master_slave_mode = on
master_slave_sub_mode = 'stream'
sr_check_period = 10
sr_check_user = 'pgpool'
sr_check_password = 'pgpool'
delay_threshold = 10000000
follow_master_command = ''
parallel_mode = off
pgpool2_hostname = ''
system_db_hostname  = 'localhost'
system_db_port = 5432
system_db_dbname = 'pgpool'
system_db_schema = 'pgpool_catalog'
system_db_user = 'pgpool'
system_db_password = ''
health_check_period = 70
health_check_timeout = 30
health_check_user = 'pgpool'
health_check_password = 'pgpool'
health_check_max_retries = 3
health_check_retry_delay = 10
failover_command = '/usr/pgsql-9.2/bin/failover_stream.sh %d %H 
/tmp/trigger_file'
failback_command = ''
fail_over_on_backend_error = off
search_primary_node_timeout = 10
recovery_user = 'nobody'
recovery_password = ''
recovery_1st_stage_command = ''
recovery_2nd_stage_command = ''
recovery_timeout = 90
client_idle_limit_in_recovery = 0
use_watchdog = off
trusted_servers = '192.168.x.x'
ping_path = '/bin'
wd_hostname = ''
wd_port = 9000
wd_authkey = ''
delegate_IP = ''
ifconfig_path = '/sbin'
if_up_cmd = 'ifconfig eth0:0 inet $_IP_$ netmask 255.255.255.0'
if_down_cmd = 'ifconfig eth0:0 down'
arping_path = '/usr/sbin'
arping_cmd = 'arping -U $_IP_$ -w 1'
clear_memqcache_on_escalation = on
wd_escalation_command = ''
wd_lifecheck_method = 'heartbeat'
wd_interval = 5
wd_heartbeat_port = 9694
wd_heartbeat_keepalive = 2
wd_heartbeat_deadtime = 30
heartbeat_destination0 = 'node2'
heartbeat_destination_port0 = 9694
heartbeat_device0 = ''
wd_life_point = 3
wd_lifecheck_query = 'SELECT 1'
wd_lifecheck_dbname = 'template1'
wd_lifecheck_user = 'replication'
wd_lifecheck_password = 'replicate'
relcache_expire = 0
relcache_size = 256
check_temp_table = on
memory_cache_enabled = off
memqcache_method = 'shmem'
memqcache_memcached_host = 'localhost'
memqcache_memcached_port = 11211
memqcache_total_size = 67108864
memqcache_max_num_cache = 1000000
memqcache_expire = 0
memqcache_auto_cache_invalidation = on
memqcache_maxcache = 409600
memqcache_cache_block_size = 1048576
memqcache_oiddir = '/var/log/pgpool/oiddir'
white_memqcache_table_list = ''
black_memqcache_table_list = ''

PGPOOL.LOG - NODE1:
2013-10-07 22:37:28 LOG:   pid 19961: received fast shutdown request
2013-10-07 22:37:37 LOG:   pid 21563: read_status_file: 1 th backend is set to 
down status
2013-10-07 22:37:37 LOG:   pid 21563: pgpool-II successfully started. version 
3.3.1 (tokakiboshi)
2013-10-07 22:37:37 LOG:   pid 21563: find_primary_node: primary node id is 0
2013-10-07 22:37:47 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:01 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:17 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:28 LOG:   pid 21606: send_failback_request: fail back 1 th 
node request from pid 21606
2013-10-07 22:38:28 LOG:   pid 21563: starting fail back. reconnect host 
node2(5432)
2013-10-07 22:38:28 LOG:   pid 21563: Do not restart children because we are 
failbacking node id 1 hostnode2 port:5432 and we are in streaming replication 
mode
2013-10-07 22:38:28 LOG:   pid 21563: find_primary_node_repeatedly: waiting 
for finding a primary node
2013-10-07 22:38:28 LOG:   pid 21563: find_primary_node: primary node id is 0
2013-10-07 22:38:28 LOG:   pid 21563: failover: set new primary node: 0
2013-10-07 22:38:28 LOG:   pid 21563: failover: set new master node: 0
2013-10-07 22:38:28 LOG:   pid 21563: failback done. reconnect host 
node2(5432)
2013-10-07 22:38:28 LOG:   pid 21607: worker process received restart request
2013-10-07 22:38:29 LOG:   pid 21606: pcp child process received restart 
request
2013-10-07 22:38:29 LOG:   pid 21563: PCP child 21606 exits with status 256 in 
failover()
2013-10-07 22:38:29 LOG:   pid 21563: fork a new PCP child pid 22141 in 
failover()
2013-10-07 22:38:29 LOG:   pid 21563: worker child 21607 exits with status 256
2013-10-07 22:38:29 LOG:   pid 21563: fork a new worker child pid 22142
2013-10-07 22:38:31 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:31 LOG:   pid 21574: do_child: failback event found. restart 
myself.
2013-10-07 22:38:47 ERROR: pid 22144: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:39:01 LOG:   pid 21602: do_child: failback event found. restart 
myself.
2013-10-07 22:39:01 ERROR: pid 22144: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:39:17 LOG:   pid 21591: do_child: failback event found. restart 
myself.
2013-10-07 22:39:17 ERROR: pid 22480: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:39:31 LOG:   pid 21586: do_child: failback event found. restart 
myself.
2013-10-07 22:39:31 ERROR: pid 22633: read_startup_packet: incorrect packet 
length (-189809103)

PGPOOL.LOG - NODE2:
2013-10-07 22:37:28 LOG:   pid 19961: received fast shutdown request
2013-10-07 22:37:37 LOG:   pid 21563: read_status_file: 1 th backend is set to 
down status
2013-10-07 22:37:37 LOG:   pid 21563: pgpool-II successfully started. version 
3.3.1 (tokakiboshi)
2013-10-07 22:37:37 LOG:   pid 21563: find_primary_node: primary node id is 0
2013-10-07 22:37:47 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:01 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:17 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:28 LOG:   pid 21606: send_failback_request: fail back 1 th 
node request from pid 21606
2013-10-07 22:38:28 LOG:   pid 21563: starting fail back. reconnect host 
node2(5432)
2013-10-07 22:38:28 LOG:   pid 21563: Do not restart children because we are 
failbacking node id 1 hostnode2 port:5432 and we are in streaming replication 
mode
2013-10-07 22:38:28 LOG:   pid 21563: find_primary_node_repeatedly: waiting 
for finding a primary node
2013-10-07 22:38:28 LOG:   pid 21563: find_primary_node: primary node id is 0
2013-10-07 22:38:28 LOG:   pid 21563: failover: set new primary node: 0
2013-10-07 22:38:28 LOG:   pid 21563: failover: set new master node: 0
2013-10-07 22:38:28 LOG:   pid 21563: failback done. reconnect host 
node2(5432)
2013-10-07 22:38:28 LOG:   pid 21607: worker process received restart request
2013-10-07 22:38:29 LOG:   pid 21606: pcp child process received restart 
request
2013-10-07 22:38:29 LOG:   pid 21563: PCP child 21606 exits with status 256 in 
failover()
2013-10-07 22:38:29 LOG:   pid 21563: fork a new PCP child pid 22141 in 
failover()
2013-10-07 22:38:29 LOG:   pid 21563: worker child 21607 exits with status 256
2013-10-07 22:38:29 LOG:   pid 21563: fork a new worker child pid 22142
2013-10-07 22:38:31 ERROR: pid 21574: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:38:31 LOG:   pid 21574: do_child: failback event found. restart 
myself.
2013-10-07 22:38:47 ERROR: pid 22144: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:39:01 LOG:   pid 21602: do_child: failback event found. restart 
myself.
2013-10-07 22:39:01 ERROR: pid 22144: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:39:17 LOG:   pid 21591: do_child: failback event found. restart 
myself.
2013-10-07 22:39:17 ERROR: pid 22480: read_startup_packet: incorrect packet 
length (-189809103)
2013-10-07 22:39:31 LOG:   pid 21586: do_child: failback event found. restart 
myself.
2013-10-07 22:39:31 ERROR: pid 22633: read_startup_packet: incorrect packet 
length (-189809103)

POSTGRESQL LOG - NODE1:
2013-10-07 22:36:45 CEST LOG:  shutting down
2013-10-07 22:36:45 CEST LOG:  database system is shut down
2013-10-07 22:37:53 CEST LOG:  database system was interrupted; last known up 
at 2013-10-07 22:37:01 CEST
2013-10-07 22:37:53 CEST LOG:  entering standby mode
2013-10-07 22:37:53 CEST LOG:  streaming replication successfully connected to 
primary
2013-10-07 22:37:55 CEST LOG:  redo starts at 1/A5000020
2013-10-07 22:37:55 CEST LOG:  consistent recovery state reached at 1/A50000E0
2013-10-07 22:37:55 CEST LOG:  database system is ready to accept read only 
connections
2013-10-07 22:37:57 CEST LOG:  incomplete startup packet
2013-10-07 22:38:05 CEST LOG:  incomplete startup packet
2013-10-07 22:38:27 CEST LOG:  incomplete startup packet
2013-10-07 22:38:35 CEST LOG:  incomplete startup packet
2013-10-07 22:38:57 CEST LOG:  incomplete startup packet
2013-10-07 22:39:05 CEST LOG:  incomplete startup packet
2013-10-07 22:39:27 CEST LOG:  incomplete startup packet
2013-10-07 22:39:35 CEST LOG:  incomplete startup packet

PGPOOL.LOG - NODE2 - after the fail-over:
2013-10-07 22:54:09 LOG:   pid 1651: postmaster on DB node 0 was shutdown by 
administrative command
2013-10-07 22:54:09 LOG:   pid 1651: degenerate_backend_set: 0 fail over 
request from pid 1651
2013-10-07 22:54:09 LOG:   pid 843: starting degeneration. shutdown host 
node1(5432)
2013-10-07 22:54:09 LOG:   pid 843: Restart all children
2013-10-07 22:54:09 LOG:   pid 856: postmaster on DB node 0 was shutdown by 
administrative command
2013-10-07 22:54:09 LOG:   pid 843: execute command: 
/usr/pgsql-9.2/bin/failover_stream.sh 0 node2 /tmp/trigger_file
2013-10-07 22:54:09 LOG:   pid 843: find_primary_node_repeatedly: waiting for 
finding a primary node

Best regards,
Michal Mistina
-----Original Message-----
From: Tatsuo Ishii [mailto:ishii at postgresql.org]
Sent: Saturday, September 21, 2013 5:43 AM
To: Mistina Michal
Cc: pgpool-general at pgpool.net
Subject: Re: [pgpool-general: 2141] Re: read_startup_packet: incorrect packet 
length What does that mean?

Please show pgpool log.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese: http://www.sraoss.co.jp

> In addition.
>
> The errors described in the post could cause also something weird what
> I am experiencing.
>
> I hit following to show cluster nodes:
>
> psql -c "show pool_nodes;" -d postgres -U postgres -p 9898
>
>
>
> The terminal session is stucked then. I cannot cancel the command even
> with
> CTRL+C.
>
> However I can attach node successfully with command:
>
> pcp_attach_node -d 5 tc1 9898 postgres postgres 1
>
>
>
> Best regards,
>
> Michal Mistina
>
>
>
> From: pgpool-general-bounces at pgpool.net
> [mailto:pgpool-general-bounces at pgpool.net] On Behalf Of Mistina Michal
> Sent: Tuesday, September 17, 2013 1:54 PM
> To: pgpool-general at pgpool.net
> Subject: [pgpool-general: 2140] read_startup_packet: incorrect packet
> length What does that mean?
>
>
>
> Dear all.
>
> I don't understand what I see in the log files. Following keeps
> happening all the time after I start pgpool:
>
> 2013-09-17 13:26:31 ERROR: pid 5697: read_startup_packet: incorrect
> packet length (983664419)
>
> 2013-09-17 13:26:49 LOG:   pid 3608: do_child: failback event found. restart
> myself.
>
> 2013-09-17 13:26:49 ERROR: pid 5833: read_startup_packet: incorrect
> packet length (983664419)
>
> 2013-09-17 13:27:01 LOG:   pid 3591: do_child: failback event found. restart
> myself.
>
> 2013-09-17 13:27:01 ERROR: pid 6000: read_startup_packet: incorrect
> packet length (983664419)
>
> 2013-09-17 13:27:19 ERROR: pid 5372: read_startup_packet: incorrect
> packet length (983664419)
>
> 2013-09-17 13:27:31 LOG:   pid 3586: do_child: failback event found. restart
> myself.
>
> 2013-09-17 13:27:31 ERROR: pid 5372: read_startup_packet: incorrect
> packet length (983664419)
>
> 2013-09-17 13:27:49 ERROR: pid 6151: read_startup_packet: incorrect
> packet length (983664419)
>
> 2013-09-17 13:28:01 ERROR: pid 3590: read_startup_packet: incorrect
> packet length (983664419)
>
>
>
> Versions:
>
> pgpool-II-3.3.1-1.pgdg.x86_64
>
> postgresql92-9.2.4-1PGDG.rhel6.x86_64
>
> postgresql92-libs-9.2.4-1PGDG.rhel6.x86_64
>
> postgresql92-server-9.2.4-1PGDG.rhel6.x86_64
>
>
>
> Where should I look at? When do occure aforementioned errors?
>
>
>
> Probably I set something wrong in the config file. Or permissions on
> some files are wrong.
>
>
>
> The pgpool.conf follows.
>
> listen_addresses = '*'
>
> port = 9999
>
> socket_dir = '/tmp'
>
> pcp_port = 9898
>
> pcp_socket_dir = '/tmp'
>
> backend_hostname0 = 'tc1'
>
> backend_port0 = 5432
>
> backend_weight0 = 1
>
> backend_data_directory0 = '/var/lib/pgsql/9.2/data'
>
> backend_flag0 = 'ALLOW_TO_FAILOVER'
>
> backend_hostname1 = 'tc2'
>
> backend_port1 = 5432
>
> backend_weight1 = 1
>
> backend_data_directory1 = '/var/lib/pgsql/9.2/data'
>
> backend_flag1 = 'ALLOW_TO_FAILOVER'
>
> enable_pool_hba = off
>
> pool_passwd = 'pool_passwd'
>
> authentication_timeout = 60
>
> ssl = off
>
> num_init_children = 32
>
> max_pool = 4
>
> child_life_time = 300
>
> child_max_connections = 0
>
> connection_life_time = 0
>
> client_idle_limit = 0
>
> log_destination = 'stderr'
>
> print_timestamp = on
>
> log_connections = off
>
> log_hostname = off
>
> log_statement = off
>
> log_per_node_statement = off
>
> log_standby_delay = 'none'
>
> syslog_facility = 'LOCAL0'
>
> syslog_ident = 'pgpool'
>
> debug_level = 0
>
> pid_file_name = '/var/run/pgpool/pgpool.pid'
>
> logdir = '/var/log/pgpool'
>
> connection_cache = on
>
> reset_query_list = 'ABORT; DISCARD ALL'
>
> replication_mode = off
>
> replicate_select = off
>
> insert_lock = on
>
> lobj_lock_table = ''
>
> replication_stop_on_mismatch = off
>
> failover_if_affected_tuples_mismatch = off
>
> load_balance_mode = off
>
> ignore_leading_white_space = on
>
> white_function_list = ''
>
> black_function_list = 'nextval,setval'
>
> master_slave_mode = on
>
> master_slave_sub_mode = 'stream'
>
> sr_check_period = 10
>
> sr_check_user = 'replication'
>
> sr_check_password = 'replicate'
>
> delay_threshold = 10000000
>
> follow_master_command = ''
>
> parallel_mode = off
>
> pgpool2_hostname = ''
>
> system_db_hostname  = 'localhost'
>
> system_db_port = 5432
>
> system_db_dbname = 'pgpool'
>
> system_db_schema = 'pgpool_catalog'
>
> system_db_user = 'pgpool'
>
> system_db_password = ''
>
> health_check_period = 20
>
> health_check_timeout = 20
>
> health_check_user = 'replication'
>
> health_check_password = 'replicate'
>
> health_check_max_retries = 5
>
> health_check_retry_delay = 1
>
> failover_command = '/usr/pgsql-9.2/bin/failover_stream.sh %d %H
> /tmp/trigger_file'
>
> failback_command = ''
>
> fail_over_on_backend_error = on
>
> search_primary_node_timeout = 10
>
> recovery_user = 'nobody'
>
> recovery_password = ''
>
> recovery_1st_stage_command = ''
>
> recovery_2nd_stage_command = ''
>
> recovery_timeout = 90
>
> client_idle_limit_in_recovery = 0
>
> use_watchdog = off
>
> trusted_servers = '192.168.105.254'
>
> ping_path = '/bin'
>
> wd_hostname = ''
>
> wd_port = 9000
>
> wd_authkey = ''
>
> delegate_IP = ''
>
> ifconfig_path = '/sbin'
>
> if_up_cmd = 'ifconfig eth0:0 inet $_IP_$ netmask 255.255.255.0'
>
> if_down_cmd = 'ifconfig eth0:0 down'
>
> arping_path = '/usr/sbin'
>
> arping_cmd = 'arping -U $_IP_$ -w 1'
>
> clear_memqcache_on_escalation = on
>
> wd_escalation_command = ''
>
> wd_lifecheck_method = 'heartbeat'
>
> wd_interval = 5
>
> wd_heartbeat_port = 9694
>
> wd_heartbeat_keepalive = 2
>
> wd_heartbeat_deadtime = 30
>
> heartbeat_destination0 = 'tc2'
>
> heartbeat_destination_port0 = 9694
>
> heartbeat_device0 = ''
>
> wd_life_point = 3
>
> wd_lifecheck_query = 'SELECT 1'
>
> wd_lifecheck_dbname = 'template1'
>
> wd_lifecheck_user = 'replication'
>
> wd_lifecheck_password = 'replicate'
>
> relcache_expire = 0
>
> relcache_size = 256
>
> check_temp_table = on
>
> memory_cache_enabled = off
>
> memqcache_method = 'shmem'
>
> memqcache_memcached_host = 'localhost'
>
> memqcache_memcached_port = 11211
>
> memqcache_total_size = 67108864
>
> memqcache_max_num_cache = 1000000
>
> memqcache_expire = 0
>
> memqcache_auto_cache_invalidation = on
>
> memqcache_maxcache = 409600
>
> memqcache_cache_block_size = 1048576
>
> memqcache_oiddir = '/var/log/pgpool/oiddir'
>
> white_memqcache_table_list = ''
>
> black_memqcache_table_list = ''
>
>
>
>
>
> Best regards,
>
> Michal Mistina
>
>
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: smime.p7s
Type: application/pkcs7-signature
Size: 3076 bytes
Desc: not available
URL: <http://www.pgpool.net/pipermail/pgpool-general/attachments/20131007/9ae94b9d/attachment.p7s>