[pgpool-general-jp: 1439] Re: num_init_childrenを増やすとfailoverに失敗する

2016年 12月 9日 (金) 19:45:09 JST

川口です。

> 具体的に「失敗」というのはどのような状態になるのでしょうか？

具体的には、スタンバイ状態のDBがPromoteされずに
ReadOnlyのままになってしまう状態です。


> また、そのときのログを見せていただけますか？

こちらがログです。


2016-12-08 14:01:33: pid 16608: LOG:  watchdog: lifecheck started
2016-12-08 14:01:33: pid 16608: LOCATION:  wd_lifecheck.c:417
2016-12-08 14:01:38: pid 16683: LOG:  reading and processing packets
2016-12-08 14:01:38: pid 16683: DETAIL:  postmaster on DB node 1 was shutdown by administrative command
2016-12-08 14:01:38: pid 16683: LOCATION:  pool_process_query.c:4710
2016-12-08 14:01:38: pid 16683: LOG:  received degenerate backend request for node_id: 1 from pid [16683]
2016-12-08 14:01:38: pid 16683: LOCATION:  pgpool_main.c:1141
2016-12-08 14:01:38: pid 16592: LOG:  new IPC connection received
2016-12-08 14:01:38: pid 16592: LOCATION:  watchdog.c:2540
2016-12-08 14:01:38: pid 16591: LOG:  starting degeneration. shutdown host cloud-vm-db-t-a02.komuten01.local(5432)
2016-12-08 14:01:38: pid 16591: LOCATION:  pgpool_main.c:1648
2016-12-08 14:01:38: pid 16907: LOG:  reading and processing packets
2016-12-08 14:01:38: pid 16907: DETAIL:  postmaster on DB node 1 was shutdown by administrative command
2016-12-08 14:01:38: pid 16907: LOCATION:  pool_process_query.c:4710
2016-12-08 14:01:38: pid 16907: LOG:  received degenerate backend request for node_id: 1 from pid [16907]
2016-12-08 14:01:38: pid 16907: LOCATION:  pgpool_main.c:1141
2016-12-08 14:01:38: pid 16592: LOG:  new IPC connection received
2016-12-08 14:01:38: pid 16592: LOCATION:  watchdog.c:2540
2016-12-08 14:01:38: pid 16911: LOG:  reading and processing packets
2016-12-08 14:01:38: pid 16911: DETAIL:  postmaster on DB node 1 was shutdown by administrative command
2016-12-08 14:01:38: pid 16911: LOCATION:  pool_process_query.c:4710
2016-12-08 14:01:38: pid 16911: LOG:  received degenerate backend request for node_id: 1 from pid [16911]
2016-12-08 14:01:38: pid 16911: LOCATION:  pgpool_main.c:1141
2016-12-08 14:01:38: pid 16905: LOG:  reading and processing packets
2016-12-08 14:01:38: pid 16592: LOG:  new IPC connection received
2016-12-08 14:01:38: pid 16905: DETAIL:  postmaster on DB node 1 was shutdown by administrative command
2016-12-08 14:01:38: pid 16592: LOCATION:  watchdog.c:2540
2016-12-08 14:01:38: pid 16910: LOG:  reading and processing packets
2016-12-08 14:01:38: pid 16910: DETAIL:  postmaster on DB node 1 was shutdown by administrative command
2016-12-08 14:01:38: pid 16905: LOCATION:  pool_process_query.c:4710
2016-12-08 14:01:38: pid 16910: LOCATION:  pool_process_query.c:4710
2016-12-08 14:01:38: pid 16905: LOG:  received degenerate backend request for node_id: 1 from pid [16905]
2016-12-08 14:01:38: pid 16910: LOG:  received degenerate backend request for node_id: 1 from pid [16910]
2016-12-08 14:01:38: pid 16905: LOCATION:  pgpool_main.c:1141
2016-12-08 14:01:38: pid 16592: LOG:  new IPC connection received
2016-12-08 14:01:38: pid 16592: LOCATION:  watchdog.c:2540
2016-12-08 14:01:38: pid 16910: LOCATION:  pgpool_main.c:1141
2016-12-08 14:01:38: pid 16592: LOG:  new IPC connection received
2016-12-08 14:01:38: pid 16592: LOCATION:  watchdog.c:2540
2016-12-08 14:01:38: pid 16591: LOG:  Restart all children
2016-12-08 14:01:38: pid 16591: LOCATION:  pgpool_main.c:1747
2016-12-08 14:01:38: pid 16613: LOG:  child process received shutdown request signal 3
2016-12-08 14:01:38: pid 16613: LOCATION:  child.c:919

～この間すべて～
child process received shutdown request signal 3
child.c:919


2016-12-08 14:01:38: pid 16736: LOG:  child process received shutdown request signal 3
2016-12-08 14:01:38: pid 16736: LOCATION:  child.c:919
2016-12-08 14:01:38: pid 16914: ERROR:  failed to authenticate
2016-12-08 14:01:38: pid 16914: DETAIL:  invalid authentication message response type, Expecting 'R' and received 'E'
2016-12-08 14:01:38: pid 16914: LOCATION:  child.c:1375
2016-12-08 14:01:38: pid 16914: ERROR:  failed to make persistent db connection
2016-12-08 14:01:38: pid 16914: DETAIL:  connection to host:"db-server2:5432" failed
2016-12-08 14:01:38: pid 16914: LOCATION:  child.c:1193
2016-12-08 14:01:43: pid 16914: ERROR:  Failed to check replication time lag
2016-12-08 14:01:43: pid 16914: DETAIL:  No persistent db connection for the node 0
2016-12-08 14:01:43: pid 16914: HINT:  check sr_check_user and sr_check_password
2016-12-08 14:01:43: pid 16914: CONTEXT:  while checking replication time lag
2016-12-08 14:01:43: pid 16914: LOCATION:  pool_worker_child.c:285
2016-12-08 14:01:43: pid 16914: ERROR:  failed to make persistent db connection
2016-12-08 14:01:43: pid 16914: DETAIL:  connection to host:"db-server2:5432" failed
2016-12-08 14:01:43: pid 16914: LOCATION:  child.c:1193
2016-12-08 14:01:48: pid 16914: ERROR:  Failed to check replication time lag
2016-12-08 14:01:48: pid 16914: DETAIL:  No persistent db connection for the node 1
2016-12-08 14:01:48: pid 16914: HINT:  check sr_check_user and sr_check_password
2016-12-08 14:01:48: pid 16914: CONTEXT:  while checking replication time lag
2016-12-08 14:01:48: pid 16914: LOCATION:  pool_worker_child.c:285
2016-12-08 14:01:48: pid 16914: ERROR:  failed to make persistent db connection
2016-12-08 14:01:48: pid 16914: DETAIL:  connection to host:"db-server2:5432" failed
2016-12-08 14:01:48: pid 16914: LOCATION:  child.c:1193
2016-12-08 14:01:48: pid 16914: ERROR:  Failed to check replication time lag
2016-12-08 14:01:48: pid 16914: DETAIL:  No persistent db connection for the node 1
2016-12-08 14:01:48: pid 16914: HINT:  check sr_check_user and sr_check_password
2016-12-08 14:01:48: pid 16914: CONTEXT:  while checking replication time lag
2016-12-08 14:01:48: pid 16914: LOCATION:  pool_worker_child.c:285


以上です。


On 2016/12/09 7:44, Tatsuo Ishii wrote:

> 石井です。
>
> 具体的に「失敗」というのはどのような状態になるのでしょうか？
> また、そのときのログを見せていただけますか？
> --
> Tatsuo Ishii
> SRA OSS, Inc. Japan
> English: http://www.sraoss.co.jp/index_en.php
> Japanese:http://www.sraoss.co.jp
>
>> はじめまして、川口と申します。
>>
>> pgpool-IIでフェイルオーバの検証を行っています。
>>
>> PostgreSQL2台でStreaming Replicationを構成し、
>> そこへpgpool-II（こちらも2台体制）から接続しています。
>> なお、「コネクションプーリング」「負荷分散」「watchdog」「フェイルオー
>> バ」機能はすべてオンにしてあります。
>>
>> フェイルオーバは、num_init_childrenがデフォルトの32だと成功するのですが、
>> それを600に増やすと、成功したり失敗したりで安定しません。
>>
>>
>> このような場合、pgpool.conf設定の何かのタイムアウト時間を伸ばすなどの
>> 「勘所」のようなものはあるのでしょうか？
>>
>> どなたかご存知の方がいらっしゃれば、お手数ですがお教えください。
>> よろしくお願いします。
>>
>>
>> ●構成
>> OSは全てCentOS 7.2
>> pgpool-II : 3.5.4
>> PostgreSQL : 9.5.1
>>
>> マシンは全て仮想マシンで、
>> PostgreSQL用×2、pgpool用×2の計4台
>>
>>
>> ●設定ファイル(pgpool.conf)
>>
>> #------------
>> # CONNECTIONS
>> #------------
>>
>> listen_addresses = '*'
>> port = 9999
>> socket_dir = '(path)'
>> listen_backlog_multiplier = 2
>> serialize_accept = off
>> pcp_listen_addresses = '*'
>> pcp_port = 9898
>> pcp_socket_dir = '(path)'
>> backend_hostname0 = 'db-server1'
>> backend_port0 = 5432
>> backend_weight0 = 1
>> backend_data_directory0 = '(path)'
>> backend_flag0 = 'ALLOW_TO_FAILOVER'
>> backend_hostname1 = 'db-server2'
>> backend_port1 = 5432
>> backend_weight1 = 1
>> backend_data_directory1 = '(path)'
>> backend_flag1 = 'ALLOW_TO_FAILOVER'
>>
>> enable_pool_hba = on
>> pool_passwd = 'pool_passwd'
>> authentication_timeout = 60
>>
>> ssl = off
>> #ssl_key = '(path)/server.key'
>> #ssl_cert = '(path)/server.cert'
>> #ssl_ca_cert = ''
>> #ssl_ca_cert_dir = ''
>>
>> #------
>> # POOLS
>> #------
>>
>> num_init_children = 600
>> max_pool = 1
>> child_life_time = 300
>> child_max_connections = 0
>> connection_life_time = 0
>> client_idle_limit = 0
>>
>> #------
>> # LOGS
>> #------
>> log_destination = 'syslog'
>> log_line_prefix = '%t: pid %p: '
>> log_connections = on
>> log_hostname = off
>> log_statement = off
>> log_per_node_statement = off
>> log_standby_delay = 'none'
>>
>> syslog_facility = 'LOCAL0'
>> syslog_ident = 'pgpool'
>>
>> debug_level = 0
>> log_error_verbosity = verbose
>> #client_min_messages = notice
>> log_min_messages = debug5
>>
>> #---------------
>> # FILE LOCATIONS
>> #---------------
>> pid_file_name = '(path)/pgpool.pid'
>> logdir = '(path)/pgpool'
>>
>> #-------------------
>> # CONNECTION POOLING
>> #-------------------
>> connection_cache = on
>> reset_query_list = 'ABORT; DISCARD ALL'
>> #reset_query_list = 'ABORT; RESET ALL; SET SESSION AUTHORIZATION
>> #DEFAULT'
>>
>> #------------------
>> # REPLICATION MODE
>> #------------------
>> replication_mode = off
>> replicate_select = off
>> insert_lock = on
>> lobj_lock_table = ''
>>
>> replication_stop_on_mismatch = off
>> failover_if_affected_tuples_mismatch = off
>>
>> #--------------------
>> # LOAD BALANCING MODE
>> #--------------------
>> load_balance_mode = on
>> ignore_leading_white_space = on
>> white_function_list = ''
>> black_function_list = 'nextval,setval,nextval,setval'
>>
>> database_redirect_preference_list = ''
>> app_name_redirect_preference_list = ''
>> allow_sql_comments = off
>>
>> #------------------
>> # MASTER/SLAVE MODE
>> #------------------
>> master_slave_mode = on
>> master_slave_sub_mode = 'stream'
>>
>> sr_check_period = 5
>> sr_check_user = 'user1'
>> sr_check_password = 'password'
>> sr_check_database = 'database1'
>> delay_threshold = 10000
>>
>> follow_master_command = ''
>>
>> #-------------
>> # HEALTH CHECK
>> #-------------
>> health_check_period = 5
>> health_check_timeout = 20
>> health_check_user = 'user1'
>> health_check_password = 'password'
>> health_check_database = ''
>> health_check_max_retries = 0
>> health_check_retry_delay = 1
>>
>> connect_timeout = 10000
>>
>> #----------------------
>> # FAILOVER AND FAILBACK
>> #----------------------
>> failover_command = '(path)/failover.sh %H %R'
>> failback_command = ''
>> fail_over_on_backend_error = on
>> search_primary_node_timeout = 10
>>
>> #----------------
>> # ONLINE RECOVERY
>> #----------------
>>
>> recovery_user = 'user1'
>> recovery_password = 'password'
>> recovery_1st_stage_command = ''
>> recovery_2nd_stage_command = ''
>> recovery_timeout = 90
>> client_idle_limit_in_recovery = 0
>>
>> #---------
>> # WATCHDOG
>> #---------
>>
>> use_watchdog = on
>>
>> trusted_servers = ''
>> ping_path = '(path)'
>> wd_hostname = 'pgpool-server1'
>> wd_port = 9000
>> wd_priority = 1
>> wd_authkey = ''
>> wd_ipc_socket_dir = '/tmp'
>>
>>
>> delegate_IP = '172.16.XXX.XXX'
>> if_cmd_path = '(path)'
>> if_up_cmd = 'ip addr add $_IP_$/24 dev eth1 label eth1:0'
>> if_down_cmd = 'ip addr del $_IP_$/24 dev eth1'
>> arping_path = '(path)'
>> arping_cmd = 'arping -U $_IP_$ -w 1 -I eth1'
>>
>> clear_memqcache_on_escalation = on
>> wd_escalation_command = ''
>> wd_de_escalation_command = ''
>>
>>
>> wd_monitoring_interfaces_list = ''
>> wd_lifecheck_method = 'heartbeat'
>> wd_interval = 10
>>
>> wd_heartbeat_port = 9694
>> wd_heartbeat_keepalive = 2
>> wd_heartbeat_deadtime = 30
>> heartbeat_destination0 = 'pgpool-server2'
>> heartbeat_destination_port0 = 9694
>> heartbeat_device0 = ''
>>
>> wd_life_point = 3
>> wd_lifecheck_query = 'SELECT 1'
>> wd_lifecheck_dbname = 'template1'
>> wd_lifecheck_user = 'nobody'
>> wd_lifecheck_password = ''
>>
>> other_pgpool_hostname0 = 'pgpool-server2'
>> other_pgpool_port0 = 9999
>> other_wd_port0 = 9000
>>
>> #-------
>> # OTHERS
>> #-------
>> relcache_expire = 0
>> relcache_size = 256
>> check_temp_table = on
>> check_unlogged_table = on
>>
>> #-----------------------------
>> # IN MEMORY QUERY MEMORY CACHE
>> #-----------------------------
>>
>> memory_cache_enabled = off
>> memqcache_method = 'shmem'
>> memqcache_memcached_host = 'localhost'
>> memqcache_memcached_port = 11211
>> memqcache_total_size = 67108864
>> memqcache_max_num_cache = 1000000
>> memqcache_expire = 0
>> memqcache_auto_cache_invalidation = on
>> memqcache_maxcache = 409600
>> memqcache_cache_block_size = 1048576
>> memqcache_oiddir = '(path)/oiddir'
>> white_memqcache_table_list = ''
>> black_memqcache_table_list = ''
>>
>>
>> 以上です。
>>
>>
>> _______________________________________________
>> pgpool-general-jp mailing list
>> pgpool-general-jp ＠ sraoss.jp
>> http://www.sraoss.jp/mailman/listinfo/pgpool-general-jp