78
78
add(client_state, 'state')
80
for relid in relation_ids(relation_types=['db', 'db-admin']):
81
relation_set(client_state, relid)
80
for relid in hookenv.relation_ids('db'):
81
hookenv.relation_set(relid, client_state)
83
for relid in hookenv.relation_ids('db-admin'):
84
hookenv.relation_set(relid, client_state)
83
86
replication_state = dict(client_state)
1575
1578
'''Connect the database as a streaming replica of the master.'''
1576
1579
master_relation = hookenv.relation_get(unit=master)
1578
recovery_conf = dedent("""\
1580
primary_conninfo = 'host={} user=juju_replication'
1581
""".format(master_relation['private-address']))
1582
log(recovery_conf, DEBUG)
1581
recovery_conf = Template(
1582
open("templates/recovery.conf.tmpl").read()).render({
1583
'host': master_relation['private-address'],
1584
'password': local_state['replication_password']})
1585
juju_log(MSG_DEBUG, recovery_conf)
1585
1588
os.path.join(postgresql_cluster_dir, 'recovery.conf'),
1586
1589
owner="postgres", group="postgres")
1587
1591
postgresql_restart()
1660
1664
# Now that pg_hba.conf has been regenerated and loaded, inform related
1661
1665
# units that they have been granted replication access.
1662
1666
authorized_units = set()
1663
for relid in relation_ids(relation_types=replication_relation_types):
1664
for unit in relation_list(relid):
1665
authorized_units.add(unit)
1667
for unit in hookenv.related_units():
1668
authorized_units.add(unit)
1666
1669
local_state['authorized'] = authorized_units
1668
1671
master = elected_master()
1812
1819
assert remote_unit is not None
1814
log("{} {} has left the peer group".format(remote_state, remote_unit))
1816
# If the unit being removed was our master, we need to failover.
1817
if local_state.get('following', None) == remote_unit:
1819
# Prepare for failover. We need to suspend replication to ensure
1820
# that the replay point remains consistent throughout the
1821
# election, and publish that replay point. By comparing these
1822
# replay points, the most up to date hot standby can be
1823
# identified and promoted to the new master.
1821
log("{} has left the peer group".format(remote_unit))
1823
# If we are the last unit standing, we become standalone
1824
remaining_peers = set(hookenv.related_units(hookenv.relation_id()))
1825
remaining_peers.discard(remote_unit) # Bug #1192433
1827
# True if we were following the departed unit.
1828
following_departed = (local_state.get('following', None) == remote_unit)
1830
if remaining_peers and not following_departed:
1831
log("Remaining {}".format(local_state['state']))
1833
elif remaining_peers and following_departed:
1834
# If the unit being removed was our master, prepare for failover.
1835
# We need to suspend replication to ensure that the replay point
1836
# remains consistent throughout the election, and publish that
1837
# replay point. Once all units have entered this steady state,
1838
# we can identify the most up to date hot standby and promote it
1839
# to be the new master.
1840
log("Entering failover state")
1824
1841
cur = db_cursor(autocommit=True)
1826
"SELECT pg_is_xlog_replay_paused()")
1842
cur.execute("SELECT pg_is_xlog_replay_paused()")
1827
1843
already_paused = cur.fetchone()[0]
1828
1844
local_state["paused_at_failover"] = already_paused
1829
1845
if not already_paused:
1830
1846
cur.execute("SELECT pg_xlog_replay_pause()")
1847
# Switch to failover state. Don't cleanup the 'following'
1848
# setting because having access to the former master is still
1831
1850
local_state['state'] = 'failover'
1832
1851
local_state['wal_received_offset'] = postgresql_wal_received_offset()
1834
# Now do nothing. We can't elect a new master until all the
1835
# remaining peers are in a steady state and have published their
1836
# wal_received_offset. Only then can we select a node to be
1854
log("Last unit standing. Switching from {} to standalone.".format(
1855
local_state['state']))
1857
local_state['state'] = 'standalone'
1858
if 'following' in local_state:
1859
del local_state['following']
1860
if 'wal_received_offset' in local_state:
1861
del local_state['wal_received_offset']
1862
if 'paused_at_failover' in local_state:
1863
del local_state['paused_at_failover']
1840
1865
config_changed(postgresql_config)
1841
1866
local_state.publish()
1844
1869
def replication_relation_broken():
1870
# This unit has been removed from the service.
1845
1871
promote_database()
1846
local_state['state'] = 'standalone'
1848
1872
if os.path.exists(charm_pgpass):
1849
1873
os.unlink(charm_pgpass)
1850
1874
config_changed(postgresql_config)