~stub/charms/precise/postgresql/bug-1205286

Changes to hooks/hooks.py

Failover test passing

@@ -77,8 +77,11 @@
         client_state = {}
         add(client_state, 'state')
 
-        for relid in relation_ids(relation_types=['db', 'db-admin']):
-            relation_set(client_state, relid)
+        for relid in hookenv.relation_ids('db'):
+            hookenv.relation_set(relid, client_state)
+
+        for relid in hookenv.relation_ids('db-admin'):
+            hookenv.relation_set(relid, client_state)
 
         replication_state = dict(client_state)
 
@@ -92,8 +95,8 @@
         if authorized:
             replication_state['authorized'] = ' '.join(sorted(authorized))
 
-        for relid in relation_ids(relation_types=replication_relation_types):
-            relation_set(replication_state, relid)
+        for relid in hookenv.relation_ids('replication'):
+            hookenv.relation_set(relid, replication_state)
 
         self.save()
 
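The two hunks above replace the charm's local relation_ids()/relation_set() helpers with the charmhelpers.core.hookenv API, publishing the same settings dict on every relation of each named type. A minimal sketch of how those hookenv calls fit together; the helper function and its name are illustrative, not part of the branch:

    from charmhelpers.core import hookenv

    def publish_to_relations(state, relation_names):
        """Push the same settings dict onto every relation of each named type."""
        for name in relation_names:
            for relid in hookenv.relation_ids(name):
                # relation_set() accepts the relation id and a dict of settings.
                hookenv.relation_set(relid, state)

    # Usage matching the hook above:
    # publish_to_relations(client_state, ['db', 'db-admin'])
    # publish_to_relations(replication_state, ['replication'])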
@@ -1575,15 +1578,16 @@
     '''Connect the database as a streaming replica of the master.'''
     master_relation = hookenv.relation_get(unit=master)
 
-    recovery_conf = dedent("""\
-        standby_mode = on
-        primary_conninfo = 'host={} user=juju_replication'
-        """.format(master_relation['private-address']))
-    log(recovery_conf, DEBUG)
+    recovery_conf = Template(
+        open("templates/recovery.conf.tmpl").read()).render({
+            'host': master_relation['private-address'],
+            'password': local_state['replication_password']})
+    juju_log(MSG_DEBUG, recovery_conf)
     install_file(
         recovery_conf,
         os.path.join(postgresql_cluster_dir, 'recovery.conf'),
         owner="postgres", group="postgres")
+
     postgresql_restart()
 
 
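Here the hand-built, dedent()-ed recovery.conf gives way to a rendered template that also carries the replication password. The hunk does not show the Template import or the template file itself; the sketch below assumes Template is jinja2.Template and shows one plausible shape for templates/recovery.conf.tmpl:

    # Assumed template contents (not shown in this diff):
    #
    #   standby_mode = on
    #   primary_conninfo = 'host={{ host }} user=juju_replication password={{ password }}'
    #
    from jinja2 import Template

    def render_recovery_conf(host, password,
                             template_path="templates/recovery.conf.tmpl"):
        # Render the standby configuration from the template file.
        with open(template_path) as f:
            return Template(f.read()).render({'host': host, 'password': password})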
@@ -1660,9 +1664,8 @@
     # Now that pg_hba.conf has been regenerated and loaded, inform related
     # units that they have been granted replication access.
     authorized_units = set()
-    for relid in relation_ids(relation_types=replication_relation_types):
-        for unit in relation_list(relid):
-            authorized_units.add(unit)
+    for unit in hookenv.related_units():
+        authorized_units.add(unit)
     local_state['authorized'] = authorized_units
 
     master = elected_master()
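The authorized-unit bookkeeping now asks hookenv.related_units() for the peers on the relation the running hook fired on, rather than walking every replication relation type. A small illustrative sketch, reusing the space-separated 'authorized' format published earlier in this diff:

    from charmhelpers.core import hookenv

    def authorized_units_setting():
        # related_units() defaults to the relation the running hook fires on.
        units = set(hookenv.related_units())
        return ' '.join(sorted(units))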
@@ -1681,6 +1684,10 @@
             promote_database()
             if 'following' in local_state:
                 del local_state['following']
+            if 'wal_received_offset' in local_state:
+                del local_state['wal_received_offset']
+            if 'paused_at_failover' in local_state:
+                del local_state['paused_at_failover']
             local_state['state'] = 'master'
 
             # Publish credentials to hot standbys so they can connect.
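On promotion to master the unit now also discards its failover bookkeeping ('wal_received_offset' and 'paused_at_failover') along with 'following'. The same cleanup can be written as below; the helper name is hypothetical and assumes local_state behaves like a plain dict:

    FAILOVER_KEYS = ('following', 'wal_received_offset', 'paused_at_failover')

    def clear_failover_state(state, keys=FAILOVER_KEYS):
        # Equivalent to the guarded del statements above.
        for key in keys:
            state.pop(key, None)
        state['state'] = 'master'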
@@ -1811,40 +1818,57 @@
 
     assert remote_unit is not None
 
-    log("{} {} has left the peer group".format(remote_state, remote_unit))
-
-    # If the unit being removed was our master, we need to failover.
-    if local_state.get('following', None) == remote_unit:
-
-        # Prepare for failover. We need to suspend replication to ensure
-        # that the replay point remains consistent throughout the
-        # election, and publish that replay point. By comparing these
-        # replay points, the most up to date hot standby can be
-        # identified and promoted to the new master.
+    log("{} has left the peer group".format(remote_unit))
+
+    # If we are the last unit standing, we become standalone
+    remaining_peers = set(hookenv.related_units(hookenv.relation_id()))
+    remaining_peers.discard(remote_unit)  # Bug #1192433
+
+    # True if we were following the departed unit.
+    following_departed = (local_state.get('following', None) == remote_unit)
+
+    if remaining_peers and not following_departed:
+        log("Remaining {}".format(local_state['state']))
+
+    elif remaining_peers and following_departed:
+        # If the unit being removed was our master, prepare for failover.
+        # We need to suspend replication to ensure that the replay point
+        # remains consistent throughout the election, and publish that
+        # replay point. Once all units have entered this steady state,
+        # we can identify the most up to date hot standby and promote it
+        # to be the new master.
+        log("Entering failover state")
         cur = db_cursor(autocommit=True)
-        cur.execute(
-            "SELECT pg_is_xlog_replay_paused()")
+        cur.execute("SELECT pg_is_xlog_replay_paused()")
         already_paused = cur.fetchone()[0]
         local_state["paused_at_failover"] = already_paused
         if not already_paused:
             cur.execute("SELECT pg_xlog_replay_pause()")
+        # Switch to failover state. Don't cleanup the 'following'
+        # setting because having access to the former master is still
+        # useful.
         local_state['state'] = 'failover'
         local_state['wal_received_offset'] = postgresql_wal_received_offset()
 
-        # Now do nothing. We can't elect a new master until all the
-        # remaining peers are in a steady state and have published their
-        # wal_received_offset. Only then can we select a node to be
-        # master.
-        pass
+    else:
+        log("Last unit standing. Switching from {} to standalone.".format(
+            local_state['state']))
+        promote_database()
+        local_state['state'] = 'standalone'
+        if 'following' in local_state:
+            del local_state['following']
+        if 'wal_received_offset' in local_state:
+            del local_state['wal_received_offset']
+        if 'paused_at_failover' in local_state:
+            del local_state['paused_at_failover']
 
     config_changed(postgresql_config)
     local_state.publish()
 
 
 def replication_relation_broken():
+    # This unit has been removed from the service.
     promote_database()
-    local_state['state'] = 'standalone'
-    local_state.save()
     if os.path.exists(charm_pgpass):
         os.unlink(charm_pgpass)
     config_changed(postgresql_config)
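The failover branch above freezes WAL replay so each standby's replay point stays fixed while the election runs, records whether replay was already paused, and publishes its receive offset so the most up-to-date standby can be promoted. A standalone sketch of that freeze-and-report step, assuming a psycopg2 connection to the local cluster and that postgresql_wal_received_offset() reports something like pg_last_xlog_receive_location() (the pre-9.6 function names this precise-era charm targets); the DSN is illustrative:

    import psycopg2

    def pause_replay_and_report(dsn="dbname=postgres user=postgres"):
        con = psycopg2.connect(dsn)
        con.autocommit = True
        cur = con.cursor()
        # Remember whether replay was already paused so failover can be undone
        # without unpausing a deliberately paused standby.
        cur.execute("SELECT pg_is_xlog_replay_paused()")
        already_paused = cur.fetchone()[0]
        if not already_paused:
            cur.execute("SELECT pg_xlog_replay_pause()")
        # The offset each standby publishes; the furthest-ahead unit wins
        # the election for new master.
        cur.execute("SELECT pg_last_xlog_receive_location()")
        return already_paused, cur.fetchone()[0]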
@@ -2123,9 +2147,6 @@
     elif hook_name == "nrpe-external-master-relation-changed":
         update_nrpe_checks()
 
-    elif hook_name.startswith('master') or hook_name.startswith('slave'):
-        raise NotImplementedError(hook_name)
-
     elif hook_name == 'replication-relation-joined':
         replication_relation_joined_changed()
 
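The dispatcher drops the never-implemented master/slave branches and routes the replication-relation-joined hook to replication_relation_joined_changed(). For illustration only, the same if/elif routing expressed as a lookup table; the handlers here are stand-in stubs, not the charm's functions:

    def update_nrpe_checks():
        pass

    def replication_relation_joined_changed():
        pass

    HOOK_HANDLERS = {
        'nrpe-external-master-relation-changed': update_nrpe_checks,
        'replication-relation-joined': replication_relation_joined_changed,
    }

    def dispatch(hook_name):
        handler = HOOK_HANDLERS.get(hook_name)
        if handler is None:
            raise SystemExit("Unknown hook: {}".format(hook_name))
        handler()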