~andrewjbeach/juju-ci-tools/make-local-patcher

« back to all changes in this revision

Viewing changes to assess_recovery.py

  • Committer: Aaron Bentley
  • Date: 2014-02-24 17:18:29 UTC
  • mto: This revision was merged to the branch mainline in revision 252.
  • Revision ID: aaron.bentley@canonical.com-20140224171829-sz644yhoygu7m9dm
Use tags to identify and shut down instances.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
#!/usr/bin/env python
2
 
# Backup and restore a stack.
3
 
 
4
 
from __future__ import print_function
5
 
 
6
 
from argparse import ArgumentParser
7
 
from contextlib import contextmanager
8
 
import logging
9
 
import re
10
 
from subprocess import CalledProcessError
11
 
import sys
12
 
 
13
 
from deploy_stack import (
14
 
    BootstrapManager,
15
 
    deploy_dummy_stack,
16
 
    get_token_from_status,
17
 
    wait_for_state_server_to_shutdown,
18
 
)
19
 
from jujupy import (
20
 
    parse_new_state_server_from_error,
21
 
)
22
 
from substrate import (
23
 
    convert_to_azure_ids,
24
 
    terminate_instances,
25
 
)
26
 
from utility import (
27
 
    add_basic_testing_arguments,
28
 
    configure_logging,
29
 
    JujuAssertionError,
30
 
    LoggedException,
31
 
    until_timeout,
32
 
)
33
 
 
34
 
 
35
 
# Make every class defined in this module new-style under Python 2.
__metaclass__ = type


# Captures the double-quoted text inside a '["..."]' fragment.  A raw string
# is used so the backslashes reach the regex engine without relying on
# Python passing unknown string escapes through unchanged.
running_instance_pattern = re.compile(r'\["([^"]+)"\]')


# Logger shared by the helpers in this script.
log = logging.getLogger("assess_recovery")
42
 
 
43
 
 
44
 
def check_token(client, token, timeout=300):
    """Poll the status until the expected token is reported.

    :param client: Client handed to get_token_from_status on every poll.
    :param token: Text that must be contained in the reported token.
    :param timeout: Value passed to until_timeout to bound the polling.
        Defaults to the previously hard-coded 300.
    :return: The first reported token that contains `token`.
    :raises JujuAssertionError: If the polling loop ends without a match.
    """
    # Pre-bind so the failure message can still be built if the loop never
    # runs a single poll.
    found = None
    for _ in until_timeout(timeout):
        found = get_token_from_status(client)
        if found and token in found:
            return found
    raise JujuAssertionError('Token is not {}: {}'.format(token, found))
51
 
 
52
 
 
53
 
def deploy_stack(client, charm_series):
    """Deploy a simple stack, state-server and ubuntu.

    After deploy_dummy_stack returns, the 'dummy-source' token is set to
    'One', the client waits for the workloads, and check_token confirms the
    token is reported back.

    :param client: Client used for the deployment and the checks.
    :param charm_series: Passed through to deploy_dummy_stack.
    """
    deploy_dummy_stack(client, charm_series)
    client.set_config('dummy-source', {'token': 'One'})
    client.wait_for_workloads()
    check_token(client, 'One')
    log.info("%s is ready to testing", client.env.environment)
60
 
 
61
 
 
62
 
def show_controller(client):
    """Log the YAML controller description reported by the client."""
    log.info('Controller is:\n{}'.format(
        client.show_controller(format='yaml')))
65
 
 
66
 
 
67
 
def restore_present_state_server(controller_client, backup_file):
    """juju-restore won't restore when the state-server is still present.

    A CalledProcessError from restore_backup is the expected outcome and is
    only logged.

    :param controller_client: Client whose restore_backup is attempted.
    :param backup_file: Backup passed to restore_backup.
    :raises Exception: If restore_backup completes without raising
        CalledProcessError.
    """
    try:
        controller_client.restore_backup(backup_file)
    except CalledProcessError:
        log.info(
            "juju-restore correctly refused to restore "
            "because the state-server was still up.")
        return
    # Reaching this point means the restore was not refused.
    raise Exception(
        "juju-restore restored to an operational state-server")
79
 
 
80
 
 
81
 
def delete_controller_members(client, leader_only=False):
    """Delete controller members.

    All members are deleted by default. The followers are deleted before the
    leader to simulate a total controller failure. When leader_only is true,
    only the leader is deleted to trigger a new leader election.

    :param client: Client used to look up the controller machines; its env
        is passed to terminate_instances.
    :param leader_only: When true, delete just the controller leader.
    :return: List of the deleted machine ids, in deletion order.
    """
    if leader_only:
        members = [client.get_controller_leader()]
    else:
        # Reverse a private copy so the sequence handed out by the client
        # is left untouched.
        members = list(client.get_controller_members())
        members.reverse()
    deleted_machines = []
    for machine in members:
        instance_id = machine.info.get('instance-id')
        if client.env.config['type'] == 'azure':
            # On azure the id is run through convert_to_azure_ids before
            # it is used for termination.
            instance_id = convert_to_azure_ids(client, [instance_id])[0]
        host = machine.info.get('dns-name')
        log.info("Instrumenting node failure for member {}: {} at {}".format(
                 machine.machine_id, instance_id, host))
        terminate_instances(client.env, [instance_id])
        wait_for_state_server_to_shutdown(
            host, client, instance_id, timeout=120)
        deleted_machines.append(machine.machine_id)
    return deleted_machines
107
 
 
108
 
 
109
 
def restore_missing_state_server(client, controller_client, backup_file,
                                 check_controller=True):
    """juju-restore creates a replacement state-server for the services.

    The backup is restored through controller_client.  Afterwards the
    'dummy-source' token is set to 'Two' and the client waits until it is
    started, the workloads are ready and the token is reported back.

    :param client: Client used for the post-restore checks.
    :param controller_client: Client on which restore_backup is called.
    :param backup_file: Backup passed to restore_backup.
    :param check_controller: When true, wait for the controller client to
        be started before the other checks.
    :raises LoggedException: If restore_backup raises CalledProcessError;
        the error is logged first.
    """
    log.info("Starting restore.")
    try:
        controller_client.restore_backup(backup_file)
    except CalledProcessError as error:
        log.info('Call of juju restore exited with an error\n')
        log.info('Call:  %r\n', error.cmd)
        log.exception(error)
        raise LoggedException(error)

    if check_controller:
        controller_client.wait_for_started(600)
    show_controller(client)

    client.set_config('dummy-source', {'token': 'Two'})
    client.wait_for_started()
    client.wait_for_workloads()
    check_token(client, 'Two')

    log.info("%s restored", client.env.environment)
    log.info("PASS")
129
 
 
130
 
 
131
 
def parse_args(argv=None):
    """Parse the command line of the recovery test.

    The three strategy flags all store into `strategy`; when none is given
    the value is 'backup'.

    :param argv: Arguments handed to ArgumentParser.parse_args.
    :return: The parsed argument namespace.
    """
    parser = ArgumentParser(description='Test recovery strategies.')
    add_basic_testing_arguments(parser)
    parser.add_argument(
        '--charm-series', help='Charm series.', default='')

    # Flag name plus the keyword arguments that differ between the flags.
    # Only --ha carries the default for the shared destination.
    strategy_flags = (
        ('--ha', dict(const='ha', default='backup', help="Test HA.")),
        ('--backup', dict(const='backup', help="Test backup/restore.")),
        ('--ha-backup', dict(const='ha-backup',
                             help="Test backup/restore of HA.")),
    )
    strategy = parser.add_argument_group('test strategy')
    for flag, extra in strategy_flags:
        strategy.add_argument(
            flag, action='store_const', dest='strategy', **extra)
    return parser.parse_args(argv)
147
 
 
148
 
 
149
 
@contextmanager
def detect_bootstrap_machine(bs_manager):
    """Record the host parsed from a failure as known host '0'.

    The managed body runs unchanged.  When it raises an Exception, the
    result of parse_new_state_server_from_error is stored in
    bs_manager.known_hosts['0'] and the exception is re-raised.
    """
    try:
        yield
    except Exception as error:
        new_host = parse_new_state_server_from_error(error)
        bs_manager.known_hosts['0'] = new_host
        raise
156
 
 
157
 
 
158
 
def assess_recovery(bs_manager, strategy, charm_series):
    """Run one controller recovery strategy from setup to verification.

    :param bs_manager: Provides the client under test and the known_hosts
        mapping, from which the deleted machines are removed.
    :param strategy: One of 'ha', 'backup' or 'ha-backup'.
    :param charm_series: Passed through to deploy_stack.
    :raises ValueError: If strategy is not one of the supported values.
    """
    if strategy not in ('ha', 'backup', 'ha-backup'):
        # Fail before anything is deployed or deleted.  Without this check
        # an unknown strategy would delete every controller member and then
        # crash on the unbound backup_file.
        raise ValueError('Unknown strategy: {!r}'.format(strategy))
    log.info("Setting up test.")
    client = bs_manager.client
    deploy_stack(client, charm_series)
    client.set_config('dummy-source', {'token': ''})
    log.info("Setup complete.")
    log.info("Test started.")
    controller_client = client.get_controller_client()
    if strategy in ('ha', 'ha-backup'):
        controller_client.enable_ha()
        controller_client.wait_for_ha()
    if strategy in ('ha-backup', 'backup'):
        backup_file = controller_client.backup()
        # A restore attempted while the controller is still up must fail.
        restore_present_state_server(controller_client, backup_file)
    # Plain HA only loses its leader; the backup strategies lose every
    # controller member.
    leader_only = strategy == 'ha'
    deleted_machine_ids = delete_controller_members(
        controller_client, leader_only=leader_only)
    log.info("Deleted {}".format(deleted_machine_ids))
    for m_id in deleted_machine_ids:
        if bs_manager.known_hosts.get(m_id):
            del bs_manager.known_hosts[m_id]
    if strategy == 'ha':
        client.get_status(600)
        log.info("HA recovered from leader failure.")
        log.info("PASS")
    else:
        check_controller = strategy != 'ha-backup'
        restore_missing_state_server(client, controller_client, backup_file,
                                     check_controller=check_controller)
    log.info("Test complete.")
191
 
 
192
 
 
193
 
def main(argv):
    """Parse argv, boot the environment and run the recovery assessment.

    :param argv: Command-line arguments, without the program name.
    """
    args = parse_args(argv)
    configure_logging(args.verbose)
    bs_manager = BootstrapManager.from_args(args)
    booted = bs_manager.booted_context(upload_tools=args.upload_tools)
    # detect_bootstrap_machine is entered inside the booted context.
    with booted, detect_bootstrap_machine(bs_manager):
        assess_recovery(bs_manager, args.strategy, args.charm_series)
200
 
 
201
 
 
202
 
# Run the assessment only when executed as a script, not on import.
if __name__ == '__main__':
    cli_arguments = sys.argv[1:]
    main(cli_arguments)