~andrewjbeach/juju-ci-tools/make-local-patcher

« back to all changes in this revision

Viewing changes to assess_recovery.py

  • Committer: Curtis Hovey
  • Date: 2015-06-16 20:18:37 UTC
  • mto: This revision was merged to the branch mainline in revision 995.
  • Revision ID: curtis@canonical.com-20150616201837-l44eyp22o501g6ee
Ensure the env name is in the config.

Show diffs side-by-side

added added

removed removed

Lines of Context:
3
3
 
4
4
from __future__ import print_function
5
5
 
 
6
__metaclass__ = type
 
7
 
6
8
from argparse import ArgumentParser
7
 
from contextlib import contextmanager
8
9
import logging
 
10
import os
9
11
import re
10
 
from subprocess import CalledProcessError
 
12
import subprocess
11
13
import sys
12
14
 
13
15
from deploy_stack import (
14
 
    BootstrapManager,
15
 
    deploy_dummy_stack,
16
 
    get_token_from_status,
 
16
    dump_env_logs,
 
17
    get_machine_dns_name,
17
18
    wait_for_state_server_to_shutdown,
18
19
)
 
20
from jujuconfig import (
 
21
    get_jenv_path,
 
22
    get_juju_home,
 
23
)
19
24
from jujupy import (
 
25
    temp_bootstrap_env,
 
26
    until_timeout,
 
27
    make_client,
20
28
    parse_new_state_server_from_error,
21
29
)
22
30
from substrate import (
23
 
    convert_to_azure_ids,
24
31
    terminate_instances,
25
32
)
26
33
from utility import (
27
 
    add_basic_testing_arguments,
28
 
    configure_logging,
29
 
    JujuAssertionError,
30
 
    LoggedException,
31
 
    until_timeout,
 
34
    ensure_deleted,
 
35
    print_now,
32
36
)
33
37
 
34
38
 
35
 
__metaclass__ = type
36
 
 
37
 
 
38
39
running_instance_pattern = re.compile('\["([^"]+)"\]')
39
40
 
40
41
 
41
 
log = logging.getLogger("assess_recovery")
42
 
 
43
 
 
44
 
def check_token(client, token):
45
 
    for ignored in until_timeout(300):
46
 
        found = get_token_from_status(client)
47
 
        if found and token in found:
48
 
            return found
49
 
    raise JujuAssertionError('Token is not {}: {}'.format(
50
 
                             token, found))
51
 
 
52
 
 
53
 
def deploy_stack(client, charm_series):
 
42
def setup_juju_path(juju_path):
 
43
    """Ensure the binaries and scripts under test are found first."""
 
44
    full_path = os.path.abspath(juju_path)
 
45
    if not os.path.isdir(full_path):
 
46
        raise ValueError("The juju_path does not exist: %s" % full_path)
 
47
    os.environ['PATH'] = '%s:%s' % (full_path, os.environ['PATH'])
 
48
    sys.path.insert(0, full_path)
 
49
 
 
50
 
 
51
def deploy_stack(client, charm_prefix):
54
52
    """"Deploy a simple stack, state-server and ubuntu."""
55
 
    deploy_dummy_stack(client, charm_series)
56
 
    client.set_config('dummy-source', {'token': 'One'})
57
 
    client.wait_for_workloads()
58
 
    check_token(client, 'One')
59
 
    log.info("%s is ready to testing", client.env.environment)
60
 
 
61
 
 
62
 
def show_controller(client):
63
 
    controller_info = client.show_controller(format='yaml')
64
 
    log.info('Controller is:\n{}'.format(controller_info))
65
 
 
66
 
 
67
 
def restore_present_state_server(controller_client, backup_file):
 
53
    if charm_prefix and not charm_prefix.endswith('/'):
 
54
        charm_prefix = charm_prefix + '/'
 
55
    agent_version = client.get_matching_agent_version()
 
56
    instance_id = client.get_status().status['machines']['0']['instance-id']
 
57
    for ignored in until_timeout(30):
 
58
        agent_versions = client.get_status().get_agent_versions()
 
59
        if 'unknown' not in agent_versions and len(agent_versions) == 1:
 
60
            break
 
61
    if agent_versions.keys() != [agent_version]:
 
62
        print_now("Current versions: %s" % ', '.join(agent_versions.keys()))
 
63
        client.juju('upgrade-juju', ('--version', agent_version))
 
64
    client.wait_for_version(client.get_matching_agent_version())
 
65
    client.juju('deploy', (charm_prefix + 'ubuntu',))
 
66
    client.wait_for_started().status
 
67
    print_now("%s is ready to testing" % client.env.environment)
 
68
    return instance_id
 
69
 
 
70
 
 
71
def restore_present_state_server(client, backup_file):
68
72
    """juju-restore won't restore when the state-server is still present."""
69
 
    try:
70
 
        controller_client.restore_backup(backup_file)
71
 
    except CalledProcessError:
72
 
        log.info(
 
73
    environ = dict(os.environ)
 
74
    proc = subprocess.Popen(
 
75
        ['juju', '--show-log', 'restore', '-e', client.env.environment,
 
76
         backup_file],
 
77
        env=environ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
78
    output, err = proc.communicate()
 
79
    if proc.returncode == 0:
 
80
        raise Exception(
 
81
            "juju-restore restored to an operational state-server: %s" % err)
 
82
    else:
 
83
        print_now(
73
84
            "juju-restore correctly refused to restore "
74
85
            "because the state-server was still up.")
75
 
        return
76
 
    else:
77
 
        raise Exception(
78
 
            "juju-restore restored to an operational state-serve")
79
 
 
80
 
 
81
 
def delete_controller_members(client, leader_only=False):
82
 
    """Delete controller members.
83
 
 
84
 
    The all members are delete by default. The followers are deleted before the
85
 
    leader to simulates a total controller failure. When leader_only is true,
86
 
    the leader is deleted to trigger a new leader election.
87
 
    """
88
 
    if leader_only:
89
 
        leader = client.get_controller_leader()
90
 
        members = [leader]
91
 
    else:
92
 
        members = client.get_controller_members()
93
 
        members.reverse()
94
 
    deleted_machines = []
95
 
    for machine in members:
96
 
        instance_id = machine.info.get('instance-id')
97
 
        if client.env.config['type'] == 'azure':
98
 
            instance_id = convert_to_azure_ids(client, [instance_id])[0]
99
 
        host = machine.info.get('dns-name')
100
 
        log.info("Instrumenting node failure for member {}: {} at {}".format(
101
 
                 machine.machine_id, instance_id, host))
102
 
        terminate_instances(client.env, [instance_id])
103
 
        wait_for_state_server_to_shutdown(
104
 
            host, client, instance_id, timeout=120)
105
 
        deleted_machines.append(machine.machine_id)
106
 
    return deleted_machines
107
 
 
108
 
 
109
 
def restore_missing_state_server(client, controller_client, backup_file,
110
 
                                 check_controller=True):
 
86
        match = running_instance_pattern.search(err)
 
87
        if match is None:
 
88
            print_now("WARNING: Could not find the instance_id in output:")
 
89
            print_now(err)
 
90
            print_now("")
 
91
            return None
 
92
        instance_id = match.group(1)
 
93
    return instance_id
 
94
 
 
95
 
 
96
def delete_instance(client, instance_id):
 
97
    """Delete the instance using the providers tools."""
 
98
    print_now("Instrumenting a bootstrap node failure.")
 
99
    return terminate_instances(client.env, [instance_id])
 
100
 
 
101
 
 
102
def delete_extra_state_servers(client, instance_id):
 
103
    """Delete the extra state-server instances."""
 
104
    status = client.get_status()
 
105
    for machine, info in status.iter_machines():
 
106
        extra_instance_id = info.get('instance-id')
 
107
        status = info.get('state-server-member-status')
 
108
        if extra_instance_id != instance_id and status is not None:
 
109
            print_now("Deleting state-server-member {}".format(machine))
 
110
            host = get_machine_dns_name(client, machine)
 
111
            delete_instance(client, extra_instance_id)
 
112
            wait_for_state_server_to_shutdown(host, client, extra_instance_id)
 
113
 
 
114
 
 
115
def restore_missing_state_server(client, backup_file):
111
116
    """juju-restore creates a replacement state-server for the services."""
112
 
    log.info("Starting restore.")
113
 
    try:
114
 
        controller_client.restore_backup(backup_file)
115
 
    except CalledProcessError as e:
116
 
        log.info('Call of juju restore exited with an error\n')
117
 
        log.info('Call:  %r\n', e.cmd)
118
 
        log.exception(e)
119
 
        raise LoggedException(e)
120
 
    if check_controller:
121
 
        controller_client.wait_for_started(600)
122
 
    show_controller(client)
123
 
    client.set_config('dummy-source', {'token': 'Two'})
124
 
    client.wait_for_started()
125
 
    client.wait_for_workloads()
126
 
    check_token(client, 'Two')
127
 
    log.info("%s restored", client.env.environment)
128
 
    log.info("PASS")
 
117
    environ = dict(os.environ)
 
118
    print_now("Starting restore.")
 
119
    proc = subprocess.Popen(
 
120
        ['juju', '--show-log', 'restore', '-e', client.env.environment,
 
121
         '--constraints', 'mem=2G', backup_file],
 
122
        env=environ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
123
    output, err = proc.communicate()
 
124
    if proc.returncode != 0:
 
125
        print_now('Call of juju restore exited with an error\n')
 
126
        message = 'Restore failed: \n%s' % err
 
127
        print_now(message)
 
128
        print_now('\n')
 
129
        raise Exception(message)
 
130
    print_now(output)
 
131
    client.wait_for_started(600).status
 
132
    print_now("%s restored" % client.env.environment)
 
133
    print_now("PASS")
129
134
 
130
135
 
131
136
def parse_args(argv=None):
132
 
    parser = ArgumentParser(description='Test recovery strategies.')
133
 
    add_basic_testing_arguments(parser)
134
 
    parser.add_argument(
135
 
        '--charm-series', help='Charm series.', default='')
 
137
    parser = ArgumentParser('Test recovery strategies.')
 
138
    parser.add_argument(
 
139
        '--charm-prefix', help='A prefix for charm urls.', default='')
 
140
    parser.add_argument(
 
141
        '--debug', action='store_true', default=False,
 
142
        help='Use --debug juju logging.')
136
143
    strategy = parser.add_argument_group('test strategy')
137
144
    strategy.add_argument(
138
145
        '--ha', action='store_const', dest='strategy', const='ha',
143
150
    strategy.add_argument(
144
151
        '--ha-backup', action='store_const', dest='strategy',
145
152
        const='ha-backup', help="Test backup/restore of HA.")
 
153
    parser.add_argument('juju_path')
 
154
    parser.add_argument('env_name')
 
155
    parser.add_argument('logs', help='Directory to store logs in.')
 
156
    parser.add_argument(
 
157
        'temp_env_name', nargs='?',
 
158
        help='Temporary environment name to use for this test.')
146
159
    return parser.parse_args(argv)
147
160
 
148
161
 
149
 
@contextmanager
150
 
def detect_bootstrap_machine(bs_manager):
 
162
def main():
 
163
    args = parse_args()
 
164
    log_dir = args.logs
151
165
    try:
152
 
        yield
 
166
        setup_juju_path(args.juju_path)
 
167
        client = make_client(args.juju_path, args.debug, args.env_name,
 
168
                             args.temp_env_name)
 
169
        juju_home = get_juju_home()
 
170
        ensure_deleted(get_jenv_path(juju_home, client.env.environment))
 
171
        with temp_bootstrap_env(juju_home, client):
 
172
            client.bootstrap()
 
173
        bootstrap_host = get_machine_dns_name(client, 0)
 
174
        try:
 
175
            instance_id = deploy_stack(client, args.charm_prefix)
 
176
            if args.strategy in ('ha', 'ha-backup'):
 
177
                client.juju('ensure-availability', ('-n', '3'))
 
178
                client.wait_for_ha()
 
179
            if args.strategy in ('ha-backup', 'backup'):
 
180
                backup_file = client.backup()
 
181
                restore_present_state_server(client, backup_file)
 
182
            if args.strategy == 'ha-backup':
 
183
                delete_extra_state_servers(client, instance_id)
 
184
            delete_instance(client, instance_id)
 
185
            wait_for_state_server_to_shutdown(bootstrap_host, client,
 
186
                                              instance_id)
 
187
            bootstrap_host = None
 
188
            if args.strategy == 'ha':
 
189
                client.get_status(600)
 
190
            else:
 
191
                restore_missing_state_server(client, backup_file)
 
192
        except Exception as e:
 
193
            if bootstrap_host is None:
 
194
                bootstrap_host = parse_new_state_server_from_error(e)
 
195
            dump_env_logs(client, bootstrap_host, log_dir)
 
196
            raise
 
197
        finally:
 
198
            client.destroy_environment()
153
199
    except Exception as e:
154
 
        bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
155
 
        raise
156
 
 
157
 
 
158
 
def assess_recovery(bs_manager, strategy, charm_series):
159
 
    log.info("Setting up test.")
160
 
    client = bs_manager.client
161
 
    deploy_stack(client, charm_series)
162
 
    client.set_config('dummy-source', {'token': ''})
163
 
    log.info("Setup complete.")
164
 
    log.info("Test started.")
165
 
    controller_client = client.get_controller_client()
166
 
    if strategy in ('ha', 'ha-backup'):
167
 
        controller_client.enable_ha()
168
 
        controller_client.wait_for_ha()
169
 
    if strategy in ('ha-backup', 'backup'):
170
 
        backup_file = controller_client.backup()
171
 
        restore_present_state_server(controller_client, backup_file)
172
 
    if strategy == 'ha':
173
 
        leader_only = True
174
 
    else:
175
 
        leader_only = False
176
 
    deleted_machine_ids = delete_controller_members(
177
 
        controller_client, leader_only=leader_only)
178
 
    log.info("Deleted {}".format(deleted_machine_ids))
179
 
    for m_id in deleted_machine_ids:
180
 
        if bs_manager.known_hosts.get(m_id):
181
 
            del bs_manager.known_hosts[m_id]
182
 
    if strategy == 'ha':
183
 
        client.get_status(600)
184
 
        log.info("HA recovered from leader failure.")
185
 
        log.info("PASS")
186
 
    else:
187
 
        check_controller = strategy != 'ha-backup'
188
 
        restore_missing_state_server(client, controller_client, backup_file,
189
 
                                     check_controller=check_controller)
190
 
    log.info("Test complete.")
191
 
 
192
 
 
193
 
def main(argv):
194
 
    args = parse_args(argv)
195
 
    configure_logging(args.verbose)
196
 
    bs_manager = BootstrapManager.from_args(args)
197
 
    with bs_manager.booted_context(upload_tools=args.upload_tools):
198
 
        with detect_bootstrap_machine(bs_manager):
199
 
            assess_recovery(bs_manager, args.strategy, args.charm_series)
 
200
        print_now("\nEXCEPTION CAUGHT:\n")
 
201
        logging.exception(e)
 
202
        if getattr(e, 'output', None):
 
203
            print_now('\n')
 
204
            print_now(e.output)
 
205
        print_now("\nFAIL")
 
206
        sys.exit(1)
200
207
 
201
208
 
202
209
if __name__ == '__main__':
203
 
    main(sys.argv[1:])
 
210
    main()