~abentley/juju-ci-tools/client-from-config-4

« back to all changes in this revision

Viewing changes to assess_recovery.py

  • Committer: Aaron Bentley
  • Date: 2016-03-02 16:04:04 UTC
  • mto: This revision was merged to the branch mainline in revision 1307.
  • Revision ID: aaron.bentley@canonical.com-20160302160404-0l1z6lb3oanrgvxo
assess_heterogeneous_control does not tear down with inappropriate client.

Show diffs side-by-side

added

removed

Lines of Context:
4
4
from __future__ import print_function
5
5
 
6
6
from argparse import ArgumentParser
7
 
from contextlib import contextmanager
8
 
import logging
9
7
import re
10
8
from subprocess import CalledProcessError
11
9
import sys
14
12
    BootstrapManager,
15
13
    wait_for_state_server_to_shutdown,
16
14
)
17
 
from jujucharm import (
18
 
    local_charm_path,
19
 
)
20
15
from jujupy import (
 
16
    get_machine_dns_name,
 
17
    make_client,
21
18
    parse_new_state_server_from_error,
22
19
)
23
20
from substrate import (
24
 
    convert_to_azure_ids,
25
21
    terminate_instances,
26
22
)
27
23
from utility import (
28
 
    add_basic_testing_arguments,
29
 
    configure_logging,
30
 
    LoggedException,
 
24
    print_now,
31
25
)
32
26
 
33
27
 
37
31
running_instance_pattern = re.compile('\["([^"]+)"\]')
38
32
 
39
33
 
40
 
log = logging.getLogger("assess_recovery")
41
 
 
42
 
 
43
 
def deploy_stack(client, charm_series):
 
34
def deploy_stack(client, charm_prefix):
44
35
    """"Deploy a simple stack, state-server and ubuntu."""
45
 
    charm = local_charm_path(
46
 
        charm='ubuntu', juju_ver=client.version, series=charm_series)
47
 
    client.deploy(charm, series=charm_series)
 
36
    if charm_prefix and not charm_prefix.endswith('/'):
 
37
        charm_prefix = charm_prefix + '/'
 
38
    client.juju('deploy', (charm_prefix + 'ubuntu',))
48
39
    client.wait_for_started().status
49
 
    log.info("%s is ready to testing", client.env.environment)
50
 
 
51
 
 
52
 
def restore_present_state_server(controller_client, backup_file):
 
40
    print_now("%s is ready to testing" % client.env.environment)
 
41
    instance_id = client.get_status().status['machines']['0']['instance-id']
 
42
    return instance_id
 
43
 
 
44
 
 
45
def restore_present_state_server(client, backup_file):
53
46
    """juju-restore won't restore when the state-server is still present."""
54
47
    try:
55
 
        output = controller_client.restore_backup(backup_file)
 
48
        output = client.restore_backup(backup_file)
56
49
    except CalledProcessError as e:
57
 
        log.info(
 
50
        print_now(
58
51
            "juju-restore correctly refused to restore "
59
52
            "because the state-server was still up.")
60
53
        match = running_instance_pattern.search(e.stderr)
61
54
        if match is None:
62
 
            log.warning("Could not find the instance_id in output:\n%s\n",
63
 
                        e.stderr)
 
55
            print_now("WARNING: Could not find the instance_id in output:")
 
56
            print_now(e.stderr)
 
57
            print_now("")
64
58
            return None
65
59
        return match.group(1)
66
60
    else:
69
63
            output)
70
64
 
71
65
 
72
 
def delete_controller_members(client, leader_only=False):
73
 
    """Delete controller members.
74
 
 
75
 
    The all members are delete by default. The followers are deleted before the
76
 
    leader to simulates a total controller failure. When leader_only is true,
77
 
    the leader is deleted to trigger a new leader election.
78
 
    """
79
 
    if leader_only:
80
 
        leader = client.get_controller_leader()
81
 
        members = [leader]
82
 
    else:
83
 
        members = client.get_controller_members()
84
 
        members.reverse()
85
 
    deleted_machines = []
86
 
    for machine in members:
87
 
        instance_id = machine.info.get('instance-id')
88
 
        if client.env.config['type'] == 'azure':
89
 
            instance_id = convert_to_azure_ids(client, [instance_id])[0]
90
 
        host = machine.info.get('dns-name')
91
 
        log.info("Instrumenting node failure for member {}: {} at {}".format(
92
 
                 machine.machine_id, instance_id, host))
93
 
        terminate_instances(client.env, [instance_id])
94
 
        wait_for_state_server_to_shutdown(
95
 
            host, client, instance_id, timeout=120)
96
 
        deleted_machines.append(machine.machine_id)
97
 
    return deleted_machines
98
 
 
99
 
 
100
 
def restore_missing_state_server(client, controller_client, backup_file):
 
66
def delete_instance(client, instance_id):
 
67
    """Delete the instance using the providers tools."""
 
68
    print_now("Instrumenting a bootstrap node failure.")
 
69
    return terminate_instances(client.env, [instance_id])
 
70
 
 
71
 
 
72
def delete_extra_state_servers(client, instance_id):
 
73
    """Delete the extra state-server instances."""
 
74
    status = client.get_status()
 
75
    for machine, info in status.iter_machines():
 
76
        extra_instance_id = info.get('instance-id')
 
77
        status = client.get_controller_member_status(info)
 
78
        if extra_instance_id != instance_id and status is not None:
 
79
            print_now("Deleting state-server-member {}".format(machine))
 
80
            host = get_machine_dns_name(client, machine)
 
81
            delete_instance(client, extra_instance_id)
 
82
            wait_for_state_server_to_shutdown(host, client, extra_instance_id)
 
83
 
 
84
 
 
85
def restore_missing_state_server(client, backup_file):
101
86
    """juju-restore creates a replacement state-server for the services."""
102
 
    log.info("Starting restore.")
 
87
    print_now("Starting restore.")
103
88
    try:
104
 
        output = controller_client.restore_backup(backup_file)
 
89
        output = client.restore_backup(backup_file)
105
90
    except CalledProcessError as e:
106
 
        log.info('Call of juju restore exited with an error\n')
107
 
        log.info('Call:  %r\n', e.cmd)
108
 
        log.info('Restore failed: \n%s\n', e.stderr)
109
 
        log.exception(e)
110
 
        raise LoggedException(e)
111
 
    log.info(output)
112
 
    controller_client.wait_for_started(600).status
113
 
    log.info("%s restored", client.env.environment)
114
 
    log.info("PASS")
 
91
        print_now('Call of juju restore exited with an error\n')
 
92
        message = 'Restore failed: \n%s' % e.stderr
 
93
        print_now(message)
 
94
        print_now('\n')
 
95
        raise Exception(message)
 
96
    print_now(output)
 
97
    client.wait_for_started(600).status
 
98
    print_now("%s restored" % client.env.environment)
 
99
    print_now("PASS")
115
100
 
116
101
 
117
102
def parse_args(argv=None):
118
 
    parser = ArgumentParser(description='Test recovery strategies.')
119
 
    add_basic_testing_arguments(parser)
120
 
    parser.add_argument(
121
 
        '--charm-series', help='Charm series.', default='')
 
103
    parser = ArgumentParser('Test recovery strategies.')
 
104
    parser.add_argument(
 
105
        '--charm-prefix', help='A prefix for charm urls.', default='')
 
106
    parser.add_argument(
 
107
        '--debug', action='store_true', default=False,
 
108
        help='Use --debug juju logging.')
122
109
    strategy = parser.add_argument_group('test strategy')
123
110
    strategy.add_argument(
124
111
        '--ha', action='store_const', dest='strategy', const='ha',
129
116
    strategy.add_argument(
130
117
        '--ha-backup', action='store_const', dest='strategy',
131
118
        const='ha-backup', help="Test backup/restore of HA.")
 
119
    parser.add_argument('juju_path')
 
120
    parser.add_argument('env_name')
 
121
    parser.add_argument('logs', help='Directory to store logs in.')
 
122
    parser.add_argument(
 
123
        'temp_env_name', nargs='?',
 
124
        help='Temporary environment name to use for this test.')
 
125
    parser.add_argument(
 
126
        '--agent-stream', help='Stream for retrieving agent binaries.')
 
127
    parser.add_argument(
 
128
        '--series', help='Name of the Ubuntu series to use.')
132
129
    return parser.parse_args(argv)
133
130
 
134
131
 
135
 
@contextmanager
136
 
def detect_bootstrap_machine(bs_manager):
137
 
    try:
138
 
        yield
139
 
    except Exception as e:
140
 
        bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
141
 
        raise
142
 
 
143
 
 
144
 
def assess_recovery(bs_manager, strategy, charm_series):
145
 
    log.info("Setting up test.")
146
 
    client = bs_manager.client
147
 
    deploy_stack(client, charm_series)
148
 
    log.info("Setup complete.")
149
 
    log.info("Test started.")
150
 
    controller_client = client.get_controller_client()
151
 
    if strategy in ('ha', 'ha-backup'):
152
 
        controller_client.enable_ha()
153
 
        controller_client.wait_for_ha()
154
 
    if strategy in ('ha-backup', 'backup'):
155
 
        backup_file = controller_client.backup()
156
 
        restore_present_state_server(controller_client, backup_file)
157
 
    if strategy == 'ha':
158
 
        leader_only = True
159
 
    else:
160
 
        leader_only = False
161
 
    deleted_machine_ids = delete_controller_members(
162
 
        controller_client, leader_only=leader_only)
163
 
    log.info("Deleted {}".format(deleted_machine_ids))
164
 
    for m_id in deleted_machine_ids:
165
 
        if bs_manager.known_hosts.get(m_id):
166
 
            del bs_manager.known_hosts[m_id]
167
 
    if strategy == 'ha':
168
 
        client.get_status(600)
169
 
        log.info("HA recovered from leader failure.")
170
 
        log.info("PASS")
171
 
    else:
172
 
        restore_missing_state_server(client, controller_client, backup_file)
173
 
    log.info("Test complete.")
 
132
def make_client_from_args(args):
 
133
    return make_client(args.juju_path, args.debug, args.env_name,
 
134
                       args.temp_env_name)
174
135
 
175
136
 
176
137
def main(argv):
177
138
    args = parse_args(argv)
178
 
    configure_logging(args.verbose)
179
 
    bs_manager = BootstrapManager.from_args(args)
180
 
    with bs_manager.booted_context(upload_tools=args.upload_tools):
181
 
        with detect_bootstrap_machine(bs_manager):
182
 
            assess_recovery(bs_manager, args.strategy, args.charm_series)
 
139
    client = make_client_from_args(args)
 
140
    jes_enabled = client.is_jes_enabled()
 
141
    bs_manager = BootstrapManager(
 
142
        client.env.environment, client, client, None, [], args.series,
 
143
        agent_url=None, agent_stream=args.agent_stream, region=None,
 
144
        log_dir=args.logs, keep_env=False, permanent=jes_enabled,
 
145
        jes_enabled=jes_enabled)
 
146
    with bs_manager.booted_context(upload_tools=False):
 
147
        try:
 
148
            instance_id = deploy_stack(client, args.charm_prefix)
 
149
            if args.strategy in ('ha', 'ha-backup'):
 
150
                client.enable_ha()
 
151
                client.wait_for_ha()
 
152
            if args.strategy in ('ha-backup', 'backup'):
 
153
                backup_file = client.backup()
 
154
                restore_present_state_server(client, backup_file)
 
155
            if args.strategy == 'ha-backup':
 
156
                delete_extra_state_servers(client, instance_id)
 
157
            delete_instance(client, instance_id)
 
158
            wait_for_state_server_to_shutdown(
 
159
                bs_manager.known_hosts['0'], client, instance_id)
 
160
            del bs_manager.known_hosts['0']
 
161
            if args.strategy == 'ha':
 
162
                client.get_status(600)
 
163
            else:
 
164
                restore_missing_state_server(client, backup_file)
 
165
        except Exception as e:
 
166
            bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
 
167
            raise
183
168
 
184
169
 
185
170
if __name__ == '__main__':