~andrewjbeach/juju-ci-tools/make-local-patcher

« back to all changes in this revision

Viewing changes to assess_recovery.py

  • Committer: Curtis Hovey
  • Date: 2016-09-20 01:59:47 UTC
  • mto: This revision was merged to the branch mainline in revision 1602.
  • Revision ID: curtis@canonical.com-20160920015947-ko27xkj3a4i774h6
Convert juju instance-ids to true Azure ids.

Show diffs side-by-side

added added

removed removed

Lines of Context:
12
12
 
13
13
from deploy_stack import (
14
14
    BootstrapManager,
 
15
    deploy_dummy_stack,
 
16
    get_token_from_status,
15
17
    wait_for_state_server_to_shutdown,
16
18
)
17
19
from jujupy import (
18
 
    make_client,
19
20
    parse_new_state_server_from_error,
20
21
)
21
22
from substrate import (
 
23
    convert_to_azure_ids,
22
24
    terminate_instances,
23
25
)
24
26
from utility import (
25
 
    local_charm_path,
 
27
    add_basic_testing_arguments,
 
28
    configure_logging,
 
29
    JujuAssertionError,
26
30
    LoggedException,
27
 
    print_now,
28
31
)
29
32
 
30
33
 
34
37
running_instance_pattern = re.compile('\["([^"]+)"\]')
35
38
 
36
39
 
 
40
log = logging.getLogger("assess_recovery")
 
41
 
 
42
 
 
43
def check_token(client, token):
 
44
    found = get_token_from_status(client)
 
45
    if token not in found:
 
46
        raise JujuAssertionError('Token is not {}: {}'.format(
 
47
            token, found))
 
48
 
 
49
 
37
50
def deploy_stack(client, charm_series):
38
51
    """"Deploy a simple stack, state-server and ubuntu."""
39
 
    charm = local_charm_path(
40
 
        charm='ubuntu', juju_ver=client.version, series=charm_series)
41
 
    client.deploy(charm, series=charm_series)
42
 
    client.wait_for_started().status
43
 
    print_now("%s is ready to testing" % client.env.environment)
44
 
 
45
 
 
46
 
def restore_present_state_server(admin_client, backup_file):
 
52
    deploy_dummy_stack(client, charm_series)
 
53
    client.set_config('dummy-source', {'token': 'One'})
 
54
    client.wait_for_workloads()
 
55
    check_token(client, 'One')
 
56
    log.info("%s is ready to testing", client.env.environment)
 
57
 
 
58
 
 
59
def show_controller(client):
 
60
    controller_info = client.show_controller(format='yaml')
 
61
    log.info('Controller is:\n{}'.format(controller_info))
 
62
 
 
63
 
 
64
def restore_present_state_server(controller_client, backup_file):
47
65
    """juju-restore won't restore when the state-server is still present."""
48
66
    try:
49
 
        output = admin_client.restore_backup(backup_file)
50
 
    except CalledProcessError as e:
51
 
        print_now(
 
67
        controller_client.restore_backup(backup_file)
 
68
    except CalledProcessError:
 
69
        log.info(
52
70
            "juju-restore correctly refused to restore "
53
71
            "because the state-server was still up.")
54
 
        match = running_instance_pattern.search(e.stderr)
55
 
        if match is None:
56
 
            print_now("WARNING: Could not find the instance_id in output:")
57
 
            print_now(e.stderr)
58
 
            print_now("")
59
 
            return None
60
 
        return match.group(1)
 
72
        return
61
73
    else:
62
74
        raise Exception(
63
 
            "juju-restore restored to an operational state-server: %s" %
64
 
            output)
 
75
            "juju-restore restored to an operational state-serve")
65
76
 
66
77
 
67
78
def delete_controller_members(client, leader_only=False):
80
91
    deleted_machines = []
81
92
    for machine in members:
82
93
        instance_id = machine.info.get('instance-id')
 
94
        if client.env.config['type'] == 'azure':
 
95
            instance_id = convert_to_azure_ids(client, [instance_id])[0]
83
96
        host = machine.info.get('dns-name')
84
 
        print_now("Instrumenting node failure for member {}: {} at {}".format(
85
 
                  machine.machine_id, instance_id, host))
 
97
        log.info("Instrumenting node failure for member {}: {} at {}".format(
 
98
                 machine.machine_id, instance_id, host))
86
99
        terminate_instances(client.env, [instance_id])
87
 
        wait_for_state_server_to_shutdown(host, client, instance_id)
 
100
        wait_for_state_server_to_shutdown(
 
101
            host, client, instance_id, timeout=120)
88
102
        deleted_machines.append(machine.machine_id)
89
103
    return deleted_machines
90
104
 
91
105
 
92
 
def restore_missing_state_server(client, admin_client, backup_file):
 
106
def restore_missing_state_server(client, controller_client, backup_file):
93
107
    """juju-restore creates a replacement state-server for the services."""
94
 
    print_now("Starting restore.")
 
108
    log.info("Starting restore.")
95
109
    try:
96
 
        output = admin_client.restore_backup(backup_file)
 
110
        controller_client.restore_backup(backup_file)
97
111
    except CalledProcessError as e:
98
 
        print_now('Call of juju restore exited with an error\n')
99
 
        print_now('Call: {} \n'.format(e.cmd))
100
 
        message = 'Restore failed: \n%s' % e.stderr
101
 
        print_now(message)
102
 
        print_now('\n')
103
 
        logging.exception(e)
 
112
        log.info('Call of juju restore exited with an error\n')
 
113
        log.info('Call:  %r\n', e.cmd)
 
114
        log.exception(e)
104
115
        raise LoggedException(e)
105
 
    print_now(output)
106
 
    admin_client.wait_for_started(600).status
107
 
    print_now("%s restored" % client.env.environment)
108
 
    print_now("PASS")
 
116
    controller_client.wait_for_started(600)
 
117
    show_controller(client)
 
118
    client.set_config('dummy-source', {'token': 'Two'})
 
119
    client.wait_for_started()
 
120
    client.wait_for_workloads()
 
121
    check_token(client, 'Two')
 
122
    log.info("%s restored", client.env.environment)
 
123
    log.info("PASS")
109
124
 
110
125
 
111
126
def parse_args(argv=None):
112
 
    parser = ArgumentParser('Test recovery strategies.')
 
127
    parser = ArgumentParser(description='Test recovery strategies.')
 
128
    add_basic_testing_arguments(parser)
113
129
    parser.add_argument(
114
130
        '--charm-series', help='Charm series.', default='')
115
 
    parser.add_argument(
116
 
        '--debug', action='store_true', default=False,
117
 
        help='Use --debug juju logging.')
118
131
    strategy = parser.add_argument_group('test strategy')
119
132
    strategy.add_argument(
120
133
        '--ha', action='store_const', dest='strategy', const='ha',
125
138
    strategy.add_argument(
126
139
        '--ha-backup', action='store_const', dest='strategy',
127
140
        const='ha-backup', help="Test backup/restore of HA.")
128
 
    parser.add_argument('juju_path')
129
 
    parser.add_argument('env_name')
130
 
    parser.add_argument('logs', help='Directory to store logs in.')
131
 
    parser.add_argument(
132
 
        'temp_env_name', nargs='?',
133
 
        help='Temporary environment name to use for this test.')
134
 
    parser.add_argument(
135
 
        '--agent-stream', help='Stream for retrieving agent binaries.')
136
 
    parser.add_argument(
137
 
        '--series', help='Name of the Ubuntu series to use.')
138
141
    return parser.parse_args(argv)
139
142
 
140
143
 
141
 
def make_client_from_args(args):
142
 
    return make_client(args.juju_path, args.debug, args.env_name,
143
 
                       args.temp_env_name)
144
 
 
145
 
 
146
144
@contextmanager
147
145
def detect_bootstrap_machine(bs_manager):
148
146
    try:
153
151
 
154
152
 
155
153
def assess_recovery(bs_manager, strategy, charm_series):
 
154
    log.info("Setting up test.")
156
155
    client = bs_manager.client
157
156
    deploy_stack(client, charm_series)
158
 
    admin_client = client.get_admin_client()
 
157
    log.info("Setup complete.")
 
158
    log.info("Test started.")
 
159
    controller_client = client.get_controller_client()
159
160
    if strategy in ('ha', 'ha-backup'):
160
 
        admin_client.enable_ha()
161
 
        admin_client.wait_for_ha()
 
161
        controller_client.enable_ha()
 
162
        controller_client.wait_for_ha()
162
163
    if strategy in ('ha-backup', 'backup'):
163
 
        backup_file = admin_client.backup()
164
 
        restore_present_state_server(admin_client, backup_file)
 
164
        backup_file = controller_client.backup()
 
165
        restore_present_state_server(controller_client, backup_file)
165
166
    if strategy == 'ha':
166
167
        leader_only = True
167
168
    else:
168
169
        leader_only = False
169
170
    deleted_machine_ids = delete_controller_members(
170
 
        admin_client, leader_only=leader_only)
 
171
        controller_client, leader_only=leader_only)
 
172
    log.info("Deleted {}".format(deleted_machine_ids))
171
173
    for m_id in deleted_machine_ids:
172
174
        if bs_manager.known_hosts.get(m_id):
173
175
            del bs_manager.known_hosts[m_id]
174
176
    if strategy == 'ha':
175
177
        client.get_status(600)
 
178
        log.info("HA recovered from leader failure.")
 
179
        log.info("PASS")
176
180
    else:
177
 
        restore_missing_state_server(client, admin_client, backup_file)
 
181
        restore_missing_state_server(client, controller_client, backup_file)
 
182
    log.info("Test complete.")
178
183
 
179
184
 
180
185
def main(argv):
181
186
    args = parse_args(argv)
182
 
    client = make_client_from_args(args)
183
 
    jes_enabled = client.is_jes_enabled()
184
 
    bs_manager = BootstrapManager(
185
 
        client.env.environment, client, client, None, [], args.series,
186
 
        agent_url=None, agent_stream=args.agent_stream, region=None,
187
 
        log_dir=args.logs, keep_env=False, permanent=jes_enabled,
188
 
        jes_enabled=jes_enabled)
189
 
    with bs_manager.booted_context(upload_tools=False):
 
187
    configure_logging(args.verbose)
 
188
    bs_manager = BootstrapManager.from_args(args)
 
189
    with bs_manager.booted_context(upload_tools=args.upload_tools):
190
190
        with detect_bootstrap_machine(bs_manager):
191
191
            assess_recovery(bs_manager, args.strategy, args.charm_series)
192
192