~andrewjbeach/juju-ci-tools/make-local-patcher

« back to all changes in this revision

Viewing changes to assess_recovery.py

  • Committer: Martin Packman
  • Date: 2016-04-29 00:19:30 UTC
  • mto: This revision was merged to the branch mainline in revision 1389.
  • Revision ID: martin.packman@canonical.com-20160429001930-yoju030ik6lwqf0e
Add clean_maas.py script for releasing machines

Show diffs side-by-side

added

removed

Lines of Context:
12
12
 
13
13
from deploy_stack import (
14
14
    BootstrapManager,
15
 
    deploy_dummy_stack,
16
 
    get_token_from_status,
17
15
    wait_for_state_server_to_shutdown,
18
16
)
19
17
from jujupy import (
 
18
    make_client,
20
19
    parse_new_state_server_from_error,
21
20
)
22
21
from substrate import (
23
 
    convert_to_azure_ids,
24
22
    terminate_instances,
25
23
)
26
24
from utility import (
27
 
    add_basic_testing_arguments,
28
 
    configure_logging,
29
 
    JujuAssertionError,
 
25
    local_charm_path,
30
26
    LoggedException,
31
 
    until_timeout,
 
27
    print_now,
32
28
)
33
29
 
34
30
 
38
34
running_instance_pattern = re.compile('\["([^"]+)"\]')
39
35
 
40
36
 
41
 
log = logging.getLogger("assess_recovery")
42
 
 
43
 
 
44
 
def check_token(client, token):
45
 
    for ignored in until_timeout(300):
46
 
        found = get_token_from_status(client)
47
 
        if found and token in found:
48
 
            return found
49
 
    raise JujuAssertionError('Token is not {}: {}'.format(
50
 
                             token, found))
51
 
 
52
 
 
53
37
def deploy_stack(client, charm_series):
54
38
    """"Deploy a simple stack, state-server and ubuntu."""
55
 
    deploy_dummy_stack(client, charm_series)
56
 
    client.set_config('dummy-source', {'token': 'One'})
57
 
    client.wait_for_workloads()
58
 
    check_token(client, 'One')
59
 
    log.info("%s is ready to testing", client.env.environment)
60
 
 
61
 
 
62
 
def show_controller(client):
63
 
    controller_info = client.show_controller(format='yaml')
64
 
    log.info('Controller is:\n{}'.format(controller_info))
65
 
 
66
 
 
67
 
def restore_present_state_server(controller_client, backup_file):
 
39
    charm = local_charm_path(
 
40
        charm='ubuntu', juju_ver=client.version, series=charm_series)
 
41
    client.deploy(charm, series=charm_series)
 
42
    client.wait_for_started().status
 
43
    print_now("%s is ready to testing" % client.env.environment)
 
44
 
 
45
 
 
46
def restore_present_state_server(admin_client, backup_file):
68
47
    """juju-restore won't restore when the state-server is still present."""
69
48
    try:
70
 
        controller_client.restore_backup(backup_file)
71
 
    except CalledProcessError:
72
 
        log.info(
 
49
        output = admin_client.restore_backup(backup_file)
 
50
    except CalledProcessError as e:
 
51
        print_now(
73
52
            "juju-restore correctly refused to restore "
74
53
            "because the state-server was still up.")
75
 
        return
 
54
        match = running_instance_pattern.search(e.stderr)
 
55
        if match is None:
 
56
            print_now("WARNING: Could not find the instance_id in output:")
 
57
            print_now(e.stderr)
 
58
            print_now("")
 
59
            return None
 
60
        return match.group(1)
76
61
    else:
77
62
        raise Exception(
78
 
            "juju-restore restored to an operational state-serve")
 
63
            "juju-restore restored to an operational state-server: %s" %
 
64
            output)
79
65
 
80
66
 
81
67
def delete_controller_members(client, leader_only=False):
94
80
    deleted_machines = []
95
81
    for machine in members:
96
82
        instance_id = machine.info.get('instance-id')
97
 
        if client.env.config['type'] == 'azure':
98
 
            instance_id = convert_to_azure_ids(client, [instance_id])[0]
99
83
        host = machine.info.get('dns-name')
100
 
        log.info("Instrumenting node failure for member {}: {} at {}".format(
101
 
                 machine.machine_id, instance_id, host))
 
84
        print_now("Instrumenting node failure for member {}: {} at {}".format(
 
85
                  machine.machine_id, instance_id, host))
102
86
        terminate_instances(client.env, [instance_id])
103
 
        wait_for_state_server_to_shutdown(
104
 
            host, client, instance_id, timeout=120)
 
87
        wait_for_state_server_to_shutdown(host, client, instance_id)
105
88
        deleted_machines.append(machine.machine_id)
106
89
    return deleted_machines
107
90
 
108
91
 
109
 
def restore_missing_state_server(client, controller_client, backup_file,
110
 
                                 check_controller=True):
 
92
def restore_missing_state_server(client, admin_client, backup_file):
111
93
    """juju-restore creates a replacement state-server for the services."""
112
 
    log.info("Starting restore.")
 
94
    print_now("Starting restore.")
113
95
    try:
114
 
        controller_client.restore_backup(backup_file)
 
96
        output = admin_client.restore_backup(backup_file)
115
97
    except CalledProcessError as e:
116
 
        log.info('Call of juju restore exited with an error\n')
117
 
        log.info('Call:  %r\n', e.cmd)
118
 
        log.exception(e)
 
98
        print_now('Call of juju restore exited with an error\n')
 
99
        print_now('Call: {} \n'.format(e.cmd))
 
100
        message = 'Restore failed: \n%s' % e.stderr
 
101
        print_now(message)
 
102
        print_now('\n')
 
103
        logging.exception(e)
119
104
        raise LoggedException(e)
120
 
    if check_controller:
121
 
        controller_client.wait_for_started(600)
122
 
    show_controller(client)
123
 
    client.set_config('dummy-source', {'token': 'Two'})
124
 
    client.wait_for_started()
125
 
    client.wait_for_workloads()
126
 
    check_token(client, 'Two')
127
 
    log.info("%s restored", client.env.environment)
128
 
    log.info("PASS")
 
105
    print_now(output)
 
106
    admin_client.wait_for_started(600).status
 
107
    print_now("%s restored" % client.env.environment)
 
108
    print_now("PASS")
129
109
 
130
110
 
131
111
def parse_args(argv=None):
132
 
    parser = ArgumentParser(description='Test recovery strategies.')
133
 
    add_basic_testing_arguments(parser)
 
112
    parser = ArgumentParser('Test recovery strategies.')
134
113
    parser.add_argument(
135
114
        '--charm-series', help='Charm series.', default='')
 
115
    parser.add_argument(
 
116
        '--debug', action='store_true', default=False,
 
117
        help='Use --debug juju logging.')
136
118
    strategy = parser.add_argument_group('test strategy')
137
119
    strategy.add_argument(
138
120
        '--ha', action='store_const', dest='strategy', const='ha',
143
125
    strategy.add_argument(
144
126
        '--ha-backup', action='store_const', dest='strategy',
145
127
        const='ha-backup', help="Test backup/restore of HA.")
 
128
    parser.add_argument('juju_path')
 
129
    parser.add_argument('env_name')
 
130
    parser.add_argument('logs', help='Directory to store logs in.')
 
131
    parser.add_argument(
 
132
        'temp_env_name', nargs='?',
 
133
        help='Temporary environment name to use for this test.')
 
134
    parser.add_argument(
 
135
        '--agent-stream', help='Stream for retrieving agent binaries.')
 
136
    parser.add_argument(
 
137
        '--series', help='Name of the Ubuntu series to use.')
146
138
    return parser.parse_args(argv)
147
139
 
148
140
 
 
141
def make_client_from_args(args):
 
142
    return make_client(args.juju_path, args.debug, args.env_name,
 
143
                       args.temp_env_name)
 
144
 
 
145
 
149
146
@contextmanager
150
147
def detect_bootstrap_machine(bs_manager):
151
148
    try:
156
153
 
157
154
 
158
155
def assess_recovery(bs_manager, strategy, charm_series):
159
 
    log.info("Setting up test.")
160
156
    client = bs_manager.client
161
157
    deploy_stack(client, charm_series)
162
 
    client.set_config('dummy-source', {'token': ''})
163
 
    log.info("Setup complete.")
164
 
    log.info("Test started.")
165
 
    controller_client = client.get_controller_client()
 
158
    admin_client = client.get_admin_client()
166
159
    if strategy in ('ha', 'ha-backup'):
167
 
        controller_client.enable_ha()
168
 
        controller_client.wait_for_ha()
 
160
        admin_client.enable_ha()
 
161
        admin_client.wait_for_ha()
169
162
    if strategy in ('ha-backup', 'backup'):
170
 
        backup_file = controller_client.backup()
171
 
        restore_present_state_server(controller_client, backup_file)
 
163
        backup_file = admin_client.backup()
 
164
        restore_present_state_server(admin_client, backup_file)
172
165
    if strategy == 'ha':
173
166
        leader_only = True
174
167
    else:
175
168
        leader_only = False
176
169
    deleted_machine_ids = delete_controller_members(
177
 
        controller_client, leader_only=leader_only)
178
 
    log.info("Deleted {}".format(deleted_machine_ids))
 
170
        admin_client, leader_only=leader_only)
179
171
    for m_id in deleted_machine_ids:
180
172
        if bs_manager.known_hosts.get(m_id):
181
173
            del bs_manager.known_hosts[m_id]
182
174
    if strategy == 'ha':
183
175
        client.get_status(600)
184
 
        log.info("HA recovered from leader failure.")
185
 
        log.info("PASS")
186
176
    else:
187
 
        check_controller = strategy != 'ha-backup'
188
 
        restore_missing_state_server(client, controller_client, backup_file,
189
 
                                     check_controller=check_controller)
190
 
    log.info("Test complete.")
 
177
        restore_missing_state_server(client, admin_client, backup_file)
191
178
 
192
179
 
193
180
def main(argv):
194
181
    args = parse_args(argv)
195
 
    configure_logging(args.verbose)
196
 
    bs_manager = BootstrapManager.from_args(args)
197
 
    with bs_manager.booted_context(upload_tools=args.upload_tools):
 
182
    client = make_client_from_args(args)
 
183
    jes_enabled = client.is_jes_enabled()
 
184
    bs_manager = BootstrapManager(
 
185
        client.env.environment, client, client, None, [], args.series,
 
186
        agent_url=None, agent_stream=args.agent_stream, region=None,
 
187
        log_dir=args.logs, keep_env=False, permanent=jes_enabled,
 
188
        jes_enabled=jes_enabled)
 
189
    with bs_manager.booted_context(upload_tools=False):
198
190
        with detect_bootstrap_machine(bs_manager):
199
191
            assess_recovery(bs_manager, args.strategy, args.charm_series)
200
192