37
31
# Captures the first double-quoted value wrapped in square brackets, e.g. the
# i-123 in ["i-123"]; it is searched against juju-restore's stderr to recover
# the instance id. A raw string is used because "\[" and "\]" are invalid
# escape sequences in a plain string literal.
running_instance_pattern = re.compile(r'\["([^"]+)"\]')
40
# Module-level logger, registered under the fixed name "assess_recovery",
# used by the functions in this script for progress and warning messages.
log = logging.getLogger("assess_recovery")
43
def deploy_stack(client, charm_series):
34
def deploy_stack(client, charm_prefix):
44
35
"""Deploy a simple stack, state-server and ubuntu."""
45
charm = local_charm_path(
46
charm='ubuntu', juju_ver=client.version, series=charm_series)
47
client.deploy(charm, series=charm_series)
36
if charm_prefix and not charm_prefix.endswith('/'):
37
charm_prefix = charm_prefix + '/'
38
client.juju('deploy', (charm_prefix + 'ubuntu',))
48
39
client.wait_for_started().status
49
log.info("%s is ready to testing", client.env.environment)
52
def restore_present_state_server(controller_client, backup_file):
40
print_now("%s is ready to testing" % client.env.environment)
41
instance_id = client.get_status().status['machines']['0']['instance-id']
45
def restore_present_state_server(client, backup_file):
53
46
"""juju-restore won't restore when the state-server is still present."""
55
output = controller_client.restore_backup(backup_file)
48
output = client.restore_backup(backup_file)
56
49
except CalledProcessError as e:
58
51
"juju-restore correctly refused to restore "
59
52
"because the state-server was still up.")
60
53
match = running_instance_pattern.search(e.stderr)
62
log.warning("Could not find the instance_id in output:\n%s\n",
55
print_now("WARNING: Could not find the instance_id in output:")
65
59
return match.group(1)
72
def delete_controller_members(client, leader_only=False):
73
"""Delete controller members.
75
All members are deleted by default. The followers are deleted before the
76
leader to simulate a total controller failure. When leader_only is true,
77
the leader is deleted to trigger a new leader election.
80
leader = client.get_controller_leader()
83
members = client.get_controller_members()
86
for machine in members:
87
instance_id = machine.info.get('instance-id')
88
if client.env.config['type'] == 'azure':
89
instance_id = convert_to_azure_ids(client, [instance_id])[0]
90
host = machine.info.get('dns-name')
91
log.info("Instrumenting node failure for member {}: {} at {}".format(
92
machine.machine_id, instance_id, host))
93
terminate_instances(client.env, [instance_id])
94
wait_for_state_server_to_shutdown(
95
host, client, instance_id, timeout=120)
96
deleted_machines.append(machine.machine_id)
97
return deleted_machines
100
def restore_missing_state_server(client, controller_client, backup_file):
66
def delete_instance(client, instance_id):
67
"""Delete the instance using the providers tools."""
68
print_now("Instrumenting a bootstrap node failure.")
69
return terminate_instances(client.env, [instance_id])
72
def delete_extra_state_servers(client, instance_id):
73
"""Delete the extra state-server instances."""
74
status = client.get_status()
75
for machine, info in status.iter_machines():
76
extra_instance_id = info.get('instance-id')
77
status = client.get_controller_member_status(info)
78
if extra_instance_id != instance_id and status is not None:
79
print_now("Deleting state-server-member {}".format(machine))
80
host = get_machine_dns_name(client, machine)
81
delete_instance(client, extra_instance_id)
82
wait_for_state_server_to_shutdown(host, client, extra_instance_id)
85
def restore_missing_state_server(client, backup_file):
101
86
"""juju-restore creates a replacement state-server for the services."""
102
log.info("Starting restore.")
87
print_now("Starting restore.")
104
output = controller_client.restore_backup(backup_file)
89
output = client.restore_backup(backup_file)
105
90
except CalledProcessError as e:
106
log.info('Call of juju restore exited with an error\n')
107
log.info('Call: %r\n', e.cmd)
108
log.info('Restore failed: \n%s\n', e.stderr)
110
raise LoggedException(e)
112
controller_client.wait_for_started(600).status
113
log.info("%s restored", client.env.environment)
91
print_now('Call of juju restore exited with an error\n')
92
message = 'Restore failed: \n%s' % e.stderr
95
raise Exception(message)
97
client.wait_for_started(600).status
98
print_now("%s restored" % client.env.environment)
117
102
def parse_args(argv=None):
118
parser = ArgumentParser(description='Test recovery strategies.')
119
add_basic_testing_arguments(parser)
121
'--charm-series', help='Charm series.', default='')
103
parser = ArgumentParser('Test recovery strategies.')
105
'--charm-prefix', help='A prefix for charm urls.', default='')
107
'--debug', action='store_true', default=False,
108
help='Use --debug juju logging.')
122
109
strategy = parser.add_argument_group('test strategy')
123
110
strategy.add_argument(
124
111
'--ha', action='store_const', dest='strategy', const='ha',
129
116
strategy.add_argument(
130
117
'--ha-backup', action='store_const', dest='strategy',
131
118
const='ha-backup', help="Test backup/restore of HA.")
119
parser.add_argument('juju_path')
120
parser.add_argument('env_name')
121
parser.add_argument('logs', help='Directory to store logs in.')
123
'temp_env_name', nargs='?',
124
help='Temporary environment name to use for this test.')
126
'--agent-stream', help='Stream for retrieving agent binaries.')
128
'--series', help='Name of the Ubuntu series to use.')
132
129
return parser.parse_args(argv)
136
def detect_bootstrap_machine(bs_manager):
139
except Exception as e:
140
bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
144
def assess_recovery(bs_manager, strategy, charm_series):
145
log.info("Setting up test.")
146
client = bs_manager.client
147
deploy_stack(client, charm_series)
148
log.info("Setup complete.")
149
log.info("Test started.")
150
controller_client = client.get_controller_client()
151
if strategy in ('ha', 'ha-backup'):
152
controller_client.enable_ha()
153
controller_client.wait_for_ha()
154
if strategy in ('ha-backup', 'backup'):
155
backup_file = controller_client.backup()
156
restore_present_state_server(controller_client, backup_file)
161
deleted_machine_ids = delete_controller_members(
162
controller_client, leader_only=leader_only)
163
log.info("Deleted {}".format(deleted_machine_ids))
164
for m_id in deleted_machine_ids:
165
if bs_manager.known_hosts.get(m_id):
166
del bs_manager.known_hosts[m_id]
168
client.get_status(600)
169
log.info("HA recovered from leader failure.")
172
restore_missing_state_server(client, controller_client, backup_file)
173
log.info("Test complete.")
132
def make_client_from_args(args):
133
return make_client(args.juju_path, args.debug, args.env_name,
177
138
args = parse_args(argv)
178
configure_logging(args.verbose)
179
bs_manager = BootstrapManager.from_args(args)
180
with bs_manager.booted_context(upload_tools=args.upload_tools):
181
with detect_bootstrap_machine(bs_manager):
182
assess_recovery(bs_manager, args.strategy, args.charm_series)
139
client = make_client_from_args(args)
140
jes_enabled = client.is_jes_enabled()
141
bs_manager = BootstrapManager(
142
client.env.environment, client, client, None, [], args.series,
143
agent_url=None, agent_stream=args.agent_stream, region=None,
144
log_dir=args.logs, keep_env=False, permanent=jes_enabled,
145
jes_enabled=jes_enabled)
146
with bs_manager.booted_context(upload_tools=False):
148
instance_id = deploy_stack(client, args.charm_prefix)
149
if args.strategy in ('ha', 'ha-backup'):
152
if args.strategy in ('ha-backup', 'backup'):
153
backup_file = client.backup()
154
restore_present_state_server(client, backup_file)
155
if args.strategy == 'ha-backup':
156
delete_extra_state_servers(client, instance_id)
157
delete_instance(client, instance_id)
158
wait_for_state_server_to_shutdown(
159
bs_manager.known_hosts['0'], client, instance_id)
160
del bs_manager.known_hosts['0']
161
if args.strategy == 'ha':
162
client.get_status(600)
164
restore_missing_state_server(client, backup_file)
165
except Exception as e:
166
bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
185
170
if __name__ == '__main__':