4
4
from __future__ import print_function
6
8
from argparse import ArgumentParser
7
from contextlib import contextmanager
10
from subprocess import CalledProcessError
13
15
from deploy_stack import (
16
get_token_from_status,
17
18
wait_for_state_server_to_shutdown,
20
from jujuconfig import (
19
24
from jujupy import (
20
28
parse_new_state_server_from_error,
22
30
from substrate import (
24
31
terminate_instances,
26
33
from utility import (
27
add_basic_testing_arguments,
38
39
running_instance_pattern = re.compile('\["([^"]+)"\]')
41
log = logging.getLogger("assess_recovery")
44
def check_token(client, token):
45
for ignored in until_timeout(300):
46
found = get_token_from_status(client)
47
if found and token in found:
49
raise JujuAssertionError('Token is not {}: {}'.format(
53
def deploy_stack(client, charm_series):
42
def setup_juju_path(juju_path):
43
"""Ensure the binaries and scripts under test are found first."""
44
full_path = os.path.abspath(juju_path)
45
if not os.path.isdir(full_path):
46
raise ValueError("The juju_path does not exist: %s" % full_path)
47
os.environ['PATH'] = '%s:%s' % (full_path, os.environ['PATH'])
48
sys.path.insert(0, full_path)
51
def deploy_stack(client, charm_prefix):
54
52
""""Deploy a simple stack, state-server and ubuntu."""
55
deploy_dummy_stack(client, charm_series)
56
client.set_config('dummy-source', {'token': 'One'})
57
client.wait_for_workloads()
58
check_token(client, 'One')
59
log.info("%s is ready to testing", client.env.environment)
62
def show_controller(client):
63
controller_info = client.show_controller(format='yaml')
64
log.info('Controller is:\n{}'.format(controller_info))
67
def restore_present_state_server(controller_client, backup_file):
53
if charm_prefix and not charm_prefix.endswith('/'):
54
charm_prefix = charm_prefix + '/'
55
agent_version = client.get_matching_agent_version()
56
instance_id = client.get_status().status['machines']['0']['instance-id']
57
for ignored in until_timeout(30):
58
agent_versions = client.get_status().get_agent_versions()
59
if 'unknown' not in agent_versions and len(agent_versions) == 1:
61
if agent_versions.keys() != [agent_version]:
62
print_now("Current versions: %s" % ', '.join(agent_versions.keys()))
63
client.juju('upgrade-juju', ('--version', agent_version))
64
client.wait_for_version(client.get_matching_agent_version())
65
client.juju('deploy', (charm_prefix + 'ubuntu',))
66
client.wait_for_started().status
67
print_now("%s is ready to testing" % client.env.environment)
71
def restore_present_state_server(client, backup_file):
68
72
"""juju-restore won't restore when the state-server is still present."""
70
controller_client.restore_backup(backup_file)
71
except CalledProcessError:
73
environ = dict(os.environ)
74
proc = subprocess.Popen(
75
['juju', '--show-log', 'restore', '-e', client.env.environment,
77
env=environ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
78
output, err = proc.communicate()
79
if proc.returncode == 0:
81
"juju-restore restored to an operational state-server: %s" % err)
73
84
"juju-restore correctly refused to restore "
74
85
"because the state-server was still up.")
78
"juju-restore restored to an operational state-serve")
81
def delete_controller_members(client, leader_only=False):
82
"""Delete controller members.
84
The all members are delete by default. The followers are deleted before the
85
leader to simulates a total controller failure. When leader_only is true,
86
the leader is deleted to trigger a new leader election.
89
leader = client.get_controller_leader()
92
members = client.get_controller_members()
95
for machine in members:
96
instance_id = machine.info.get('instance-id')
97
if client.env.config['type'] == 'azure':
98
instance_id = convert_to_azure_ids(client, [instance_id])[0]
99
host = machine.info.get('dns-name')
100
log.info("Instrumenting node failure for member {}: {} at {}".format(
101
machine.machine_id, instance_id, host))
102
terminate_instances(client.env, [instance_id])
103
wait_for_state_server_to_shutdown(
104
host, client, instance_id, timeout=120)
105
deleted_machines.append(machine.machine_id)
106
return deleted_machines
109
def restore_missing_state_server(client, controller_client, backup_file,
110
check_controller=True):
86
match = running_instance_pattern.search(err)
88
print_now("WARNING: Could not find the instance_id in output:")
92
instance_id = match.group(1)
96
def delete_instance(client, instance_id):
97
"""Delete the instance using the providers tools."""
98
print_now("Instrumenting a bootstrap node failure.")
99
return terminate_instances(client.env, [instance_id])
102
def delete_extra_state_servers(client, instance_id):
103
"""Delete the extra state-server instances."""
104
status = client.get_status()
105
for machine, info in status.iter_machines():
106
extra_instance_id = info.get('instance-id')
107
status = info.get('state-server-member-status')
108
if extra_instance_id != instance_id and status is not None:
109
print_now("Deleting state-server-member {}".format(machine))
110
host = get_machine_dns_name(client, machine)
111
delete_instance(client, extra_instance_id)
112
wait_for_state_server_to_shutdown(host, client, extra_instance_id)
115
def restore_missing_state_server(client, backup_file):
111
116
"""juju-restore creates a replacement state-server for the services."""
112
log.info("Starting restore.")
114
controller_client.restore_backup(backup_file)
115
except CalledProcessError as e:
116
log.info('Call of juju restore exited with an error\n')
117
log.info('Call: %r\n', e.cmd)
119
raise LoggedException(e)
121
controller_client.wait_for_started(600)
122
show_controller(client)
123
client.set_config('dummy-source', {'token': 'Two'})
124
client.wait_for_started()
125
client.wait_for_workloads()
126
check_token(client, 'Two')
127
log.info("%s restored", client.env.environment)
117
environ = dict(os.environ)
118
print_now("Starting restore.")
119
proc = subprocess.Popen(
120
['juju', '--show-log', 'restore', '-e', client.env.environment,
121
'--constraints', 'mem=2G', backup_file],
122
env=environ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
123
output, err = proc.communicate()
124
if proc.returncode != 0:
125
print_now('Call of juju restore exited with an error\n')
126
message = 'Restore failed: \n%s' % err
129
raise Exception(message)
131
client.wait_for_started(600).status
132
print_now("%s restored" % client.env.environment)
131
136
def parse_args(argv=None):
132
parser = ArgumentParser(description='Test recovery strategies.')
133
add_basic_testing_arguments(parser)
135
'--charm-series', help='Charm series.', default='')
137
parser = ArgumentParser('Test recovery strategies.')
139
'--charm-prefix', help='A prefix for charm urls.', default='')
141
'--debug', action='store_true', default=False,
142
help='Use --debug juju logging.')
136
143
strategy = parser.add_argument_group('test strategy')
137
144
strategy.add_argument(
138
145
'--ha', action='store_const', dest='strategy', const='ha',
143
150
strategy.add_argument(
144
151
'--ha-backup', action='store_const', dest='strategy',
145
152
const='ha-backup', help="Test backup/restore of HA.")
153
parser.add_argument('juju_path')
154
parser.add_argument('env_name')
155
parser.add_argument('logs', help='Directory to store logs in.')
157
'temp_env_name', nargs='?',
158
help='Temporary environment name to use for this test.')
146
159
return parser.parse_args(argv)
150
def detect_bootstrap_machine(bs_manager):
166
setup_juju_path(args.juju_path)
167
client = make_client(args.juju_path, args.debug, args.env_name,
169
juju_home = get_juju_home()
170
ensure_deleted(get_jenv_path(juju_home, client.env.environment))
171
with temp_bootstrap_env(juju_home, client):
173
bootstrap_host = get_machine_dns_name(client, 0)
175
instance_id = deploy_stack(client, args.charm_prefix)
176
if args.strategy in ('ha', 'ha-backup'):
177
client.juju('ensure-availability', ('-n', '3'))
179
if args.strategy in ('ha-backup', 'backup'):
180
backup_file = client.backup()
181
restore_present_state_server(client, backup_file)
182
if args.strategy == 'ha-backup':
183
delete_extra_state_servers(client, instance_id)
184
delete_instance(client, instance_id)
185
wait_for_state_server_to_shutdown(bootstrap_host, client,
187
bootstrap_host = None
188
if args.strategy == 'ha':
189
client.get_status(600)
191
restore_missing_state_server(client, backup_file)
192
except Exception as e:
193
if bootstrap_host is None:
194
bootstrap_host = parse_new_state_server_from_error(e)
195
dump_env_logs(client, bootstrap_host, log_dir)
198
client.destroy_environment()
153
199
except Exception as e:
154
bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
158
def assess_recovery(bs_manager, strategy, charm_series):
159
log.info("Setting up test.")
160
client = bs_manager.client
161
deploy_stack(client, charm_series)
162
client.set_config('dummy-source', {'token': ''})
163
log.info("Setup complete.")
164
log.info("Test started.")
165
controller_client = client.get_controller_client()
166
if strategy in ('ha', 'ha-backup'):
167
controller_client.enable_ha()
168
controller_client.wait_for_ha()
169
if strategy in ('ha-backup', 'backup'):
170
backup_file = controller_client.backup()
171
restore_present_state_server(controller_client, backup_file)
176
deleted_machine_ids = delete_controller_members(
177
controller_client, leader_only=leader_only)
178
log.info("Deleted {}".format(deleted_machine_ids))
179
for m_id in deleted_machine_ids:
180
if bs_manager.known_hosts.get(m_id):
181
del bs_manager.known_hosts[m_id]
183
client.get_status(600)
184
log.info("HA recovered from leader failure.")
187
check_controller = strategy != 'ha-backup'
188
restore_missing_state_server(client, controller_client, backup_file,
189
check_controller=check_controller)
190
log.info("Test complete.")
194
args = parse_args(argv)
195
configure_logging(args.verbose)
196
bs_manager = BootstrapManager.from_args(args)
197
with bs_manager.booted_context(upload_tools=args.upload_tools):
198
with detect_bootstrap_machine(bs_manager):
199
assess_recovery(bs_manager, args.strategy, args.charm_series)
200
print_now("\nEXCEPTION CAUGHT:\n")
202
if getattr(e, 'output', None):
202
209
if __name__ == '__main__':