2
# Backup and restore a stack.
4
from __future__ import print_function
6
from argparse import ArgumentParser
7
from contextlib import contextmanager
10
from subprocess import CalledProcessError
13
from deploy_stack import (
16
get_token_from_status,
17
wait_for_state_server_to_shutdown,
20
parse_new_state_server_from_error,
22
from substrate import (
27
add_basic_testing_arguments,
38
# Captures the text between the double quotes of a bracketed item such as
# ["i-abc"].  A raw string is used so the backslashes reach the regex engine
# without relying on Python passing unknown escape sequences through.
running_instance_pattern = re.compile(r'\["([^"]+)"\]')


# Module-level logger shared by every function in this script.
log = logging.getLogger("assess_recovery")
44
def check_token(client, token):
45
for ignored in until_timeout(300):
46
found = get_token_from_status(client)
47
if found and token in found:
49
raise JujuAssertionError('Token is not {}: {}'.format(
53
def deploy_stack(client, charm_series):
    """Deploy a simple stack, state-server and ubuntu.

    Deploys the dummy stack, sets the 'dummy-source' token to 'One', waits
    for the workloads, and then verifies the token with check_token.

    :param client: Client used to deploy and configure the stack; it must
        provide set_config(), wait_for_workloads() and env.environment.
    :param charm_series: Charm series passed through to deploy_dummy_stack.
    """
    deploy_dummy_stack(client, charm_series)
    client.set_config('dummy-source', {'token': 'One'})
    client.wait_for_workloads()
    # Confirm the configured token is reported before declaring readiness.
    check_token(client, 'One')
    log.info("%s is ready to testing", client.env.environment)
62
def show_controller(client):
    """Log the controller details the client reports in YAML format."""
    details = client.show_controller(format='yaml')
    message = 'Controller is:\n{}'.format(details)
    log.info(message)
67
def restore_present_state_server(controller_client, backup_file):
68
"""juju-restore won't restore when the state-server is still present."""
70
controller_client.restore_backup(backup_file)
71
except CalledProcessError:
73
"juju-restore correctly refused to restore "
74
"because the state-server was still up.")
78
"juju-restore restored to an operational state-serve")
81
def delete_controller_members(client, leader_only=False):
82
"""Delete controller members.
84
All members are deleted by default. The followers are deleted before the
85
leader to simulate a total controller failure. When leader_only is true,
86
the leader is deleted to trigger a new leader election.
89
leader = client.get_controller_leader()
92
members = client.get_controller_members()
95
for machine in members:
96
instance_id = machine.info.get('instance-id')
97
if client.env.config['type'] == 'azure':
98
instance_id = convert_to_azure_ids(client, [instance_id])[0]
99
host = machine.info.get('dns-name')
100
log.info("Instrumenting node failure for member {}: {} at {}".format(
101
machine.machine_id, instance_id, host))
102
terminate_instances(client.env, [instance_id])
103
wait_for_state_server_to_shutdown(
104
host, client, instance_id, timeout=120)
105
deleted_machines.append(machine.machine_id)
106
return deleted_machines
109
def restore_missing_state_server(client, controller_client, backup_file,
110
check_controller=True):
111
"""juju-restore creates a replacement state-server for the services."""
112
log.info("Starting restore.")
114
controller_client.restore_backup(backup_file)
115
except CalledProcessError as e:
116
log.info('Call of juju restore exited with an error\n')
117
log.info('Call: %r\n', e.cmd)
119
raise LoggedException(e)
121
controller_client.wait_for_started(600)
122
show_controller(client)
123
client.set_config('dummy-source', {'token': 'Two'})
124
client.wait_for_started()
125
client.wait_for_workloads()
126
check_token(client, 'Two')
127
log.info("%s restored", client.env.environment)
131
def parse_args(argv=None):
    """Parse the command line for the recovery test.

    :param argv: Argument list to parse; None lets argparse read sys.argv.
    :return: The argparse namespace. Besides whatever
        add_basic_testing_arguments registers, it carries charm_series
        (default '') and strategy, which is one of 'backup' (the default),
        'ha' or 'ha-backup'.
    """
    parser = ArgumentParser(description='Test recovery strategies.')
    add_basic_testing_arguments(parser)
    parser.add_argument(
        '--charm-series', help='Charm series.', default='')
    strategy = parser.add_argument_group('test strategy')
    # All three flags write to the same dest; the default for that dest is
    # declared once, on the first flag.
    strategy.add_argument(
        '--ha', action='store_const', dest='strategy', const='ha',
        default='backup', help="Test HA.")
    strategy.add_argument(
        '--backup', action='store_const', dest='strategy', const='backup',
        help="Test backup/restore.")
    strategy.add_argument(
        '--ha-backup', action='store_const', dest='strategy',
        const='ha-backup', help="Test backup/restore of HA.")
    return parser.parse_args(argv)
150
def detect_bootstrap_machine(bs_manager):
153
except Exception as e:
154
bs_manager.known_hosts['0'] = parse_new_state_server_from_error(e)
158
def assess_recovery(bs_manager, strategy, charm_series):
159
log.info("Setting up test.")
160
client = bs_manager.client
161
deploy_stack(client, charm_series)
162
client.set_config('dummy-source', {'token': ''})
163
log.info("Setup complete.")
164
log.info("Test started.")
165
controller_client = client.get_controller_client()
166
if strategy in ('ha', 'ha-backup'):
167
controller_client.enable_ha()
168
controller_client.wait_for_ha()
169
if strategy in ('ha-backup', 'backup'):
170
backup_file = controller_client.backup()
171
restore_present_state_server(controller_client, backup_file)
176
deleted_machine_ids = delete_controller_members(
177
controller_client, leader_only=leader_only)
178
log.info("Deleted {}".format(deleted_machine_ids))
179
for m_id in deleted_machine_ids:
180
if bs_manager.known_hosts.get(m_id):
181
del bs_manager.known_hosts[m_id]
183
client.get_status(600)
184
log.info("HA recovered from leader failure.")
187
check_controller = strategy != 'ha-backup'
188
restore_missing_state_server(client, controller_client, backup_file,
189
check_controller=check_controller)
190
log.info("Test complete.")
194
args = parse_args(argv)
195
configure_logging(args.verbose)
196
bs_manager = BootstrapManager.from_args(args)
197
with bs_manager.booted_context(upload_tools=args.upload_tools):
198
with detect_bootstrap_machine(bs_manager):
199
assess_recovery(bs_manager, args.strategy, args.charm_series)
202
if __name__ == '__main__':