34
37
# Captures the double-quoted value inside a bracketed list such as ["i-abc"];
# it is searched against the stderr of a failed restore to find an instance id.
# A raw string is used so that \[ and \] reach the regex engine untouched
# instead of being treated as (invalid) string escape sequences.
running_instance_pattern = re.compile(r'\["([^"]+)"\]')


# Module-level logger shared by every step of this script.
log = logging.getLogger("assess_recovery")
43
def check_token(client, token):
44
found = get_token_from_status(client)
45
if token not in found:
46
raise JujuAssertionError('Token is not {}: {}'.format(
37
50
def deploy_stack(client, charm_series):
38
51
""""Deploy a simple stack, state-server and ubuntu."""
39
charm = local_charm_path(
40
charm='ubuntu', juju_ver=client.version, series=charm_series)
41
client.deploy(charm, series=charm_series)
42
client.wait_for_started().status
43
print_now("%s is ready to testing" % client.env.environment)
46
def restore_present_state_server(admin_client, backup_file):
52
deploy_dummy_stack(client, charm_series)
53
client.set_config('dummy-source', {'token': 'One'})
54
client.wait_for_workloads()
55
check_token(client, 'One')
56
log.info("%s is ready to testing", client.env.environment)
59
def show_controller(client):
    """Log the output of the client's show_controller call in YAML format.

    :param client: object exposing ``show_controller(format=...)``.
    """
    details = client.show_controller(format='yaml')
    message = 'Controller is:\n{}'.format(details)
    log.info(message)
64
def restore_present_state_server(controller_client, backup_file):
47
65
"""juju-restore won't restore when the state-server is still present."""
49
output = admin_client.restore_backup(backup_file)
50
except CalledProcessError as e:
67
controller_client.restore_backup(backup_file)
68
except CalledProcessError:
52
70
"juju-restore correctly refused to restore "
53
71
"because the state-server was still up.")
54
match = running_instance_pattern.search(e.stderr)
56
print_now("WARNING: Could not find the instance_id in output:")
63
"juju-restore restored to an operational state-server: %s" %
75
"juju-restore restored to an operational state-serve")
67
78
def delete_controller_members(client, leader_only=False):
80
91
deleted_machines = []
81
92
for machine in members:
82
93
instance_id = machine.info.get('instance-id')
94
if client.env.config['type'] == 'azure':
95
instance_id = convert_to_azure_ids(client, [instance_id])[0]
83
96
host = machine.info.get('dns-name')
84
print_now("Instrumenting node failure for member {}: {} at {}".format(
85
machine.machine_id, instance_id, host))
97
log.info("Instrumenting node failure for member {}: {} at {}".format(
98
machine.machine_id, instance_id, host))
86
99
terminate_instances(client.env, [instance_id])
87
wait_for_state_server_to_shutdown(host, client, instance_id)
100
wait_for_state_server_to_shutdown(
101
host, client, instance_id, timeout=120)
88
102
deleted_machines.append(machine.machine_id)
89
103
return deleted_machines
92
def restore_missing_state_server(client, admin_client, backup_file):
106
def restore_missing_state_server(client, controller_client, backup_file):
93
107
"""juju-restore creates a replacement state-server for the services."""
94
print_now("Starting restore.")
108
log.info("Starting restore.")
96
output = admin_client.restore_backup(backup_file)
110
controller_client.restore_backup(backup_file)
97
111
except CalledProcessError as e:
98
print_now('Call of juju restore exited with an error\n')
99
print_now('Call: {} \n'.format(e.cmd))
100
message = 'Restore failed: \n%s' % e.stderr
112
log.info('Call of juju restore exited with an error\n')
113
log.info('Call: %r\n', e.cmd)
104
115
raise LoggedException(e)
106
admin_client.wait_for_started(600).status
107
print_now("%s restored" % client.env.environment)
116
controller_client.wait_for_started(600)
117
show_controller(client)
118
client.set_config('dummy-source', {'token': 'Two'})
119
client.wait_for_started()
120
client.wait_for_workloads()
121
check_token(client, 'Two')
122
log.info("%s restored", client.env.environment)
111
126
def parse_args(argv=None):
112
parser = ArgumentParser('Test recovery strategies.')
127
parser = ArgumentParser(description='Test recovery strategies.')
128
add_basic_testing_arguments(parser)
113
129
parser.add_argument(
114
130
'--charm-series', help='Charm series.', default='')
116
'--debug', action='store_true', default=False,
117
help='Use --debug juju logging.')
118
131
strategy = parser.add_argument_group('test strategy')
119
132
strategy.add_argument(
120
133
'--ha', action='store_const', dest='strategy', const='ha',
125
138
strategy.add_argument(
126
139
'--ha-backup', action='store_const', dest='strategy',
127
140
const='ha-backup', help="Test backup/restore of HA.")
128
parser.add_argument('juju_path')
129
parser.add_argument('env_name')
130
parser.add_argument('logs', help='Directory to store logs in.')
132
'temp_env_name', nargs='?',
133
help='Temporary environment name to use for this test.')
135
'--agent-stream', help='Stream for retrieving agent binaries.')
137
'--series', help='Name of the Ubuntu series to use.')
138
141
return parser.parse_args(argv)
141
def make_client_from_args(args):
142
return make_client(args.juju_path, args.debug, args.env_name,
147
145
def detect_bootstrap_machine(bs_manager):
155
153
def assess_recovery(bs_manager, strategy, charm_series):
154
log.info("Setting up test.")
156
155
client = bs_manager.client
157
156
deploy_stack(client, charm_series)
158
admin_client = client.get_admin_client()
157
log.info("Setup complete.")
158
log.info("Test started.")
159
controller_client = client.get_controller_client()
159
160
if strategy in ('ha', 'ha-backup'):
160
admin_client.enable_ha()
161
admin_client.wait_for_ha()
161
controller_client.enable_ha()
162
controller_client.wait_for_ha()
162
163
if strategy in ('ha-backup', 'backup'):
163
backup_file = admin_client.backup()
164
restore_present_state_server(admin_client, backup_file)
164
backup_file = controller_client.backup()
165
restore_present_state_server(controller_client, backup_file)
165
166
if strategy == 'ha':
166
167
leader_only = True
168
169
leader_only = False
169
170
deleted_machine_ids = delete_controller_members(
170
admin_client, leader_only=leader_only)
171
controller_client, leader_only=leader_only)
172
log.info("Deleted {}".format(deleted_machine_ids))
171
173
for m_id in deleted_machine_ids:
172
174
if bs_manager.known_hosts.get(m_id):
173
175
del bs_manager.known_hosts[m_id]
174
176
if strategy == 'ha':
175
177
client.get_status(600)
178
log.info("HA recovered from leader failure.")
177
restore_missing_state_server(client, admin_client, backup_file)
181
restore_missing_state_server(client, controller_client, backup_file)
182
log.info("Test complete.")
181
186
args = parse_args(argv)
182
client = make_client_from_args(args)
183
jes_enabled = client.is_jes_enabled()
184
bs_manager = BootstrapManager(
185
client.env.environment, client, client, None, [], args.series,
186
agent_url=None, agent_stream=args.agent_stream, region=None,
187
log_dir=args.logs, keep_env=False, permanent=jes_enabled,
188
jes_enabled=jes_enabled)
189
with bs_manager.booted_context(upload_tools=False):
187
configure_logging(args.verbose)
188
bs_manager = BootstrapManager.from_args(args)
189
with bs_manager.booted_context(upload_tools=args.upload_tools):
190
190
with detect_bootstrap_machine(bs_manager):
191
191
assess_recovery(bs_manager, args.strategy, args.charm_series)