38
34
running_instance_pattern = re.compile('\["([^"]+)"\]')
41
log = logging.getLogger("assess_recovery")
44
def check_token(client, token):
45
for ignored in until_timeout(300):
46
found = get_token_from_status(client)
47
if found and token in found:
49
raise JujuAssertionError('Token is not {}: {}'.format(
53
37
def deploy_stack(client, charm_series):
54
38
""""Deploy a simple stack, state-server and ubuntu."""
55
deploy_dummy_stack(client, charm_series)
56
client.set_config('dummy-source', {'token': 'One'})
57
client.wait_for_workloads()
58
check_token(client, 'One')
59
log.info("%s is ready to testing", client.env.environment)
62
def show_controller(client):
63
controller_info = client.show_controller(format='yaml')
64
log.info('Controller is:\n{}'.format(controller_info))
67
def restore_present_state_server(controller_client, backup_file):
39
charm = local_charm_path(
40
charm='ubuntu', juju_ver=client.version, series=charm_series)
41
client.deploy(charm, series=charm_series)
42
client.wait_for_started().status
43
print_now("%s is ready to testing" % client.env.environment)
46
def restore_present_state_server(admin_client, backup_file):
68
47
"""juju-restore won't restore when the state-server is still present."""
70
controller_client.restore_backup(backup_file)
71
except CalledProcessError:
49
output = admin_client.restore_backup(backup_file)
50
except CalledProcessError as e:
73
52
"juju-restore correctly refused to restore "
74
53
"because the state-server was still up.")
54
match = running_instance_pattern.search(e.stderr)
56
print_now("WARNING: Could not find the instance_id in output:")
78
"juju-restore restored to an operational state-serve")
63
"juju-restore restored to an operational state-server: %s" %
81
67
def delete_controller_members(client, leader_only=False):
94
80
deleted_machines = []
95
81
for machine in members:
96
82
instance_id = machine.info.get('instance-id')
97
if client.env.config['type'] == 'azure':
98
instance_id = convert_to_azure_ids(client, [instance_id])[0]
99
83
host = machine.info.get('dns-name')
100
log.info("Instrumenting node failure for member {}: {} at {}".format(
101
machine.machine_id, instance_id, host))
84
print_now("Instrumenting node failure for member {}: {} at {}".format(
85
machine.machine_id, instance_id, host))
102
86
terminate_instances(client.env, [instance_id])
103
wait_for_state_server_to_shutdown(
104
host, client, instance_id, timeout=120)
87
wait_for_state_server_to_shutdown(host, client, instance_id)
105
88
deleted_machines.append(machine.machine_id)
106
89
return deleted_machines
109
def restore_missing_state_server(client, controller_client, backup_file,
110
check_controller=True):
92
def restore_missing_state_server(client, admin_client, backup_file):
111
93
"""juju-restore creates a replacement state-server for the services."""
112
log.info("Starting restore.")
94
print_now("Starting restore.")
114
controller_client.restore_backup(backup_file)
96
output = admin_client.restore_backup(backup_file)
115
97
except CalledProcessError as e:
116
log.info('Call of juju restore exited with an error\n')
117
log.info('Call: %r\n', e.cmd)
98
print_now('Call of juju restore exited with an error\n')
99
print_now('Call: {} \n'.format(e.cmd))
100
message = 'Restore failed: \n%s' % e.stderr
119
104
raise LoggedException(e)
121
controller_client.wait_for_started(600)
122
show_controller(client)
123
client.set_config('dummy-source', {'token': 'Two'})
124
client.wait_for_started()
125
client.wait_for_workloads()
126
check_token(client, 'Two')
127
log.info("%s restored", client.env.environment)
106
admin_client.wait_for_started(600).status
107
print_now("%s restored" % client.env.environment)
131
111
def parse_args(argv=None):
132
parser = ArgumentParser(description='Test recovery strategies.')
133
add_basic_testing_arguments(parser)
112
parser = ArgumentParser('Test recovery strategies.')
134
113
parser.add_argument(
135
114
'--charm-series', help='Charm series.', default='')
116
'--debug', action='store_true', default=False,
117
help='Use --debug juju logging.')
136
118
strategy = parser.add_argument_group('test strategy')
137
119
strategy.add_argument(
138
120
'--ha', action='store_const', dest='strategy', const='ha',
143
125
strategy.add_argument(
144
126
'--ha-backup', action='store_const', dest='strategy',
145
127
const='ha-backup', help="Test backup/restore of HA.")
128
parser.add_argument('juju_path')
129
parser.add_argument('env_name')
130
parser.add_argument('logs', help='Directory to store logs in.')
132
'temp_env_name', nargs='?',
133
help='Temporary environment name to use for this test.')
135
'--agent-stream', help='Stream for retrieving agent binaries.')
137
'--series', help='Name of the Ubuntu series to use.')
146
138
return parser.parse_args(argv)
141
def make_client_from_args(args):
142
return make_client(args.juju_path, args.debug, args.env_name,
150
147
def detect_bootstrap_machine(bs_manager):
158
155
def assess_recovery(bs_manager, strategy, charm_series):
159
log.info("Setting up test.")
160
156
client = bs_manager.client
161
157
deploy_stack(client, charm_series)
162
client.set_config('dummy-source', {'token': ''})
163
log.info("Setup complete.")
164
log.info("Test started.")
165
controller_client = client.get_controller_client()
158
admin_client = client.get_admin_client()
166
159
if strategy in ('ha', 'ha-backup'):
167
controller_client.enable_ha()
168
controller_client.wait_for_ha()
160
admin_client.enable_ha()
161
admin_client.wait_for_ha()
169
162
if strategy in ('ha-backup', 'backup'):
170
backup_file = controller_client.backup()
171
restore_present_state_server(controller_client, backup_file)
163
backup_file = admin_client.backup()
164
restore_present_state_server(admin_client, backup_file)
172
165
if strategy == 'ha':
173
166
leader_only = True
175
168
leader_only = False
176
169
deleted_machine_ids = delete_controller_members(
177
controller_client, leader_only=leader_only)
178
log.info("Deleted {}".format(deleted_machine_ids))
170
admin_client, leader_only=leader_only)
179
171
for m_id in deleted_machine_ids:
180
172
if bs_manager.known_hosts.get(m_id):
181
173
del bs_manager.known_hosts[m_id]
182
174
if strategy == 'ha':
183
175
client.get_status(600)
184
log.info("HA recovered from leader failure.")
187
check_controller = strategy != 'ha-backup'
188
restore_missing_state_server(client, controller_client, backup_file,
189
check_controller=check_controller)
190
log.info("Test complete.")
177
restore_missing_state_server(client, admin_client, backup_file)
194
181
args = parse_args(argv)
195
configure_logging(args.verbose)
196
bs_manager = BootstrapManager.from_args(args)
197
with bs_manager.booted_context(upload_tools=args.upload_tools):
182
client = make_client_from_args(args)
183
jes_enabled = client.is_jes_enabled()
184
bs_manager = BootstrapManager(
185
client.env.environment, client, client, None, [], args.series,
186
agent_url=None, agent_stream=args.agent_stream, region=None,
187
log_dir=args.logs, keep_env=False, permanent=jes_enabled,
188
jes_enabled=jes_enabled)
189
with bs_manager.booted_context(upload_tools=False):
198
190
with detect_bootstrap_machine(bs_manager):
199
191
assess_recovery(bs_manager, args.strategy, args.charm_series)