296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
1 |
#!/usr/bin/env python
|
2 |
# Backup and restore a stack.
|
|
3 |
||
4 |
from __future__ import print_function |
|
5 |
||
6 |
from argparse import ArgumentParser |
|
1321.1.1
by Aaron Bentley
assess_recovery: perform all state-server operations in admin model |
7 |
from contextlib import contextmanager |
1326.1.4
by Aaron Bentley
Improve restore error handling. |
8 |
import logging |
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
9 |
import re |
1173.4.10
by Aaron Bentley
Implement restore via juju client. |
10 |
from subprocess import CalledProcessError |
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
11 |
import sys |
12 |
||
451
by Aaron Bentley
Handle logging and cleanup in python. |
13 |
from deploy_stack import ( |
1173.4.3
by Aaron Bentley
Switch to BootstrapManager. |
14 |
BootstrapManager, |
1593.2.3
by Curtis Hovey
Check token to verify model is working. |
15 |
deploy_dummy_stack, |
1727.1.3
by Curtis Hovey
Update controller_client.known_hosts |
16 |
get_remote_machines, |
1593.2.3
by Curtis Hovey
Check token to verify model is working. |
17 |
get_token_from_status, |
717.2.2
by Aaron Bentley
Checkpoint with assess_recovery working. |
18 |
wait_for_state_server_to_shutdown, |
953.3.9
by Nate Finch
more code review changes |
19 |
)
|
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
20 |
from jujupy import ( |
1153.4.3
by Martin Packman
Changes from review by abentley |
21 |
parse_new_state_server_from_error, |
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
22 |
)
|
717.2.2
by Aaron Bentley
Checkpoint with assess_recovery working. |
23 |
from substrate import ( |
1452.3.8
by Curtis Hovey
Call convert_to_azure_ids for azure. |
24 |
convert_to_azure_ids, |
717.2.2
by Aaron Bentley
Checkpoint with assess_recovery working. |
25 |
terminate_instances, |
953.3.9
by Nate Finch
more code review changes |
26 |
)
|
379.1.1
by Aaron Bentley
Move portions of deploy job to Python. |
27 |
from utility import ( |
1386.2.1
by Martin Packman
Make assess_recovery use standard argument parsing and client setup |
28 |
add_basic_testing_arguments, |
29 |
configure_logging, |
|
1593.2.3
by Curtis Hovey
Check token to verify model is working. |
30 |
JujuAssertionError, |
1326.1.4
by Aaron Bentley
Improve restore error handling. |
31 |
LoggedException, |
1610.2.1
by Curtis Hovey
Poll for the token, whcih might be None. |
32 |
until_timeout, |
379.1.1
by Aaron Bentley
Move portions of deploy job to Python. |
33 |
)
|
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
34 |
|
35 |
||
1092.2.2
by Aaron Bentley
Fix lint. |
36 |
# Make classes defined in this module new-style when run under Python 2.
__metaclass__ = type


# Matches a double-quoted value wrapped in square brackets, e.g. ["i-123"],
# and captures the text between the quotes.  A raw string is used so the
# backslash escapes reach the regex engine without relying on Python passing
# unknown string escapes through unchanged.
running_instance_pattern = re.compile(r'\["([^"]+)"\]')


# Named logger used for all output from this script.
log = logging.getLogger("assess_recovery")
43 |
||
44 |
||
1727.1.1
by Curtis Hovey
Added rules to set has_controller and limit ha check to 5 minutes. |
45 |
class HARecoveryError(Exception):
    """Raised when the controllers do not respond after an HA failure."""
|
|
47 |
||
48 |
||
1593.2.3
by Curtis Hovey
Check token to verify model is working. |
49 |
def check_token(client, token, timeout=300):
    """Poll the model status until it reports the expected token.

    :param client: The juju client passed to get_token_from_status.
    :param token: Text that must be contained in the token found in status.
    :param timeout: Value handed to until_timeout to bound the polling
        (presumably seconds -- confirm against until_timeout).
    :return: The token found in status once it contains `token`.
    :raises JujuAssertionError: If polling ends without a matching token.
    """
    # The status may not report a token yet, so the last value seen can be
    # None.  Starting from None also guarantees the error below can always
    # be formatted, even if until_timeout never yields.
    found = None
    for _ in until_timeout(timeout):
        found = get_token_from_status(client)
        if found and token in found:
            return found
    raise JujuAssertionError('Token is not {}: {}'.format(token, found))
|
1593.2.3
by Curtis Hovey
Check token to verify model is working. |
56 |
|
57 |
||
1345.1.5
by Seman
Deploy charm by path #2. |
58 |
def deploy_stack(client, charm_series):
    """Deploy the dummy stack and confirm it reports the initial token.

    The stack is deployed with deploy_dummy_stack, the 'dummy-source'
    token is set to 'One', and the function waits for the workloads before
    checking that the token is visible in status.
    """
    initial_token = 'One'
    deploy_dummy_stack(client, charm_series)
    client.set_config('dummy-source', {'token': initial_token})
    client.wait_for_workloads()
    check_token(client, initial_token)
    environment_name = client.env.environment
    log.info("%s is ready to testing", environment_name)
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
65 |
|
66 |
||
1593.2.5
by Curtis Hovey
Fix tests. |
67 |
def show_controller(client):
    """Log the controller details the client reports in YAML format."""
    details = client.show_controller(format='yaml')
    message = 'Controller is:\n{}'.format(details)
    log.info(message)
|
70 |
||
71 |
||
1727.1.3
by Curtis Hovey
Update controller_client.known_hosts |
72 |
def enable_ha(bs_manager, controller_client):
    """Enable HA, wait for it, and refresh the manager's known hosts.

    After HA is ready the controller is logged and bs_manager.known_hosts
    is replaced with the machines returned by get_remote_machines.
    """
    controller_client.enable_ha()
    controller_client.wait_for_ha()
    show_controller(controller_client)
    bs_manager.known_hosts = get_remote_machines(
        controller_client, bs_manager.known_hosts)
1727.1.1
by Curtis Hovey
Added rules to set has_controller and limit ha check to 5 minutes. |
80 |
|
81 |
||
82 |
def assess_ha_recovery(bs_manager, client):
    """Verify that the client can talk to a controller.

    The controller is given 5 minutes to respond to the client's status
    request, and possibly another 5 minutes to return a sensible status.

    Raises HARecoveryError if either status call fails with
    CalledProcessError.  On success bs_manager.has_controller is set to True.
    """
    status_timeout = 300
    # Juju commands will hang when the controller is down, so both calls are
    # given a timeout; a CalledProcessError from either one is reported as
    # HARecoveryError.
    try:
        client.juju('status', (), check=True, timeout=status_timeout)
        client.get_status(status_timeout)
    except CalledProcessError:
        raise HARecoveryError()
    bs_manager.has_controller = True
    for message in ("HA recovered from leader failure.", "PASS"):
        log.info(message)
|
100 |
||
101 |
||
1493.1.1
by Martin
Rename methods and variables refering to admin model to new term controller model |
102 |
def restore_present_state_server(controller_client, backup_file):
    """Check that restore is refused while the state-server is present.

    A CalledProcessError from restore_backup is the expected outcome and is
    logged.  If the restore succeeds instead, an Exception is raised.
    """
    try:
        controller_client.restore_backup(backup_file)
    except CalledProcessError:
        refusal_message = (
            "juju-restore correctly refused to restore "
            "because the state-server was still up.")
        log.info(refusal_message)
        return
    # Reaching this point means the restore unexpectedly succeeded.
    raise Exception(
        "juju-restore restored to an operational state-serve")
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
114 |
|
115 |
||
1727.1.7
by Curtis Hovey
Move delete known_hosts into delete_controller_members. |
116 |
def delete_controller_members(bs_manager, client, leader_only=False):
    """Terminate controller machines and forget their known hosts.

    By default all controller members are deleted; the member list is
    reversed first so that (per the original author) the followers are
    deleted before the leader, simulating a total controller failure.
    When leader_only is true, only the leader is deleted, to trigger a new
    leader election.

    Returns the list of machine ids that were deleted.
    """
    if leader_only:
        targets = [client.get_controller_leader()]
    else:
        targets = client.get_controller_members()
        targets.reverse()
    removed_ids = []
    for member in targets:
        instance_id = member.info.get('instance-id')
        if client.env.provider == 'azure':
            # Azure needs the instance id converted before termination.
            azure_ids = convert_to_azure_ids(client, [instance_id])
            instance_id = azure_ids[0]
        host = member.info.get('dns-name')
        log.info("Instrumenting node failure for member {}: {} at {}".format(
            member.machine_id, instance_id, host))
        terminate_instances(client.env, [instance_id])
        wait_for_state_server_to_shutdown(
            host, client, instance_id, timeout=120)
        removed_ids.append(member.machine_id)
    log.info("Deleted {}".format(removed_ids))
    # Do not gather data about the deleted controller.
    if not leader_only:
        bs_manager.has_controller = False
    for machine_id in removed_ids:
        known = bs_manager.known_hosts
        if known.get(machine_id):
            del known[machine_id]
    return removed_ids
1306.1.13
by Curtis Hovey
Add delete_controller_members. |
149 |
|
150 |
||
1787.1.2
by Aaron Bentley
Ensure has_controller is True at appropriate times. |
151 |
def restore_missing_state_server(bs_manager, controller_client, backup_file,
                                 check_controller=True):
    """Restore from backup_file and verify the model works afterwards.

    A CalledProcessError from the restore is logged and re-raised as
    LoggedException.  When check_controller is true, the controller client
    is waited on before the model is checked.  The model is then given the
    token 'Two' and must report it in status.
    """
    log.info("Starting restore.")
    try:
        controller_client.restore_backup(backup_file)
    except CalledProcessError as error:
        log.info('Call of juju restore exited with an error\n')
        log.info('Call: %r\n', error.cmd)
        log.exception(error)
        raise LoggedException(error)
    if check_controller:
        controller_client.wait_for_started(600)
    model_client = bs_manager.client
    show_controller(model_client)
    bs_manager.has_controller = True
    restored_token = 'Two'
    model_client.set_config('dummy-source', {'token': restored_token})
    model_client.wait_for_started()
    model_client.wait_for_workloads()
    check_token(model_client, restored_token)
    log.info("%s restored", model_client.env.environment)
    log.info("PASS")
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
172 |
|
173 |
||
817.1.3
by Aaron Bentley
Add --debug to assess_recovery. |
174 |
def parse_args(argv=None):
    """Parse the command line for the recovery test.

    Adds the basic testing arguments, --charm-series, and the mutually
    overriding strategy flags (--ha, --backup, --ha-backup), which all store
    into `strategy`; the strategy defaults to 'backup'.
    """
    parser = ArgumentParser(description='Test recovery strategies.')
    add_basic_testing_arguments(parser)
    parser.add_argument(
        '--charm-series', default='', help='Charm series.')
    group = parser.add_argument_group('test strategy')
    # Every strategy flag stores its constant into the same destination.
    shared = dict(action='store_const', dest='strategy')
    group.add_argument(
        '--ha', const='ha', default='backup', help="Test HA.", **shared)
    group.add_argument(
        '--backup', const='backup', help="Test backup/restore.", **shared)
    group.add_argument(
        '--ha-backup', const='ha-backup',
        help="Test backup/restore of HA.", **shared)
    return parser.parse_args(argv)
190 |
||
191 |
||
1321.1.1
by Aaron Bentley
assess_recovery: perform all state-server operations in admin model |
192 |
@contextmanager
def detect_bootstrap_machine(bs_manager):
    """Record a new state-server address found in a raised error.

    Any exception escaping the managed block is passed to
    parse_new_state_server_from_error; when that yields an address it is
    stored as known host '0'.  The exception is always re-raised.
    """
    try:
        yield
    except Exception as error:
        new_address = parse_new_state_server_from_error(error)
        # Never store None as a known_hosts address.
        if new_address is not None:
            bs_manager.known_hosts['0'] = new_address
        raise
|
201 |
||
202 |
||
1345.1.5
by Seman
Deploy charm by path #2. |
203 |
def assess_recovery(bs_manager, strategy, charm_series):
    """Run the recovery test selected by strategy.

    strategy is one of 'ha', 'backup' or 'ha-backup':

    - 'ha' enables HA, deletes only the leader, and checks HA recovery.
    - 'backup' takes a backup, deletes all controller members, and restores.
    - 'ha-backup' enables HA, then follows the backup path; the controller
      check is skipped during the restore.
    """
    log.info("Setting up test.")
    client = bs_manager.client
    deploy_stack(client, charm_series)
    # Clear the token after setup; the restore path later sets a new one.
    client.set_config('dummy-source', {'token': ''})
    log.info("Setup complete.")
    log.info("Test started.")
    controller_client = client.get_controller_client()
    ha_only = strategy == 'ha'
    if strategy in ('ha', 'ha-backup'):
        enable_ha(bs_manager, controller_client)
    if strategy in ('ha-backup', 'backup'):
        backup_file = controller_client.backup()
        restore_present_state_server(controller_client, backup_file)
    delete_controller_members(
        bs_manager, controller_client, leader_only=ha_only)
    if ha_only:
        assess_ha_recovery(bs_manager, client)
    else:
        restore_missing_state_server(
            bs_manager, controller_client, backup_file,
            check_controller=(strategy != 'ha-backup'))
    log.info("Test complete.")
1321.1.1
by Aaron Bentley
assess_recovery: perform all state-server operations in admin model |
230 |
|
231 |
||
990.1.5
by Curtis Hovey
Added main test for assess_recovery.py ha. |
232 |
def main(argv):
    """Parse argv, boot an environment, and run the recovery assessment."""
    options = parse_args(argv)
    configure_logging(options.verbose)
    manager = BootstrapManager.from_args(options)
    booted = manager.booted_context(upload_tools=options.upload_tools)
    with booted, detect_bootstrap_machine(manager):
        assess_recovery(manager, options.strategy, options.charm_series)
296
by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't |
239 |
|
240 |
||
241 |
if __name__ == '__main__':
    # Run the test with the command-line arguments, minus the program name.
    main(sys.argv[1:])