~veebers/juju-ci-tools/migration-add-migrate-back-to-original

296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
1
#!/usr/bin/env python
2
# Backup and restore a stack.
3
4
from __future__ import print_function
5
6
from argparse import ArgumentParser
1321.1.1 by Aaron Bentley
assess_recovery: perform all state-server operations in admin model
7
from contextlib import contextmanager
1326.1.4 by Aaron Bentley
Improve restore error handling.
8
import logging
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
9
import re
1173.4.10 by Aaron Bentley
Implement restore via juju client.
10
from subprocess import CalledProcessError
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
11
import sys
12
451 by Aaron Bentley
Handle logging and cleanup in python.
13
from deploy_stack import (
1173.4.3 by Aaron Bentley
Switch to BootstrapManager.
14
    BootstrapManager,
1593.2.3 by Curtis Hovey
Check token to verify model is working.
15
    deploy_dummy_stack,
1727.1.3 by Curtis Hovey
Update controller_client.known_hosts
16
    get_remote_machines,
1593.2.3 by Curtis Hovey
Check token to verify model is working.
17
    get_token_from_status,
717.2.2 by Aaron Bentley
Checkpoint with assess_recovery working.
18
    wait_for_state_server_to_shutdown,
953.3.9 by Nate Finch
more code review changes
19
)
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
20
from jujupy import (
1153.4.3 by Martin Packman
Changes from review by abentley
21
    parse_new_state_server_from_error,
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
22
)
717.2.2 by Aaron Bentley
Checkpoint with assess_recovery working.
23
from substrate import (
1452.3.8 by Curtis Hovey
Call convert_to_azure_ids for azure.
24
    convert_to_azure_ids,
717.2.2 by Aaron Bentley
Checkpoint with assess_recovery working.
25
    terminate_instances,
953.3.9 by Nate Finch
more code review changes
26
)
379.1.1 by Aaron Bentley
Move portions of deploy job to Python.
27
from utility import (
1386.2.1 by Martin Packman
Make assess_recovery use standard argument parsing and client setup
28
    add_basic_testing_arguments,
29
    configure_logging,
1593.2.3 by Curtis Hovey
Check token to verify model is working.
30
    JujuAssertionError,
1326.1.4 by Aaron Bentley
Improve restore error handling.
31
    LoggedException,
1610.2.1 by Curtis Hovey
Poll for the token, whcih might be None.
32
    until_timeout,
379.1.1 by Aaron Bentley
Move portions of deploy job to Python.
33
)
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
34
35
1092.2.2 by Aaron Bentley
Fix lint.
36
# Under Python 2 this makes every class in the module that is declared
# without an explicit base a new-style class; Python 3 ignores the name.
__metaclass__ = type
37
38
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
39
# Matches a double-quoted value wrapped in square brackets, e.g. ["i-123"],
# and captures the text between the quotes.  A raw string is used so that
# the regex escapes \[ and \] are not read as (invalid) string escapes.
running_instance_pattern = re.compile(r'\["([^"]+)"\]')
40
41
1386.2.2 by Martin Packman
Switch assess_recovery from print_now to named logger for output
42
# Named logger used by this script for all progress and result messages.
log = logging.getLogger("assess_recovery")
43
44
1727.1.1 by Curtis Hovey
Added rules to set has_controller and limit ha check to 5 minutes.
45
class HARecoveryError(Exception):
    """Signal that the controllers did not respond to the client."""
47
48
1593.2.3 by Curtis Hovey
Check token to verify model is working.
49
def check_token(client, token, timeout=300):
    """Poll the model status until the expected token shows up.

    :param client: The client passed to get_token_from_status.
    :param token: The value that must be contained in the reported token.
    :param timeout: Seconds handed to until_timeout; defaults to the
        300 seconds this check has always used.
    :return: The token value found in status.
    :raises JujuAssertionError: When polling ends without a status token
        that contains the expected value.
    """
    # The status may not report a token at all, and the polling iterator may
    # yield nothing; start from None so the failure below is always the
    # assertion error rather than an UnboundLocalError.
    found = None
    for ignored in until_timeout(timeout):
        found = get_token_from_status(client)
        if found and token in found:
            return found
    raise JujuAssertionError('Token is not {}: {}'.format(
                             token, found))
1593.2.3 by Curtis Hovey
Check token to verify model is working.
56
57
1345.1.5 by Seman
Deploy charm by path #2.
58
def deploy_stack(client, charm_series):
    """Deploy the dummy stack and confirm it reports its token.

    The dummy-source token is set to 'One', the workloads are awaited, and
    the token is then checked in status before the model is declared ready.
    """
    expected_token = 'One'
    deploy_dummy_stack(client, charm_series)
    client.set_config('dummy-source', {'token': expected_token})
    client.wait_for_workloads()
    check_token(client, expected_token)
    environment_name = client.env.environment
    log.info("%s is ready to testing", environment_name)
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
65
66
1593.2.5 by Curtis Hovey
Fix tests.
67
def show_controller(client):
    """Log the YAML description of the controller the client talks to."""
    description = client.show_controller(format='yaml')
    message = 'Controller is:\n{}'.format(description)
    log.info(message)
70
71
1727.1.3 by Curtis Hovey
Update controller_client.known_hosts
72
def enable_ha(bs_manager, controller_client):
    """Switch the controller to HA and wait until it is ready.

    Once HA is up the controller is logged, and bs_manager.known_hosts is
    replaced with the result of get_remote_machines for the controller.
    """
    controller_client.enable_ha()
    controller_client.wait_for_ha()
    show_controller(controller_client)
    # Refresh the known hosts now that HA has been enabled.
    bs_manager.known_hosts = get_remote_machines(
        controller_client, bs_manager.known_hosts)
1727.1.1 by Curtis Hovey
Added rules to set has_controller and limit ha check to 5 minutes.
80
81
82
def assess_ha_recovery(bs_manager, client):
    """Check that the client can still reach a controller.

    The controller gets 5 minutes to answer the client's status request,
    and possibly another 5 minutes to return a sensible status.

    :raises HARecoveryError: When a juju call fails with
        CalledProcessError.
    """
    five_minutes = 300
    # Juju commands hang while the controller is down, so both calls carry
    # a timeout.  Only CalledProcessError is converted to HARecoveryError;
    # any other exception propagates unchanged.
    try:
        client.juju('status', (), check=True, timeout=five_minutes)
        client.get_status(five_minutes)
    except CalledProcessError:
        raise HARecoveryError()
    # A controller answered, so data may be gathered from it again.
    bs_manager.has_controller = True
    for message in ("HA recovered from leader failure.", "PASS"):
        log.info(message)
100
101
1493.1.1 by Martin
Rename methods and variables refering to admin model to new term controller model
102
def restore_present_state_server(controller_client, backup_file):
    """Verify that restore is refused while the state-server still exists.

    A CalledProcessError from the restore is the expected outcome and is
    logged as success.

    :raises Exception: When the restore unexpectedly succeeds.
    """
    try:
        controller_client.restore_backup(backup_file)
    except CalledProcessError:
        refusal = ("juju-restore correctly refused to restore "
                   "because the state-server was still up.")
        log.info(refusal)
        return
    # Reaching this point means the restore did not fail as it should have.
    raise Exception(
        "juju-restore restored to an operational state-serve")
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
114
115
1727.1.7 by Curtis Hovey
Move delete known_hosts into delete_controller_members.
116
def delete_controller_members(bs_manager, client, leader_only=False):
    """Terminate controller machines to simulate a failure.

    By default every controller member is terminated, walking the member
    list in reverse; this is meant to remove the followers before the
    leader (assumes the client lists the leader first -- confirm against
    get_controller_members).  When leader_only is true, only the leader is
    terminated.

    :return: The machine ids of the terminated members.
    """
    if not leader_only:
        targets = client.get_controller_members()
        targets.reverse()
    else:
        targets = [client.get_controller_leader()]
    removed_ids = []
    for member in targets:
        instance_id = member.info.get('instance-id')
        if client.env.provider == 'azure':
            # Azure needs the instance id converted before termination.
            instance_id = convert_to_azure_ids(client, [instance_id])[0]
        host = member.info.get('dns-name')
        announcement = (
            "Instrumenting node failure for member {}: {} at {}".format(
                member.machine_id, instance_id, host))
        log.info(announcement)
        terminate_instances(client.env, [instance_id])
        wait_for_state_server_to_shutdown(
            host, client, instance_id, timeout=120)
        removed_ids.append(member.machine_id)
    log.info("Deleted {}".format(removed_ids))
    # Do not gather data about the deleted controller.
    if not leader_only:
        bs_manager.has_controller = False
    # Forget the addresses of machines that no longer exist.
    for machine_id in removed_ids:
        hosts = bs_manager.known_hosts
        if hosts.get(machine_id):
            del hosts[machine_id]
    return removed_ids
1306.1.13 by Curtis Hovey
Add delete_controller_members.
149
150
1787.1.2 by Aaron Bentley
Ensure has_controller is True at appropriate times.
151
def restore_missing_state_server(bs_manager, controller_client, backup_file,
                                 check_controller=True):
    """Restore from backup and verify the model works afterwards.

    :param bs_manager: Manager whose client is used for the model checks and
        whose has_controller flag is set once the restore has succeeded.
    :param controller_client: Client used to run the restore.
    :param backup_file: The backup to restore from.
    :param check_controller: When true, wait up to 600 seconds for the
        controller model to start before checking the hosted model.
    :raises LoggedException: When the restore command fails; the failure
        has already been logged.
    """
    log.info("Starting restore.")
    try:
        controller_client.restore_backup(backup_file)
    except CalledProcessError as error:
        log.info('Call of juju restore exited with an error\n')
        log.info('Call:  %r\n', error.cmd)
        log.exception(error)
        raise LoggedException(error)
    if check_controller:
        controller_client.wait_for_started(600)
    model_client = bs_manager.client
    show_controller(model_client)
    bs_manager.has_controller = True
    # A new token shows that the restored controller can drive the workload.
    new_token = 'Two'
    model_client.set_config('dummy-source', {'token': new_token})
    model_client.wait_for_started()
    model_client.wait_for_workloads()
    check_token(model_client, new_token)
    log.info("%s restored", model_client.env.environment)
    log.info("PASS")
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
172
173
817.1.3 by Aaron Bentley
Add --debug to assess_recovery.
174
def parse_args(argv=None):
    """Parse the command line for the recovery test.

    The standard testing arguments are added first, then --charm-series and
    the mutually overriding strategy flags.  The strategy is 'backup' when
    no strategy flag is given.

    :param argv: Argument list to parse; None is passed through to argparse.
    :return: The parsed argument namespace.
    """
    parser = ArgumentParser(description='Test recovery strategies.')
    add_basic_testing_arguments(parser)
    parser.add_argument(
        '--charm-series', default='', help='Charm series.')
    group = parser.add_argument_group('test strategy')
    # (flag, stored constant, help text, extra keyword arguments).  Only the
    # first flag carries the default for the shared 'strategy' destination.
    strategy_flags = (
        ('--ha', 'ha', "Test HA.", {'default': 'backup'}),
        ('--backup', 'backup', "Test backup/restore.", {}),
        ('--ha-backup', 'ha-backup', "Test backup/restore of HA.", {}),
    )
    for flag, const, help_text, extra in strategy_flags:
        group.add_argument(
            flag, action='store_const', dest='strategy', const=const,
            help=help_text, **extra)
    return parser.parse_args(argv)
190
191
1321.1.1 by Aaron Bentley
assess_recovery: perform all state-server operations in admin model
192
@contextmanager
def detect_bootstrap_machine(bs_manager):
    """Record the bootstrap machine's address when the wrapped code fails.

    Any exception raised inside the context is inspected with
    parse_new_state_server_from_error.  When an address is found it is
    stored as known host '0' on bs_manager.  The exception is always
    re-raised.
    """
    try:
        yield
    except Exception as error:
        new_address = parse_new_state_server_from_error(error)
        if new_address is None:
            # Never store None as a known host address.
            raise
        bs_manager.known_hosts['0'] = new_address
        raise
201
202
1345.1.5 by Seman
Deploy charm by path #2.
203
def assess_recovery(bs_manager, strategy, charm_series):
    """Run one recovery scenario against a bootstrapped controller.

    :param bs_manager: Manager providing the client for the test model.
    :param strategy: One of 'ha', 'backup' or 'ha-backup'.  'ha' deletes
        only the leader and checks that the controllers recover; the backup
        strategies delete every controller member and restore from a backup.
    :param charm_series: Series passed on when deploying the dummy stack.
    """
    log.info("Setting up test.")
    client = bs_manager.client
    deploy_stack(client, charm_series)
    # Clear the token so a later check only passes on a freshly set value.
    client.set_config('dummy-source', {'token': ''})
    log.info("Setup complete.")
    log.info("Test started.")
    controller_client = client.get_controller_client()
    uses_ha = strategy in ('ha', 'ha-backup')
    uses_backup = strategy in ('ha-backup', 'backup')
    if uses_ha:
        enable_ha(bs_manager, controller_client)
    if uses_backup:
        backup_file = controller_client.backup()
        restore_present_state_server(controller_client, backup_file)
    ha_only = strategy == 'ha'
    delete_controller_members(
        bs_manager, controller_client, leader_only=ha_only)
    if ha_only:
        assess_ha_recovery(bs_manager, client)
    else:
        # Skip the controller-model wait when restoring an HA controller.
        restore_missing_state_server(
            bs_manager, controller_client, backup_file,
            check_controller=(strategy != 'ha-backup'))
    log.info("Test complete.")
1321.1.1 by Aaron Bentley
assess_recovery: perform all state-server operations in admin model
230
231
990.1.5 by Curtis Hovey
Added main test for assess_recovery.py ha.
232
def main(argv):
    """Parse argv, bootstrap a controller and run the chosen recovery test.

    :param argv: Command-line arguments, without the program name.
    """
    options = parse_args(argv)
    configure_logging(options.verbose)
    manager = BootstrapManager.from_args(options)
    booted = manager.booted_context(upload_tools=options.upload_tools)
    # detect_bootstrap_machine sits inside the booted context, so it sees a
    # test failure before the booted context does.
    with booted, detect_bootstrap_machine(manager):
        assess_recovery(manager, options.strategy, options.charm_series)
296 by Curtis Hovey
Added first draft of the backup_restore_juju.py test. It doesn't
239
240
241
# Script entry point: pass main() the command-line arguments without the
# program name.
if __name__ == '__main__':
    main(sys.argv[1:])