1014.2.1
by John George
background_chaos WIP |
1 |
#!/usr/bin/env python
|
2 |
from collections import defaultdict |
|
3 |
from contextlib import contextmanager |
|
4 |
from datetime import ( |
|
5 |
datetime, |
|
6 |
timedelta, |
|
7 |
)
|
|
8 |
import logging |
|
1014.2.5
by John George
deploy_dummy_stack with chaos. |
9 |
import os |
1014.2.1
by John George
background_chaos WIP |
10 |
import subprocess |
11 |
import sys |
|
12 |
||
1485.1.1
by Martin
Switch all imports of local_charm_path to using jujucharm over utility |
13 |
from jujucharm import ( |
14 |
local_charm_path, |
|
15 |
)
|
|
1014.2.5
by John George
deploy_dummy_stack with chaos. |
16 |
from remote import remote_from_unit |
1014.2.1
by John George
background_chaos WIP |
17 |
from utility import ( |
18 |
until_timeout, |
|
19 |
)
|
|
20 |
||
21 |
||
1092.2.2
by Aaron Bentley
Fix lint. |
22 |
__metaclass__ = type |
23 |
||
24 |
||
1014.2.1
by John George
background_chaos WIP |
25 |
@contextmanager
def background_chaos(env, client, log_dir, time):
    """Run chaos monkey in the background while the with-body executes.

    On entry a MonkeyRunner is created, chaos-monkey is deployed and
    unleashed once, and the call blocks until chaos is reported as started.
    After the with-body finishes, the call waits for chaos to complete.
    Whatever happens, the chaos monkey logs are then copied into log_dir.

    :param env: Passed through to MonkeyRunner.
    :param client: Juju client passed to MonkeyRunner; also used to locate
        the remote unit the logs are copied from.
    :param log_dir: Existing local directory; one 'chaos-monkey-<id>'
        sub-directory is created in it per monkey id.
    :param time: Used both as the chaos enablement timeout and as the
        timeout when waiting for chaos to complete.

    Any exception raised by the with-body or by the wait for completion is
    logged and converted into sys.exit(1).
    """
    monkey = MonkeyRunner(env, client, enablement_timeout=time)
    monkey.deploy_chaos_monkey()
    monkey_ids = monkey.unleash_once()
    monkey.wait_for_chaos(state='start')
    try:
        yield
        monkey.wait_for_chaos(state='complete', timeout=time)
    except BaseException as e:
        # Deliberately broad: any failure is logged, then the process exits
        # non-zero. The finally clause below still runs first.
        logging.exception(e)
        sys.exit(1)
    finally:
        # Copy the chaos logs to the log directory.
        # Currently the remote machine will always be ubuntu/0. If
        # background_chaos() is enhanced to take a target service, then log
        # collection will also need to be updated.
        remote = remote_from_unit(client, "ubuntu/0")
        for monkey_id in monkey_ids:
            monkey_log = [
                'chaos-monkey/chaos_monkey.{}/log/*'.format(monkey_id)]
            dest_dir = '{}/chaos-monkey-{}'.format(log_dir, monkey_id)
            # A directory left over from an earlier run must not abort log
            # collection (or mask the exit raised above), so only create it
            # when it is missing.
            if not os.path.isdir(dest_dir):
                os.mkdir(dest_dir)
            try:
                remote.copy(dest_dir, monkey_log)
            except subprocess.CalledProcessError as e:
                # Log retrieval is best-effort; warn and carry on.
                logging.warning(
                    'Could not retrieve Chaos Monkey log for {}:'.format(
                        monkey_id))
                logging.warning(e.output)
|
1014.2.1
by John George
background_chaos WIP |
53 |
|
54 |
||
55 |
class MonkeyRunner:
    """Deploy the chaos-monkey charm and drive it through a Juju client."""

    def __init__(self, env, client, service='0', health_checker=None,
                 enablement_timeout=120, pause_timeout=0, total_timeout=0):
        """Record the run configuration.

        :param env: Stored as-is on the instance.
        :param client: Juju client used for every deploy/status/action call.
        :param service: Service to relate chaos-monkey to. The default '0'
            means "deploy the ubuntu charm to machine 0 and target that".
        :param health_checker: Optional command (as accepted by
            subprocess.check_output) run by is_healthy().
        :param enablement_timeout: Value sent to the chaos monkey as
            'enablement-timeout'.
        :param pause_timeout: Stored as-is on the instance.
        :param total_timeout: Seconds from now at which expire_time is set.
        """
        self.env = env
        if service == '0':
            self.service = 'ubuntu'
            self.machine = '0'
        else:
            self.service = service
            self.machine = None
        self.health_checker = health_checker
        self.client = client
        self.enablement_timeout = enablement_timeout
        self.pause_timeout = pause_timeout
        self.total_timeout = total_timeout
        self.expire_time = (datetime.now() + timedelta(seconds=total_timeout))
        # Maps chaos-monkey unit name -> monkey id returned by the first
        # 'start' action on that unit.
        self.monkey_ids = {}

    def deploy_chaos_monkey(self):
        """Juju deploy chaos-monkey and add a relation.

        JUJU_REPOSITORY must be set in the OS environment so a local
        chaos-monkey charm can be found.
        """
        if self.machine:
            logging.debug(
                'Deploying ubuntu to machine {}.'.format(self.machine))
            charm = local_charm_path(
                charm='ubuntu', juju_ver=self.client.version)
            self.client.deploy(charm, to=self.machine)
        logging.debug('Deploying local:chaos-monkey.')
        charm = local_charm_path(
            charm='chaos-monkey', juju_ver=self.client.version)
        self.client.deploy(charm)
        logging.debug('Relating chaos-monkey to {}.'.format(self.service))
        self.client.juju('add-relation', (self.service, 'chaos-monkey'))
        logging.debug('Waiting for services to start.')
        self.client.wait_for_started()
        self.client.wait_for_subordinate_units(self.service, 'chaos-monkey')

    def iter_chaos_monkey_units(self):
        """Yield (unit_name, unit) for each chaos-monkey subordinate unit.

        Only subordinates of self.service whose name starts with
        'chaos-monkey' are yielded.
        """
        status = self.client.get_status()
        for unit_name, unit in status.service_subordinate_units(self.service):
            if unit_name.startswith('chaos-monkey'):
                yield unit_name, unit

    def unleash_once(self):
        """Start the chaos monkey once on every chaos-monkey unit.

        The id returned by the first 'start' action on a unit is remembered
        in self.monkey_ids and passed back as 'monkey-id' on later calls.

        :return: The values of self.monkey_ids.
        """
        for unit_name, unit in self.iter_chaos_monkey_units():
            logging.info('Starting the chaos monkey on: {}'.format(unit_name))
            enablement_arg = 'enablement-timeout={}'.format(
                self.enablement_timeout)
            known_id = self.monkey_ids.get(unit_name)
            args = (unit_name, 'start', 'mode=single', enablement_arg)
            if known_id is not None:
                args += ('monkey-id={}'.format(known_id),)

            action_id = self.client.action_do(*args)
            if not self.monkey_ids.get(unit_name):
                logging.info('Setting the monkey-id for {} to: {}'.format(
                    unit_name, action_id))
                self.monkey_ids[unit_name] = action_id
        return self.monkey_ids.values()

    def is_healthy(self):
        """Returns a boolean after running the health_checker.

        Returns True when no health_checker is configured or when it exits
        successfully, and False when it exits non-zero. An OSError from
        failing to execute the checker is logged and re-raised.
        """
        if self.health_checker:
            try:
                sub_output = subprocess.check_output(self.health_checker)
                logging.info('Health check output: {}'.format(sub_output))
            except OSError as e:
                logging.error(
                    'The health check failed to execute with: {}'.format(
                        e))
                raise
            except subprocess.CalledProcessError as e:
                logging.error('Non-zero exit code returned from {}: {}'.format(
                    self.health_checker, e))
                logging.error(e.output)
                return False
        return True

    def get_unit_status(self, unit_name):
        """Return 'done' if no lock file otherwise 'running'"""
        service_config = self.client.get_service_config('chaos-monkey')
        logging.debug('{}'.format(service_config))
        logging.debug('Checking if chaos is done on: {}'.format(unit_name))
        chaos_dir = service_config['settings']['chaos-dir']['value']
        check_cmd = (
            '[ -f ' + chaos_dir + '/chaos_monkey.' +
            self.monkey_ids[unit_name] + '/chaos_runner.lock' + ' ]')
        # NOTE(review): presumably client.juju() returns the command's exit
        # status when check=False, so a truthy result means the lock file
        # test failed, i.e. the lock is gone -- confirm against the client.
        if self.client.juju('run', ('--unit', unit_name, check_cmd),
                            check=False):
            return 'done'
        return 'running'

    def wait_for_chaos(self, state='complete', timeout=300):
        """Block until every chaos-monkey unit reaches the wanted state.

        :param state: Either 'start' (every unit reports 'running') or
            'complete' (every unit reports 'done').
        :param timeout: Passed to until_timeout() to bound the polling.
        :raises Exception: If state is not one of the two accepted values,
            or if the units do not all reach it before polling ends.
        """
        # Exact membership: a substring test would let values such as
        # 'incomplete' through, and they could then never be satisfied.
        if state not in ('complete', 'start'):
            raise Exception('Unexpected state value: {}'.format(state))
        for ignored in until_timeout(timeout):
            locks = defaultdict(list)
            for unit_name, unit in self.iter_chaos_monkey_units():
                locks[self.get_unit_status(unit_name)].append(unit_name)
            # list(locks) rather than locks.keys(): on Python 3 keys() is a
            # view object that never compares equal to a list.
            statuses = list(locks)
            if state == 'complete' and statuses == ['done']:
                logging.debug(
                    'All lock files removed, chaos complete: {}'.format(locks))
                break
            if state == 'start' and statuses == ['running']:
                logging.debug(
                    'All lock files found, chaos started: {}'.format(locks))
                break
        else:
            # The loop ran out without a break: the state was never reached.
            raise Exception('Chaos operations did not {}.'.format(state))