1
1
#!/usr/bin/env python
2
4
from argparse import ArgumentParser
3
from datetime import datetime
5
from collections import defaultdict
6
13
from time import sleep
8
from chaos import MonkeyRunner
9
15
from jujupy import (
12
from utility import configure_logging
18
def run_while_healthy_or_timeout(monkey):
19
logging.debug('run_while_healthy_or_timeout')
20
while monkey.is_healthy():
21
logging.debug('Unleashing chaos.')
23
monkey.wait_for_chaos()
24
if datetime.now() > monkey.expire_time:
26
'Reached run timeout, all done running chaos.')
28
if monkey.pause_timeout:
30
'Pausing {} seconds after running chaos.'.format(
31
monkey.pause_timeout))
32
sleep(monkey.pause_timeout)
34
logging.error('The health check reported an error: {}'.format(
35
monkey.health_checker))
28
def from_config(cls, args):
29
"""Return a class instance populated with values from args.
31
Lets EnvJujuClient.by_version() default to the juju binary in
34
client = EnvJujuClient.by_version(
35
SimpleEnvironment.from_config(args.env))
36
return cls(args.env, args.service, args.health_checker, client,
37
enablement_timeout=args.enablement_timeout,
38
pause_timeout=args.pause_timeout,
39
total_timeout=args.total_timeout)
41
def __init__(self, env, service, health_checker, client,
42
enablement_timeout=0, pause_timeout=0, total_timeout=0):
44
self.service = service
45
self.health_checker = health_checker
47
self.enablement_timeout = enablement_timeout
48
self.pause_timeout = pause_timeout
49
self.total_timeout = total_timeout
50
self.expire_time = (datetime.now() + timedelta(seconds=total_timeout))
53
def deploy_chaos_monkey(self):
54
"""Juju deploy chaos-monkey and add a relation.
56
JUJU_REPOSITORY must be set in the OS environment so a local
57
chaos-monkey charm can be found.
59
logging.debug('Deploying local:chaos-monkey.')
60
self.client.deploy('local:chaos-monkey')
61
logging.debug('Relating chaos-monkey to {}.'.format(self.service))
62
self.client.juju('add-relation', (self.service, 'chaos-monkey'))
63
logging.debug('Waiting for services to start.')
64
self.client.wait_for_started()
65
self.client.wait_for_subordinate_units(self.service, 'chaos-monkey')
67
def iter_chaos_monkey_units(self):
68
status = self.client.get_status()
69
for unit_name, unit in status.service_subordinate_units(self.service):
70
if not unit_name.startswith('chaos-monkey'):
74
def unleash_once(self):
75
for unit_name, unit in self.iter_chaos_monkey_units():
76
logging.info('Starting the chaos monkey on: {}'.format(unit_name))
77
enablement_arg = ('enablement-timeout={}'.format(
78
self.enablement_timeout))
79
action_out = self.client.get_juju_output(
80
'action do', unit_name, 'start', 'mode=single', enablement_arg)
81
if not action_out.startswith('Action queued with id'):
83
'Unexpected output from "juju action do": {}'.format(
85
logging.info(action_out)
86
self.monkey_ids[unit_name] = action_out.split().pop()
87
# Allow chaos time to run
88
sleep(self.enablement_timeout)
91
"""Returns a boolean after running the health_checker."""
93
sub_output = subprocess.check_output(self.health_checker)
94
logging.info('Health check output: {}'.format(sub_output))
97
'The health check script failed to execute with: {}'.format(
100
except subprocess.CalledProcessError as e:
101
logging.error('Non-zero exit code returned from {}: {}'.format(
102
self.health_checker, e))
103
logging.error(e.output)
107
def get_unit_status(self, unit_name):
108
"""Return 'done' if no lock file otherwise 'running'"""
109
service_config = self.client.get_service_config('chaos-monkey')
110
logging.debug('{}'.format(service_config))
111
logging.debug('Checking if chaos is done on: {}'.format(unit_name))
113
check_cmd += service_config['settings']['chaos-dir']['value']
114
check_cmd += '/chaos_monkey.' + self.monkey_ids[unit_name]
115
check_cmd += '/chaos_runner.lock'
117
if self.client.juju('run', ('--unit', unit_name, check_cmd),
122
def wait_for_chaos_complete(self, timeout=300):
123
for ignored in until_timeout(timeout):
124
locks = defaultdict(list)
125
for unit_name, unit in self.iter_chaos_monkey_units():
126
locks[self.get_unit_status(unit_name)].append(unit_name)
127
if locks.keys() == ['done']:
129
'All lock files have been removed: {}'.format(locks))
132
raise Exception('Chaos operations did not complete.')
134
def run_while_healthy_or_timeout(self):
135
logging.debug('run_while_healthy_or_timeout')
136
while self.is_healthy():
137
logging.debug('Unleashing chaos.')
139
self.wait_for_chaos_complete()
140
if datetime.now() > self.expire_time:
142
'Reached run timeout, all done running chaos.')
144
if self.pause_timeout:
146
'Pausing {} seconds after running chaos.'.format(
148
sleep(self.pause_timeout)
150
logging.error('The health check reported an error: {}'.format(
151
self.health_checker))
39
155
def get_args(argv=None):
79
196
configure_logging(logging.INFO)
81
client = client_from_config(args.env, None)
82
monkey_runner = MonkeyRunner(
83
args.env, client, service=args.service,
84
health_checker=args.health_checker,
85
enablement_timeout=args.enablement_timeout,
86
pause_timeout=args.pause_timeout,
87
total_timeout=args.total_timeout)
198
monkey_runner = MonkeyRunner.from_config(args)
88
199
logging.info("Chaos Monkey Start.")
89
200
monkey_runner.deploy_chaos_monkey()
90
run_while_healthy_or_timeout(monkey_runner)
201
monkey_runner.run_while_healthy_or_timeout()
91
202
logging.info("Chaos Monkey Complete.")
93
204
if __name__ == '__main__':