~andrewjbeach/juju-ci-tools/make-local-patcher

« back to all changes in this revision

Viewing changes to run_chaos_monkey.py

  • Committer: Curtis Hovey
  • Date: 2015-06-16 13:54:07 UTC
  • mto: This revision was merged to the branch mainline in revision 991.
  • Revision ID: curtis@canonical.com-20150616135407-xzq45ixl2xqqooli
Always collect logs from quickstart and deployer.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
#!/usr/bin/env python
 
2
__metaclass__ = type
 
3
 
2
4
from argparse import ArgumentParser
3
 
from datetime import datetime
 
5
from collections import defaultdict
 
6
from datetime import (
 
7
    datetime,
 
8
    timedelta,
 
9
)
4
10
import logging
 
11
import subprocess
5
12
import sys
6
13
from time import sleep
7
14
 
8
 
from chaos import MonkeyRunner
9
15
from jujupy import (
10
 
    client_from_config,
11
 
)
12
 
from utility import configure_logging
13
 
 
14
 
 
15
 
__metaclass__ = type
16
 
 
17
 
 
18
 
def run_while_healthy_or_timeout(monkey):
    """Unleash chaos repeatedly while the monkey stays healthy.

    Each round checks monkey.is_healthy(), unleashes chaos once and waits
    for it to finish.  The function returns normally once the current time
    has passed monkey.expire_time.  When a health check fails, the error is
    logged and the process exits with status 1.
    """
    logging.debug('run_while_healthy_or_timeout')
    while monkey.is_healthy():
        logging.debug('Unleashing chaos.')
        monkey.unleash_once()
        monkey.wait_for_chaos()
        timed_out = datetime.now() > monkey.expire_time
        if timed_out:
            logging.debug('Reached run timeout, all done running chaos.')
            # Leaving here skips the unhealthy-exit path below.
            return
        if monkey.pause_timeout:
            pause_message = 'Pausing {} seconds after running chaos.'.format(
                monkey.pause_timeout)
            logging.debug(pause_message)
            sleep(monkey.pause_timeout)
    # Only reached when is_healthy() returned a false value.
    unhealthy_message = 'The health check reported an error: {}'.format(
        monkey.health_checker)
    logging.error(unhealthy_message)
    sys.exit(1)
 
16
    EnvJujuClient,
 
17
    SimpleEnvironment,
 
18
)
 
19
from utility import (
 
20
    configure_logging,
 
21
    until_timeout,
 
22
)
 
23
 
 
24
 
 
25
class MonkeyRunner:
    """Deploy the chaos-monkey charm and run chaos against a service.

    The runner starts chaos on every chaos-monkey subordinate unit of the
    service, waits for the runs to finish, and repeats for as long as the
    health checker command succeeds and total_timeout has not elapsed.
    """

    @classmethod
    def from_config(cls, args):
        """Return a class instance populated with values from args.

        Lets EnvJujuClient.by_version() default to the juju binary in
        the OS path.
        """
        client = EnvJujuClient.by_version(
            SimpleEnvironment.from_config(args.env))
        return cls(args.env, args.service, args.health_checker, client,
                   enablement_timeout=args.enablement_timeout,
                   pause_timeout=args.pause_timeout,
                   total_timeout=args.total_timeout)

    def __init__(self, env, service, health_checker, client,
                 enablement_timeout=0, pause_timeout=0, total_timeout=0):
        """Store the run configuration and compute the expiry time.

        :param env: The environment the client operates on.
        :param service: The service chaos-monkey is related to.
        :param health_checker: The command passed to
            subprocess.check_output() by is_healthy().
        :param client: The juju client used for every juju operation.
        :param enablement_timeout: Seconds each chaos run is enabled for.
        :param pause_timeout: Seconds to sleep between chaos runs.
        :param total_timeout: Seconds from construction after which
            run_while_healthy_or_timeout() stops starting new runs.
        """
        self.env = env
        self.service = service
        self.health_checker = health_checker
        self.client = client
        self.enablement_timeout = enablement_timeout
        self.pause_timeout = pause_timeout
        self.total_timeout = total_timeout
        self.expire_time = (datetime.now() + timedelta(seconds=total_timeout))
        # Maps a chaos-monkey unit name to the id of its latest action.
        self.monkey_ids = {}

    def deploy_chaos_monkey(self):
        """Juju deploy chaos-monkey and add a relation.

        JUJU_REPOSITORY must be set in the OS environment so a local
        chaos-monkey charm can be found.
        """
        logging.debug('Deploying local:chaos-monkey.')
        self.client.deploy('local:chaos-monkey')
        logging.debug('Relating chaos-monkey to {}.'.format(self.service))
        self.client.juju('add-relation', (self.service, 'chaos-monkey'))
        logging.debug('Waiting for services to start.')
        self.client.wait_for_started()
        self.client.wait_for_subordinate_units(self.service, 'chaos-monkey')

    def iter_chaos_monkey_units(self):
        """Yield (unit_name, unit) for each chaos-monkey subordinate unit.

        Subordinate units of the service whose name does not start with
        'chaos-monkey' are skipped.
        """
        status = self.client.get_status()
        for unit_name, unit in status.service_subordinate_units(self.service):
            if not unit_name.startswith('chaos-monkey'):
                continue
            yield unit_name, unit

    def unleash_once(self):
        """Start one chaos action on every chaos-monkey unit, then wait.

        The id of each queued action is recorded in self.monkey_ids.

        :raises Exception: If the action output does not start with
            'Action queued with id'.
        """
        for unit_name, unit in self.iter_chaos_monkey_units():
            logging.info('Starting the chaos monkey on: {}'.format(unit_name))
            enablement_arg = ('enablement-timeout={}'.format(
                self.enablement_timeout))
            action_out = self.client.get_juju_output(
                'action do', unit_name, 'start', 'mode=single', enablement_arg)
            if not action_out.startswith('Action queued with id'):
                raise Exception(
                    'Unexpected output from "juju action do": {}'.format(
                        action_out))
            logging.info(action_out)
            # The action id is the last whitespace-separated token.
            self.monkey_ids[unit_name] = action_out.split().pop()
        # Allow chaos time to run
        sleep(self.enablement_timeout)

    def is_healthy(self):
        """Returns a boolean after running the health_checker.

        A non-zero exit code is logged and reported as False.  An OSError
        raised while launching the command is logged and re-raised.
        """
        try:
            sub_output = subprocess.check_output(self.health_checker)
            logging.info('Health check output: {}'.format(sub_output))
        except OSError as e:
            logging.error(
                'The health check script failed to execute with: {}'.format(
                    e))
            raise
        except subprocess.CalledProcessError as e:
            logging.error('Non-zero exit code returned from {}: {}'.format(
                self.health_checker, e))
            logging.error(e.output)
            return False
        return True

    def get_unit_status(self, unit_name):
        """Return 'done' if no lock file otherwise 'running'

        :raises KeyError: If unit_name has no recorded action id.
        """
        service_config = self.client.get_service_config('chaos-monkey')
        logging.debug('{}'.format(service_config))
        logging.debug('Checking if chaos is done on: {}'.format(unit_name))
        check_cmd = ''.join([
            '[ -f ',
            service_config['settings']['chaos-dir']['value'],
            '/chaos_monkey.' + self.monkey_ids[unit_name],
            '/chaos_runner.lock',
            ' ]',
        ])
        # NOTE(review): presumably juju() returns the command's exit status,
        # so a true value means the lock file test failed -- confirm.
        if self.client.juju('run', ('--unit', unit_name, check_cmd),
                            check=False):
            return 'done'
        return 'running'

    @staticmethod
    def _all_done(locks):
        """Return True when 'done' is the only status present in locks.

        The keys are turned into a list before comparing because a
        Python 3 dict keys view never compares equal to a list.
        """
        return list(locks) == ['done']

    def wait_for_chaos_complete(self, timeout=300):
        """Poll the chaos-monkey units until all of them report 'done'.

        :param timeout: Value passed to until_timeout() to bound the wait.
        :raises Exception: If until_timeout() is exhausted before every
            unit reports 'done'.
        """
        for ignored in until_timeout(timeout):
            # Group unit names by the status each one reports.
            locks = defaultdict(list)
            for unit_name, unit in self.iter_chaos_monkey_units():
                locks[self.get_unit_status(unit_name)].append(unit_name)
            if self._all_done(locks):
                logging.debug(
                    'All lock files have been removed: {}'.format(locks))
                break
        else:
            raise Exception('Chaos operations did not complete.')

    def run_while_healthy_or_timeout(self):
        """Run chaos repeatedly until unhealthy or the run time expires.

        Returns once the current time has passed self.expire_time.  When a
        health check fails, the error is logged and the process exits
        with status 1.
        """
        logging.debug('run_while_healthy_or_timeout')
        while self.is_healthy():
            logging.debug('Unleashing chaos.')
            self.unleash_once()
            self.wait_for_chaos_complete()
            if datetime.now() > self.expire_time:
                logging.debug(
                    'Reached run timeout, all done running chaos.')
                break
            if self.pause_timeout:
                logging.debug(
                    'Pausing {} seconds after running chaos.'.format(
                        self.pause_timeout))
                sleep(self.pause_timeout)
        else:
            # The loop condition failed: the health check was not passed.
            logging.error('The health check reported an error: {}'.format(
                self.health_checker))
            sys.exit(1)
37
153
 
38
154
 
39
155
def get_args(argv=None):
64
180
    if args.enablement_timeout < 0:
65
181
        parser.error("Invalid enablement-timeout value: timeout must be "
66
182
                     "zero or greater.")
 
183
 
67
184
    return args
68
185
 
69
186
 
78
195
    """
79
196
    configure_logging(logging.INFO)
80
197
    args = get_args()
81
 
    client = client_from_config(args.env, None)
82
 
    monkey_runner = MonkeyRunner(
83
 
        args.env, client, service=args.service,
84
 
        health_checker=args.health_checker,
85
 
        enablement_timeout=args.enablement_timeout,
86
 
        pause_timeout=args.pause_timeout,
87
 
        total_timeout=args.total_timeout)
 
198
    monkey_runner = MonkeyRunner.from_config(args)
88
199
    logging.info("Chaos Monkey Start.")
89
200
    monkey_runner.deploy_chaos_monkey()
90
 
    run_while_healthy_or_timeout(monkey_runner)
 
201
    monkey_runner.run_while_healthy_or_timeout()
91
202
    logging.info("Chaos Monkey Complete.")
92
203
 
93
204
if __name__ == '__main__':