~hduran-8/juju-ci-tools/add_status_ci_tests

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python
# Python 2 idiom: classes defined in this module without explicit bases
# are created as new-style classes.
__metaclass__ = type

from argparse import ArgumentParser
from collections import defaultdict
from datetime import (
    datetime,
    timedelta,
)
import logging
import subprocess
import sys
from time import sleep

from jujupy import (
    EnvJujuClient,
    SimpleEnvironment,
)
from utility import (
    configure_logging,
    until_timeout,
)


class MonkeyRunner:
    """Run chaos-monkey actions against a service while it stays healthy.

    The runner deploys the chaos-monkey charm, relates it to the target
    service, queues one chaos action per chaos-monkey unit and then
    polls until every unit reports that its chaos run has finished.
    """

    @classmethod
    def from_config(cls, args):
        """Return a class instance populated with values from args.

        Lets EnvJujuClient.by_version() default to the juju binary in
        the OS path.

        :param args: Parsed arguments providing env, service,
            health_checker, enablement_timeout, pause_timeout and
            total_timeout attributes.
        """
        client = EnvJujuClient.by_version(
            SimpleEnvironment.from_config(args.env))
        return cls(args.env, args.service, args.health_checker, client,
                   enablement_timeout=args.enablement_timeout,
                   pause_timeout=args.pause_timeout,
                   total_timeout=args.total_timeout)

    def __init__(self, env, service, health_checker, client,
                 enablement_timeout=0, pause_timeout=0, total_timeout=0):
        """Store the run configuration and start the total-timeout clock.

        :param env: The name of the environment.
        :param service: The service the chaos-monkey is related to.
        :param health_checker: The command handed to
            subprocess.check_output() to check health.
        :param client: The juju client used for every juju operation.
        :param enablement_timeout: Seconds passed to each chaos action
            and slept after the actions are queued.
        :param pause_timeout: Seconds to sleep between chaos runs.
        :param total_timeout: Seconds after which no further chaos run
            is started.
        """
        self.env = env
        self.service = service
        self.health_checker = health_checker
        self.client = client
        self.enablement_timeout = enablement_timeout
        self.pause_timeout = pause_timeout
        self.total_timeout = total_timeout
        # The deadline is measured from construction, not from the
        # first chaos run.
        self.expire_time = (datetime.now() + timedelta(seconds=total_timeout))
        # Maps a chaos-monkey unit name to the monkey-id it was given.
        self.monkey_ids = {}

    def deploy_chaos_monkey(self):
        """Juju deploy chaos-monkey and add a relation.

        JUJU_REPOSITORY must be set in the OS environment so a local
        chaos-monkey charm can be found.
        """
        logging.debug('Deploying local:chaos-monkey.')
        self.client.deploy('local:chaos-monkey')
        logging.debug('Relating chaos-monkey to {}.'.format(self.service))
        self.client.juju('add-relation', (self.service, 'chaos-monkey'))
        logging.debug('Waiting for services to start.')
        self.client.wait_for_started()
        self.client.wait_for_subordinate_units(self.service, 'chaos-monkey')

    def iter_chaos_monkey_units(self):
        """Yield (unit_name, unit) for each chaos-monkey subordinate unit.

        The status is fetched from the client once per iteration of this
        generator; subordinates of the service whose name does not start
        with 'chaos-monkey' are skipped.
        """
        status = self.client.get_status()
        for unit_name, unit in status.service_subordinate_units(self.service):
            if not unit_name.startswith('chaos-monkey'):
                continue
            yield unit_name, unit

    def unleash_once(self):
        """Queue one chaos action on every chaos-monkey unit, then sleep.

        The first time a unit is seen, the last word of the
        "juju action do" output is recorded as that unit's monkey-id;
        later runs pass the recorded value back as monkey-id=<id>.

        :raises Exception: If the output of "juju action do" does not
            start with 'Action queued with id'.
        """
        enablement_arg = 'enablement-timeout={}'.format(
            self.enablement_timeout)
        for unit_name, _unit in self.iter_chaos_monkey_units():
            logging.info('Starting the chaos monkey on: {}'.format(unit_name))
            monkey_id = self.monkey_ids.get(unit_name)
            args = (unit_name, 'start', 'mode=single', enablement_arg)
            if monkey_id is not None:
                args += ('monkey-id={}'.format(monkey_id),)
            action_out = self.client.get_juju_output('action do', *args)
            if not action_out.startswith('Action queued with id'):
                raise Exception(
                    'Unexpected output from "juju action do": {}'.format(
                        action_out))
            logging.info(action_out)
            if not monkey_id:
                new_id = action_out.split().pop()
                logging.info('Setting the monkey-id for {} to: {}'.format(
                    unit_name, new_id))
                self.monkey_ids[unit_name] = new_id
        # Allow chaos time to run
        sleep(self.enablement_timeout)

    def is_healthy(self):
        """Returns a boolean after running the health_checker.

        :return: True when the health_checker exits with status zero,
            False when it exits non-zero.
        :raises OSError: If the health_checker could not be executed;
            the error is logged and re-raised.
        """
        try:
            sub_output = subprocess.check_output(self.health_checker)
            logging.info('Health check output: {}'.format(sub_output))
        except OSError as e:
            logging.error(
                'The health check script failed to execute with: {}'.format(
                    e))
            raise
        except subprocess.CalledProcessError as e:
            logging.error('Non-zero exit code returned from {}: {}'.format(
                self.health_checker, e))
            logging.error(e.output)
            return False
        return True

    def get_unit_status(self, unit_name):
        """Return 'done' if no lock file otherwise 'running'

        The lock file path is built from the chaos-dir setting of the
        chaos-monkey service and the monkey-id recorded for the unit.

        :param unit_name: The chaos-monkey unit to check.
        :raises KeyError: If no monkey-id has been recorded for
            unit_name.
        """
        service_config = self.client.get_service_config('chaos-monkey')
        logging.debug('{}'.format(service_config))
        logging.debug('Checking if chaos is done on: {}'.format(unit_name))
        check_cmd = '[ -f '
        check_cmd += service_config['settings']['chaos-dir']['value']
        check_cmd += '/chaos_monkey.' + self.monkey_ids[unit_name]
        check_cmd += '/chaos_runner.lock'
        check_cmd += ' ]'
        # check=False: a truthy result is treated as "lock file absent".
        # NOTE(review): presumably client.juju() returns the command's
        # exit status here -- confirm against jujupy.
        if self.client.juju('run', ('--unit', unit_name, check_cmd),
                            check=False):
            return 'done'
        return 'running'

    @staticmethod
    def _all_done(locks):
        """Return True when 'done' is the only status key in locks."""
        # Compare as sets: on Python 3 dict.keys() is a view object that
        # never compares equal to a list, so "keys() == ['done']" would
        # always be False there.
        return set(locks) == {'done'}

    def wait_for_chaos_complete(self, timeout=300):
        """Poll the chaos-monkey units until all of them report 'done'.

        :param timeout: Value handed to until_timeout() to bound the
            polling (presumably seconds -- confirm against utility).
        :raises Exception: If until_timeout() is exhausted before every
            unit reports 'done'.  This includes the case where no
            chaos-monkey units are found at all.
        """
        for ignored in until_timeout(timeout):
            locks = defaultdict(list)
            for unit_name, _unit in self.iter_chaos_monkey_units():
                locks[self.get_unit_status(unit_name)].append(unit_name)
            if self._all_done(locks):
                logging.debug(
                    'All lock files have been removed: {}'.format(locks))
                break
        else:
            raise Exception('Chaos operations did not complete.')

    def run_while_healthy_or_timeout(self):
        """Run chaos repeatedly until unhealthy or the deadline passes.

        The health check runs before every chaos run.  The loop ends
        normally once expire_time has passed after a completed run; if
        the health check fails, the error is logged and the process
        exits with status 1.
        """
        logging.debug('run_while_healthy_or_timeout')
        while self.is_healthy():
            logging.debug('Unleashing chaos.')
            self.unleash_once()
            self.wait_for_chaos_complete()
            if datetime.now() > self.expire_time:
                logging.debug(
                    'Reached run timeout, all done running chaos.')
                break
            if self.pause_timeout:
                logging.debug(
                    'Pausing {} seconds after running chaos.'.format(
                        self.pause_timeout))
                sleep(self.pause_timeout)
        else:
            # Reached only when is_healthy() returned False; a timeout
            # leaves the loop through "break" and skips this branch.
            logging.error('The health check reported an error: {}'.format(
                self.health_checker))
            sys.exit(1)


def get_args(argv=None):
    """Parse and validate the command line arguments.

    A missing (or zero) total-timeout falls back to the
    enablement-timeout.  Invalid timeout values are reported through
    parser.error(), which prints the usage message and exits with
    status 2.

    :param argv: The argument strings to parse; None lets argparse read
        them from sys.argv.
    :return: The parsed argparse namespace with env, service,
        health_checker, enablement_timeout, total_timeout and
        pause_timeout attributes.
    """
    parser = ArgumentParser()
    parser.add_argument('env', help='The name of the environment.')
    parser.add_argument('service', help='A service name to monkey with.')
    parser.add_argument(
        'health_checker',
        help='A binary for checking the health of the environment.')
    parser.add_argument(
        '-et', '--enablement-timeout', default=30, type=int,
        help="Enablement timeout in seconds.", metavar='SECONDS')
    parser.add_argument(
        '-tt', '--total-timeout', type=int, help="Total timeout in seconds.",
        metavar='SECONDS')
    parser.add_argument(
        '-pt', '--pause-timeout', default=0, type=int,
        help="Pause timeout in seconds.", metavar='SECONDS')
    args = parser.parse_args(argv)
    # Validate enablement-timeout before it is used as the fallback for
    # total-timeout; otherwise a negative value is misreported as an
    # invalid total-timeout the user never supplied.
    if args.enablement_timeout < 0:
        parser.error("Invalid enablement-timeout value: timeout must be "
                     "zero or greater.")
    if not args.total_timeout:
        args.total_timeout = args.enablement_timeout
    if args.enablement_timeout > args.total_timeout:
        parser.error("total-timeout can not be less than "
                     "enablement-timeout.")
    if args.total_timeout <= 0:
        parser.error("Invalid total-timeout value: timeout must be "
                     "greater than zero.")

    return args


def main():
    """Deploy the chaos monkey and run it while the env stays healthy.

    Logging is configured at INFO level and the command line is parsed
    into a MonkeyRunner.  The runner deploys the Chaos Monkey into the
    environment, relates it to the requested service and then runs
    chaos operations, with the provided health check script executed
    before each one, until the check fails or the total timeout is
    reached.
    """
    configure_logging(logging.INFO)
    runner = MonkeyRunner.from_config(get_args())
    logging.info("Chaos Monkey Start.")
    runner.deploy_chaos_monkey()
    runner.run_while_healthy_or_timeout()
    logging.info("Chaos Monkey Complete.")

# Run the chaos monkey only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()