#!/usr/bin/env python

import atexit
import logging
import os
import signal
import subprocess
import sys
import time

import statsd


HERE = os.path.abspath(os.path.dirname(__file__))

sys.path.insert(0, os.path.join(HERE, '..'))
from testing import venv


def check_lxc_mounted():
    out = subprocess.check_output(['mount'])
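    # A mounted filesystem shows up in `mount` output as, e.g.,
    # '/dev/sdb1 on /var/lib/lxc type ext4 (rw)' (hypothetical device). The
    # surrounding spaces in the check below avoid matching longer paths.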
    if ' /var/lib/lxc ' not in out:
        logging.error('No /var/lib/lxc in mount output: {}'.format(out))
        return False
    return True


def _run_and_log_on_failure(cmd):
    '''Run the command and return True. On failure, log the output and return
    False.'''

    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        return True
    except subprocess.CalledProcessError as e:
        # Only print the output of the command if it failed.
        msg = 'Calling `%s` failed (with code %d):'
        logging.error(msg % (e.cmd, e.returncode))
        for line in e.output.splitlines():
            logging.error(line)
        return False


def bootstrap():
    '''Try to bootstrap, then confirm it succeeded by calling status.'''

    cmd = ['juju', 'bootstrap']
    if not _run_and_log_on_failure(cmd):
        logging.error('Bootstrapping failed.')
        return False

    cmd = ['juju', 'status']
    if not _run_and_log_on_failure(cmd):
        # XXX ev 2014-07-07: Workaround juju:
        # Error details: EOF
        time.sleep(15)
        if not _run_and_log_on_failure(cmd):
            logging.error('Bootstrapping succeeded but instance unreachable.')
            return False

    return True


def destroy_environment():
    '''Destroy the local environment. Return True if successful, False
    otherwise.'''

    # FIXME: Using 'local' below makes it hard to test locally with a different
    # env name. Using ci_utils.juju.juju.environment_name() would address the
    # issue -- vila 2014-11-12
    cmd = ['juju', 'destroy-environment', '--force', '-y', 'local']
    return _run_and_log_on_failure(cmd)


def fake_out_branch():
    '''Work around tarmac using bzr export vs bzr branch (quoting dobey):

    tarmac used to just run the tests in the checked out branch, but it was
    changed to doing an export to a temp dir for security, so that rogue
    branches can't commit themselves and such.

    Return True on success, False on failure.'''

    # Capture stderr along with stdout so failures are fully logged (and so
    # an unread stderr pipe can't fill up and block bzr).
    kw = {'stderr': subprocess.STDOUT}
    email = 'uci-bot <canonical-ci-engineering@lists.launchpad.net>'
    try:
        subprocess.check_output(['bzr', 'init'], **kw)
        subprocess.check_output(['bzr', 'whoami', email], **kw)
        subprocess.check_output(['bzr', 'add', '.'], **kw)
        subprocess.check_output(['bzr', 'commit', '-m', '""'], **kw)
    except subprocess.CalledProcessError as e:
        # Only print the output of the command if it failed.
        logging.error('Calling `%s` failed:' % e.cmd)
        for line in e.output.splitlines():
            logging.error(line)
        return False
    return True


class SaneMetrics(object):
    """Compatibility layer between bare StatsdClient and TxStatsd.

    This class mimics TxStatsd `ExtendedMetrics` API on a bare
    statsd `StatsClient`.
    It's just an nice-to-have feature while txstatsd is not installable
    from pypi.
    """
    def __init__(self, host, port, namespace=''):
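        # The port may arrive as a string (split from a 'host:port' URI),
        # hence the int() coercion.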
        self.client = statsd.StatsClient(host, int(port), namespace)
        self.last_time = 0

    def reset_timing(self):
        self.last_time = time.time()

    def timing(self, stat):
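        # Report the elapsed time since the last timing()/reset_timing() call,
        # in milliseconds, and restart the clock.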
        current_time = time.time()
        duration = (current_time - self.last_time) * 1000
        self.last_time = current_time
        self.client.timing(stat, duration)
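
# A minimal usage sketch for SaneMetrics (hypothetical names; assumes a
# statsd daemon is listening on the given host and port):
#
#     metrics = SaneMetrics('localhost', 8125, namespace='tarmac')
#     metrics.reset_timing()
#     do_some_work()            # hypothetical workload
#     metrics.timing('work')    # reports elapsed ms since reset_timing()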


def main():
    # XXX cprov 20140903: Sets the logging level to INFO for the entire
    # procedure. It should never be lowered to DEBUG, otherwise credentials
    # leak via python-keystone requests.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")

    # Set up statsd metrics from 'CI_STATSD_URI'. If it's not defined,
    # metrics default to localhost:8125 and effectively go to limbo
    # (statsd uses UDP, so packets nobody listens for are dropped silently).
    statsd_uri = os.environ.get('CI_STATSD_URI', 'localhost:8125')
    namespace = os.environ.get('CI_STATSD_NAMESPACE', 'tarmac')

    # XXX cprov 20141120: Note that we also use timers for counting events
    # (the 'counters.*' namespace). That's because counters (and gauges)
    # currently do not work as (we) expect in the versions of the
    # graphite+carbon+txstatsd packages available in Ubuntu. The essential
    # problem is that counting samples are propagated to future intervals
    # instead of being aggregated (summed). Timer counts work as we expect.
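    # e.g. CI_STATSD_URI='statsd.internal:8125' (hypothetical value) splits
    # into the ('statsd.internal', '8125') host/port pair below.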
    metrics = SaneMetrics(*statsd_uri.split(':'), namespace=namespace)

    def die(signum, frame):
        """Die as cleanly as softwarely possible.

        We rely on atexit() handlers to remove various temporary directories
        so we need to call sys.exit().
        """
        metrics.timing('counters.failures.killed')
        logging.info('Received signal {}, exiting'.format(signum))
        # We don't want to be interrupted while we process the signal, we're
        # exiting anyway.
        signal.signal(signum, signal.SIG_IGN)

        # When tarmac kills us, we'll exit with a -SIGNAL return code, so the
        # value below will be overridden.
        sys.exit(42)

    # Be ready to die. We use some atexit handlers that need to be called in
    # all cases or we'll leak in various places.
    signal.signal(signal.SIGINT, die)
    signal.signal(signal.SIGTERM, die)

    # venv.install() re-executes this script inside the just-created virtual
    # environment. We store the metrics start time in an environment
    # variable so it can be restored.
    if os.environ.get('TARMAC_METRIC_STARTED') is None:
        metrics.reset_timing()
        metrics.timing('counters.attempts')
        os.environ['TARMAC_METRIC_STARTED'] = str(metrics.last_time)
    else:
        metrics.last_time = float(os.environ.get('TARMAC_METRIC_STARTED'))
        os.environ.pop('TARMAC_METRIC_STARTED')

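    # virtualenv sets sys.real_prefix on the interpreters it creates, so this
    # detects that we're running inside the re-exec'd copy.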
    if hasattr(sys, 'real_prefix') and os.environ.get('REEXECED_UNDER_VENV'):
        # Inside the forked process. Clean up on exit.
        atexit.register(venv.delete)
        metrics.timing('setup')
        if not fake_out_branch():
            metrics.timing('counters.failures.fake_out_branch')
            return 1

    if not check_lxc_mounted():
        logging.error('/var/lib/lxc not mounted.')
        metrics.timing('counters.failures.lxc_not_mounted')
        return 1

    # TODO ev 2014-07-04 Delete all the swift containers created during
    # testing.

    # Run both the deployment and tests under a virtualenv.
    try:
        venv.install()
    except:
        # XXX cprov 2014-11-13: there is a chance for failures during the
        # venv creation before this code is re-executed. We don't know
        # exactly if and/or how they happen, thus the bare exception and
        # a simple counter. We will have a better idea after we start
        # collecting and investigating stats.
        metrics.timing('counters.failures.venv')
        raise

    from testing import run_tests
    # deploy.py isn't in our pythonpath when run as a script
    sys.path.append(os.path.abspath('juju-deployer'))
    import deploy

    try:
        with open('/dev/null', 'w') as null:
            subprocess.check_call(['juju', 'status'], stdout=null, stderr=null)
    except subprocess.CalledProcessError:
        # The environment doesn't exist, so `juju status` fails.
        pass
    else:
        # The environment wasn't cleaned up from the last run, but that's to be
        # expected. It's left around for debugging between runs.
        if not destroy_environment():
            metrics.timing('counters.failures.destroy')
            return 1
        metrics.timing('destroy')

    logging.info('Deploying.')
    t = time.time()
    # Bootstrap, then deploy.
    if not bootstrap() or deploy.main() != 0:
        metrics.timing('counters.failures.deploy')
        return 1
    metrics.timing('deploy')

    logging.info('Done in %.2fs' % (time.time() - t))
    result = run_tests.main(sys.argv[1:], sys.stdout, sys.stderr)
    if not result:
        metrics.timing('counters.success')
        metrics.timing('testing')
    else:
        metrics.timing('counters.failures.testing')

    return result


if __name__ == '__main__':
    sys.exit(main())