#!/usr/bin/env python

import atexit
import logging
import os
import signal
import subprocess
import sys
import time

import statsd


HERE = os.path.abspath(os.path.dirname(__file__))

sys.path.insert(0, os.path.join(HERE, '..'))
from testing import venv


def check_lxc_mounted():
    out = subprocess.check_output(['mount'])
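    # A mounted filesystem shows up in `mount` output as, e.g.,
    # '/dev/sdb1 on /var/lib/lxc type ext4 (rw)' (hypothetical device). The
    # surrounding spaces in the check below avoid matching longer paths.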
    if ' /var/lib/lxc ' not in out:
        logging.error('No /var/lib/lxc in mount output: {}'.format(out))
        return False
    return True


def _run_and_log_on_failure(cmd):
    '''Run the command and return True. On failure, log the output and return
    False.'''

    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        return True
    except subprocess.CalledProcessError as e:
        # Only print the output of the command if it failed.
        msg = 'Calling `%s` failed (with code %d):'
        logging.error(msg % (e.cmd, e.returncode))
        for line in e.output.splitlines():
            logging.error(line)
        return False


def bootstrap():
    '''Try to bootstrap, then confirm it succeeded by calling status.'''

    cmd = ['juju', 'bootstrap']
    if not _run_and_log_on_failure(cmd):
        logging.error('Bootstrapping failed.')
        return False

    cmd = ['juju', 'status']
    if not _run_and_log_on_failure(cmd):
        # XXX ev 2014-07-07: Workaround juju:
        # Error details: EOF
        time.sleep(15)
        if not _run_and_log_on_failure(cmd):
            logging.error('Bootstrapping succeeded but instance unreachable.')
            return False

    return True


def destroy_environment():
    '''Destroy the local environment. Return True if successful, False
    otherwise.'''

    # FIXME: Using 'local' below makes it hard to test locally with a different
    # env name. Using ci_utils.juju.juju.environment_name() would address the
    # issue -- vila 2014-11-12
    cmd = ['juju', 'destroy-environment', '--force', '-y', 'local']
    return _run_and_log_on_failure(cmd)


def fake_out_branch():
    '''Work around tarmac using bzr export vs bzr branch (quoting dobey):

    tarmac used to just run the tests in the checked out branch, but it was
    changed to doing an export to a temp dir for security, so that rogue
    branches can't commit themselves and such.

    Return True on success, False on failure.'''

    # Capture stderr along with stdout so failures are fully logged (and so
    # an unread stderr pipe can't fill up and block bzr).
    kw = {'stderr': subprocess.STDOUT}
    email = 'uci-bot <canonical-ci-engineering@lists.launchpad.net>'
    try:
        subprocess.check_output(['bzr', 'init'], **kw)
        subprocess.check_output(['bzr', 'whoami', email], **kw)
        subprocess.check_output(['bzr', 'add', '.'], **kw)
        subprocess.check_output(['bzr', 'commit', '-m', '""'], **kw)
    except subprocess.CalledProcessError as e:
        # Only print the output of the command if it failed.
        logging.error('Calling `%s` failed:' % e.cmd)
        for line in e.output.splitlines():
            logging.error(line)
        return False
    return True


class SaneMetrics(object):
    """Compatibility layer between bare StatsdClient and TxStatsd.

    This class mimics TxStatsd `ExtendedMetrics` API on a bare
    statsd `StatsClient`.
    It's just an nice-to-have feature while txstatsd is not installable
    from pypi.
    """
    def __init__(self, host, port, namespace=''):
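        # The port may arrive as a string (split from a 'host:port' URI),
        # hence the int() coercion.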
        self.client = statsd.StatsClient(host, int(port), namespace)
        self.last_time = 0

    def reset_timing(self):
        self.last_time = time.time()

    def timing(self, stat):
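        # Report the elapsed time since the last timing()/reset_timing() call,
        # in milliseconds, and restart the clock.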
        current_time = time.time()
        duration = (current_time - self.last_time) * 1000
        self.last_time = current_time
        self.client.timing(stat, duration)
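
# A minimal usage sketch for SaneMetrics (hypothetical names; assumes a
# statsd daemon is listening on the given host and port):
#
#     metrics = SaneMetrics('localhost', 8125, namespace='tarmac')
#     metrics.reset_timing()
#     do_some_work()            # hypothetical workload
#     metrics.timing('work')    # reports elapsed ms since reset_timing()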


def main():
    # XXX cprov 20140903: Sets the logging level to INFO for the entire
    # procedure. It should never be lowered to DEBUG, otherwise credentials
    # leak via python-keystone requests.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")

    # Set up statsd metrics from 'CI_STATSD_URI'. If it's not defined,
    # metrics default to localhost:8125 and effectively go to limbo
    # (statsd uses UDP, so packets nobody listens for are dropped silently).
    statsd_uri = os.environ.get('CI_STATSD_URI', 'localhost:8125')
    namespace = os.environ.get('CI_STATSD_NAMESPACE', 'tarmac')

    # XXX cprov 20141120: Note that we also use timers for counting events
    # (the 'counters.*' namespace). That's because counters (and gauges)
    # currently do not work as (we) expect in the versions of the
    # graphite+carbon+txstatsd packages available in Ubuntu. The essential
    # problem is that counting samples are propagated to future intervals
    # instead of being aggregated (summed). Timer counts work as we expect.
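    # e.g. CI_STATSD_URI='statsd.internal:8125' (hypothetical value) splits
    # into the ('statsd.internal', '8125') host/port pair below.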
    metrics = SaneMetrics(*statsd_uri.split(':'), namespace=namespace)

    def die(signum, frame):
        """Die as cleanly as softwarely possible.

        We rely on atexit() handlers to remove various temporary directories
        so we need to call sys.exit().
        """
        metrics.timing('counters.failures.killed')
        logging.info('Received signal {}, exiting'.format(signum))
        # We don't want to be interrupted while we process the signal, we're
        # exiting anyway.
        signal.signal(signum, signal.SIG_IGN)

        # When tarmac kills us, we'll exit with a -SIGNAL return code, so the
        # value below will be overridden.
        sys.exit(42)

    # Be ready to die. We use some atexit handlers that need to be called in
    # all cases or we'll leak in various places.
    signal.signal(signal.SIGINT, die)
    signal.signal(signal.SIGTERM, die)

    # venv.install() re-executes this script inside the just-created virtual
    # environment. We store the metrics start time in an environment
    # variable so it can be restored.
    if os.environ.get('TARMAC_METRIC_STARTED') is None:
        metrics.reset_timing()
        metrics.timing('counters.attempts')
        os.environ['TARMAC_METRIC_STARTED'] = str(metrics.last_time)
    else:
        metrics.last_time = float(os.environ.get('TARMAC_METRIC_STARTED'))
        os.environ.pop('TARMAC_METRIC_STARTED')

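    # virtualenv sets sys.real_prefix on the interpreters it creates, so this
    # detects that we're running inside the re-exec'd copy.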
    if hasattr(sys, 'real_prefix') and os.environ.get('REEXECED_UNDER_VENV'):
        # Inside the forked process. Clean up on exit.
        atexit.register(venv.delete)
        metrics.timing('setup')
        if not fake_out_branch():
            metrics.timing('counters.failures.fake_out_branch')
            return 1

    if not check_lxc_mounted():
        logging.error('/var/lib/lxc not mounted.')
        metrics.timing('counters.failures.lxc_not_mounted')
        return 1

    # TODO ev 2014-07-04 Delete all the swift containers created during
    # testing.

    # Run both the deployment and tests under a virtualenv.
    try:
        venv.install()
    except:
        # XXX cprov 2014-11-13: there is a chance for failures during the
        # venv creation before this code is re-executed. We don't know
        # exactly if and/or how they happen, thus the bare exception and
        # a simple counter. We will have a better idea after we start
        # collecting and investigating stats.
        metrics.timing('counters.failures.venv')
        raise

    from testing import run_tests
    # deploy.py isn't in our pythonpath when run as a script
    sys.path.append(os.path.abspath('juju-deployer'))
    import deploy

    try:
        with open('/dev/null', 'w') as null:
            subprocess.check_call(['juju', 'status'], stdout=null, stderr=null)
    except subprocess.CalledProcessError:
        # The environment doesn't exist, so `juju status` fails.
        pass
    else:
        # The environment wasn't cleaned up from the last run, but that's to be
        # expected. It's left around for debugging between runs.
        if not destroy_environment():
            metrics.timing('counters.failures.destroy')
            return 1
        metrics.timing('destroy')

    logging.info('Deploying.')
    t = time.time()
    # Bootstrap, then deploy.
    if not bootstrap() or deploy.main() != 0:
        metrics.timing('counters.failures.deploy')
        return 1
    metrics.timing('deploy')

    logging.info('Done in %.2fs' % (time.time() - t))
    result = run_tests.main(sys.argv[1:], sys.stdout, sys.stderr)
    if not result:
        metrics.timing('counters.success')
        metrics.timing('testing')
    else:
        metrics.timing('counters.failures.testing')

    return result


if __name__ == '__main__':
    sys.exit(main())