~veebers/juju-ci-tools/model_migration_check_all_units_of_charm

#!/usr/bin/env python
from __future__ import print_function
__metaclass__ = type


from argparse import ArgumentParser
from contextlib import (
    contextmanager,
    nested,
)
import glob
import json
import logging
import os
import random
import re
import shutil
import string
import subprocess
import sys
import time

from chaos import background_chaos
from jujuconfig import (
    get_jenv_path,
    get_juju_home,
    translate_to_env,
)
from jujupy import (
    EnvJujuClient,
    get_cache_path,
    get_local_root,
    jes_home_path,
    SimpleEnvironment,
    temp_bootstrap_env,
)
from remote import (
    remote_from_address,
    remote_from_unit,
)
from substrate import (
    destroy_job_instances,
    LIBVIRT_DOMAIN_RUNNING,
    resolve_remote_dns_names,
    run_instances,
    start_libvirt_domain,
    stop_libvirt_domain,
    verify_libvirt_domain,
)
from utility import (
    add_basic_testing_arguments,
    configure_logging,
    ensure_deleted,
    PortTimeoutError,
    print_now,
    until_timeout,
    wait_for_port,
)


def destroy_environment(client, instance_tag):
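    """Destroy the environment and any job instances on manual AWS envs."""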
    client.destroy_environment()
    if (client.env.config['type'] == 'manual'
            and 'AWS_ACCESS_KEY' in os.environ):
        destroy_job_instances(instance_tag)


def deploy_dummy_stack(client, charm_prefix):
    """"Deploy a dummy stack in the specified environment.
    """
    client.deploy(charm_prefix + 'dummy-source')
    token = get_random_string()
    client.juju('set', ('dummy-source', 'token=%s' % token))
    client.deploy(charm_prefix + 'dummy-sink')
    client.juju('add-relation', ('dummy-source', 'dummy-sink'))
    client.juju('expose', ('dummy-sink',))
    if client.env.kvm or client.env.maas:
        # A single virtual machine may need up to 30 minutes before
        # "apt-get update" and other initialisation steps are
        # finished; two machines initializing concurrently may
        # need even 40 minutes. In addition Windows image blobs or
        # any system deployment using MAAS requires extra time.
        client.wait_for_started(3600)
    else:
        client.wait_for_started()
    check_token(client, token)


GET_TOKEN_SCRIPT = """
        for x in $(seq 120); do
          if [ -f /var/run/dummy-sink/token ]; then
            if [ "$(cat /var/run/dummy-sink/token)" != "" ]; then
              break
            fi
          fi
          sleep 1
        done
        cat /var/run/dummy-sink/token
    """


def check_token(client, token):
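    """Poll the dummy-sink unit until it reports the expected token."""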
    # Wait up to 120 seconds for token to be created.
    # Utopic is slower, maybe because the devel series gets more
    # package updates.
    logging.info('Retrieving token.')
    remote = remote_from_unit(client, "dummy-sink/0")
    # Update remote with real address if needed.
    resolve_remote_dns_names(client.env, [remote])
    start = time.time()
    while True:
        if remote.is_windows():
            result = remote.cat("%ProgramData%\\dummy-sink\\token")
        else:
            result = remote.run(GET_TOKEN_SCRIPT)
        result = re.match(r'([^\n\r]*)\r?\n?', result).group(1)
        if result == token:
            logging.info("Token matches expected %r", result)
            return
        if time.time() - start > 120:
            raise ValueError('Token is %r' % result)
        logging.info("Found token %r expected %r", result, token)
        time.sleep(5)


def get_random_string():
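    """Return a random 20-character string of uppercase letters and digits."""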
    allowed_chars = string.ascii_uppercase + string.digits
    return ''.join(random.choice(allowed_chars) for n in range(20))


def _can_run_ssh():
    """Returns true if local system can use ssh to access remote machines"""
    # When client is run on a windows machine, we have no local ssh binary.
    return sys.platform != "win32"


def dump_env_logs(client, bootstrap_host, artifacts_dir, runtime_config=None):
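    """Copy logs from the environment's machines to artifacts_dir.

    The logs are gzipped and the runtime config, if given, is retained
    alongside them.
    """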
    if client.env.local:
        logging.info("Retrieving logs for local environment")
        copy_local_logs(client.env, artifacts_dir)
    else:
        remote_machines = get_remote_machines(client, bootstrap_host)

        for machine_id in sorted(remote_machines, key=int):
            remote = remote_machines[machine_id]
            if not _can_run_ssh() and not remote.is_windows():
                logging.info("No ssh, skipping logs for machine-%s using %r",
                             machine_id, remote)
                continue
            logging.info("Retrieving logs for machine-%s using %r", machine_id,
                         remote)
            machine_dir = os.path.join(artifacts_dir,
                                       "machine-%s" % machine_id)
            os.mkdir(machine_dir)
            copy_remote_logs(remote, machine_dir)
    archive_logs(artifacts_dir)
    retain_config(runtime_config, artifacts_dir)


def retain_config(runtime_config, log_directory):
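    """Copy the runtime config to log_directory, returning True on success."""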
    if not runtime_config:
        return False

    try:
        shutil.copy(runtime_config, log_directory)
        return True
    except IOError:
        print_now("Failed to copy file. Source: %s Destination: %s" %
                  (runtime_config, log_directory))
    return False


def dump_juju_timings(client, log_directory):
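    """Write the client's juju command timings as JSON to log_directory."""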
    try:
        with open(os.path.join(log_directory, 'juju_command_times.json'),
                  'w') as timing_file:
            json.dump(client.get_juju_timings(), timing_file, indent=2,
                      sort_keys=True)
            timing_file.write('\n')
    except Exception as e:
        print_now("Failed to save timings")
        print_now(str(e))


def get_remote_machines(client, bootstrap_host):
    """Return a dict of machine_id to remote machines.

    A bootstrap_host address may be provided as a fallback for machine 0 if
    status fails. For some providers such as MAAS the dns-name will be
    resolved to a real ip address using the substrate api.
    """
    # Try to get machine details from environment if possible.
    machines = dict(iter_remote_machines(client))
    # The bootstrap host is added as a fallback in case status failed.
    if bootstrap_host and '0' not in machines:
        machines['0'] = remote_from_address(bootstrap_host)
    # Update remote machines in place with real addresses if the substrate
    # requires it.
    resolve_remote_dns_names(client.env, machines.itervalues())
    return machines


def iter_remote_machines(client):
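    """Yield (machine_id, remote) for each machine that has a dns-name."""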
    try:
        status = client.get_status()
    except Exception as err:
        logging.warning("Failed to retrieve status for dumping logs: %s", err)
        return

    for machine_id, machine in status.iter_machines():
        hostname = machine.get('dns-name')
        if hostname:
            remote = remote_from_address(hostname, machine.get('series'))
            yield machine_id, remote


def archive_logs(log_dir):
    """Compress log files in given log_dir using gzip."""
    log_files = glob.glob(os.path.join(log_dir, '*.log'))
    if log_files:
        subprocess.check_call(['gzip', '--best', '-f'] + log_files)


lxc_template_glob = '/var/lib/juju/containers/juju-*-lxc-template/*.log'


def copy_local_logs(env, directory):
    """Copy logs for all machines in local environment."""
    local = get_local_root(get_juju_home(), env)
    log_names = [os.path.join(local, 'cloud-init-output.log')]
    log_names.extend(glob.glob(os.path.join(local, 'log', '*.log')))
    log_names.extend(glob.glob(lxc_template_glob))
    try:
        subprocess.check_call(['sudo', 'chmod', 'go+r'] + log_names)
        subprocess.check_call(['cp'] + log_names + [directory])
    except subprocess.CalledProcessError as e:
        logging.warning("Could not retrieve local logs: %s", e)


def copy_remote_logs(remote, directory):
    """Copy as many logs from the remote host as possible to the directory."""
    # This list of names must be in the order of creation to ensure they
    # are retrieved.
    if remote.is_windows():
        log_paths = [
            "%ProgramFiles(x86)%\\Cloudbase Solutions\\Cloudbase-Init\\log\\*",
            "C:\\Juju\\log\\juju\\*.log",
        ]
    else:
        log_paths = [
            '/var/log/cloud-init*.log',
            '/var/log/juju/*.log',
        ]

        try:
            wait_for_port(remote.address, 22, timeout=60)
        except PortTimeoutError:
            logging.warning("Could not dump logs because port 22 was closed.")
            return

        try:
            remote.run('sudo chmod go+r /var/log/juju/*')
        except subprocess.CalledProcessError as e:
            # The juju log dir is not created until after cloud-init succeeds.
            logging.warning("Could not allow access to the juju logs:")
            logging.warning(e.output)

    try:
        remote.copy(directory, log_paths)
    except subprocess.CalledProcessError as e:
        # The juju logs will not exist if cloud-init failed.
        logging.warning("Could not retrieve some or all logs:")
        logging.warning(e.output)


def assess_juju_run(client):
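    """Run a command on the dummy services via 'juju run'.

    Raise ValueError if any unit returns a non-zero code.
    """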
    responses = client.get_juju_output('run', '--format', 'json', '--service',
                                       'dummy-source,dummy-sink', 'uname')
    responses = json.loads(responses)
    for machine in responses:
        if machine.get('ReturnCode', 0) != 0:
            raise ValueError('juju run on machine %s returned %d: %s' % (
                             machine.get('MachineId'),
                             machine.get('ReturnCode'),
                             machine.get('Stderr')))
    logging.info(
        "juju run succeeded on machines: %r",
        [str(machine.get("MachineId")) for machine in responses])
    return responses


def assess_upgrade(old_client, juju_path, skip_juju_run=False):
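    """Upgrade the environment with juju_path and verify it still works."""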
    client = EnvJujuClient.by_version(old_client.env, juju_path,
                                      old_client.debug)
    upgrade_juju(client)
    if client.env.config['type'] == 'maas':
        timeout = 1200
    else:
        timeout = 600
    client.wait_for_version(client.get_matching_agent_version(), timeout)
    if not skip_juju_run:
        assess_juju_run(client)
    token = get_random_string()
    client.juju('set', ('dummy-source', 'token=%s' % token))
    check_token(client, token)


def upgrade_juju(client):
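    """Point the client at the testing tools metadata and upgrade juju."""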
    client.set_testing_tools_metadata_url()
    tools_metadata_url = client.get_env_option('tools-metadata-url')
    print(
        'The tools-metadata-url is %s' % tools_metadata_url)
    client.upgrade_juju()


def deploy_job_parse_args(argv=None):
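    """Parse the command line arguments for deploy_job."""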
    parser = ArgumentParser('deploy_job')
    add_basic_testing_arguments(parser)
    parser.add_argument('--upgrade', action="store_true", default=False,
                        help='Perform an upgrade test.')
    parser.add_argument('--with-chaos', default=0, type=int,
                        help='Deploy and run Chaos Monkey in the background.')
    parser.add_argument('--jes', action='store_true',
                        help='Use JES to control environments.')
    parser.add_argument('--pre-destroy', action='store_true',
                        help='Destroy any environment in the way first.')
    return parser.parse_args(argv)


def deploy_job():
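    """Parse arguments and run the deploy job in a temporary environment."""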
    args = deploy_job_parse_args()
    configure_logging(args.verbose)
    series = args.series
    if series is None:
        series = 'precise'
    charm_prefix = 'local:{}/'.format(series)
    # A windows state server is not needed to test windows charms; trusty is
    # faster.
    if series.startswith("win"):
        logging.info('Setting default series to trusty for windows deploy.')
        series = 'trusty'
    return _deploy_job(args.temp_env_name, args.env, args.upgrade,
                       charm_prefix, args.bootstrap_host, args.machine,
                       series, args.logs, args.debug, args.juju_bin,
                       args.agent_url, args.agent_stream,
                       args.keep_env, args.upload_tools, args.with_chaos,
                       args.jes, args.pre_destroy)


def update_env(env, new_env_name, series=None, bootstrap_host=None,
               agent_url=None, agent_stream=None):
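    """Update env in place with the temporary environment's settings."""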
    # Rename to the new name.
    env.environment = new_env_name
    env.config['name'] = new_env_name
    if series is not None:
        env.config['default-series'] = series
    if bootstrap_host is not None:
        env.config['bootstrap-host'] = bootstrap_host
    if agent_url is not None:
        env.config['tools-metadata-url'] = agent_url
    if agent_stream is not None:
        env.config['agent-stream'] = agent_stream


@contextmanager
def boot_context(temp_env_name, client, bootstrap_host, machines, series,
                 agent_url, agent_stream, log_dir, keep_env, upload_tools,
                 permanent=False):
    """Create a temporary environment in a context manager to run tests in.

    Bootstrap a new environment from a temporary config that is suitable to
    run tests in. Logs will be collected from the machines. The environment
    will be destroyed when the test completes or there is an unrecoverable
    error.

    The temporary environment is created by updating an EnvJujuClient's config
    with series, agent_url, agent_stream.

    :param temp_env_name: a unique name for the juju env, such as a Jenkins
        job name.
    :param client: an EnvJujuClient.
    :param bootstrap_host: None, or the address of a manual or MAAS host to
        bootstrap on.
    :param machines: [] or a list of machines to add to a manual env
        before deploying services.
    :param series: None or the default-series for the temp config.
    :param agent_url: None or the agent-metadata-url for the temp config.
    :param agent_stream: None or the agent-stream for the temp config.
    :param log_dir: The path to the directory to store logs.
    :param keep_env: If True, the environment is not destroyed and is kept
        alive for an autopsy.
    :param upload_tools: If True, upload the local agent instead of using
        streams.
    """
    created_machines = False
    running_domains = dict()
    try:
        if client.env.config['type'] == 'manual' and bootstrap_host is None:
            instances = run_instances(3, temp_env_name)
            created_machines = True
            bootstrap_host = instances[0][1]
            machines.extend(i[1] for i in instances[1:])
        if client.env.config['type'] == 'maas' and machines:
            for machine in machines:
                name, URI = machine.split('@')
                # Record domains that are already running so they can be left
                # running when the test finishes.
                if verify_libvirt_domain(URI, name, LIBVIRT_DOMAIN_RUNNING):
                    running_domains[machine] = True
                    logging.info("%s is already running", name)
                else:
                    running_domains[machine] = False
                    logging.info("Attempting to start %s at %s", name, URI)
                    status_msg = start_libvirt_domain(URI, name)
                    logging.info("%s", status_msg)
            # No further handling of machines down the line is required.
            machines = []

        update_env(client.env, temp_env_name, series=series,
                   bootstrap_host=bootstrap_host, agent_url=agent_url,
                   agent_stream=agent_stream)
        try:
            host = bootstrap_host
            ssh_machines = [] + machines
            if host is not None:
                ssh_machines.append(host)
            for machine in ssh_machines:
                logging.info('Waiting for port 22 on %s' % machine)
                wait_for_port(machine, 22, timeout=120)
            juju_home = get_juju_home()
            jenv_path = get_jenv_path(juju_home, client.env.environment)
            ensure_deleted(jenv_path)
            try:
                with temp_bootstrap_env(juju_home, client,
                                        permanent=permanent):
                    client.bootstrap(upload_tools)
            except:
                # If run from a windows machine, we may not have ssh to get
                # the logs.
                if host is not None and _can_run_ssh():
                    remote = remote_from_address(host, series=series)
                    copy_remote_logs(remote, log_dir)
                    archive_logs(log_dir)
                raise
            try:
                if host is None:
                    host = get_machine_dns_name(client, 0)
                if host is None:
                    raise Exception('Could not get machine 0 host')
                try:
                    yield
                except BaseException as e:
                    logging.exception(e)
                    sys.exit(1)
            finally:
                safe_print_status(client)
                jes_enabled = client.is_jes_enabled()
                if jes_enabled:
                    runtime_config = get_cache_path(client.juju_home)
                else:
                    runtime_config = get_jenv_path(client.juju_home,
                                                   client.env.environment)
                if host is not None:
                    dump_env_logs(client, host, log_dir,
                                  runtime_config=runtime_config)
                if not keep_env:
                    if jes_enabled:
                        client.juju(
                            'system kill', (client.env.environment, '-y'),
                            include_e=False, check=False, timeout=600)
                    else:
                        client.destroy_environment()
        finally:
            if created_machines and not keep_env:
                destroy_job_instances(temp_env_name)
    finally:
        logging.info(
            'Juju command timings: {}'.format(client.get_juju_timings()))
        dump_juju_timings(client, log_dir)
        if client.env.config['type'] == 'maas' and not keep_env:
            logging.info("Waiting for destroy-environment to complete")
            time.sleep(90)
            for machine, running in running_domains.items():
                name, URI = machine.split('@')
                if running:
                    logging.warning("%s at %s was running when deploy_job "
                                    "started. Shutting it down to ensure a "
                                    "clean environment." % (name, URI))
                logging.info("Attempting to stop %s at %s" % (name, URI))
                status_msg = stop_libvirt_domain(URI, name)
                logging.info("%s" % status_msg)


def _deploy_job(temp_env_name, base_env, upgrade, charm_prefix, bootstrap_host,
                machines, series, log_dir, debug, juju_path, agent_url,
                agent_stream, keep_env, upload_tools, with_chaos, use_jes,
                pre_destroy):
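    """Deploy the dummy stack in a freshly bootstrapped environment.

    Optionally run Chaos Monkey during deployment and assess an upgrade
    afterwards.
    """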
    start_juju_path = None if upgrade else juju_path
    if sys.platform == 'win32':
        # Ensure OpenSSH is never in the path for win tests.
        sys.path = [p for p in sys.path if 'OpenSSH' not in p]
    if pre_destroy:
        client = EnvJujuClient.by_version(
            SimpleEnvironment(temp_env_name, {}), juju_path, debug)
        if use_jes:
            client.enable_jes()
            client.juju_home = jes_home_path(client.juju_home, temp_env_name)
        client.destroy_environment()
    client = EnvJujuClient.by_version(
        SimpleEnvironment.from_config(base_env), start_juju_path, debug)
    if use_jes:
        client.enable_jes()
    with boot_context(temp_env_name, client, bootstrap_host, machines,
                      series, agent_url, agent_stream, log_dir, keep_env,
                      upload_tools, permanent=use_jes):
        if machines is not None:
            client.add_ssh_machines(machines)
        if sys.platform in ('win32', 'darwin'):
            # The win and osx client tests only verify the client
            # can bootstrap and call the state-server.
            return
        client.juju('status', ())
        if with_chaos > 0:
            manager = background_chaos(temp_env_name, client, log_dir,
                                       with_chaos)
        else:
            # Create a no-op context manager, to avoid duplicate calls of
            # deploy_dummy_stack(), as was the case prior to this revision.
            manager = nested()
        with manager:
            deploy_dummy_stack(client, charm_prefix)
        is_windows_charm = charm_prefix.startswith("local:win")
        if not is_windows_charm:
            assess_juju_run(client)
        if upgrade:
            client.juju('status', ())
            assess_upgrade(client, juju_path, skip_juju_run=is_windows_charm)


def safe_print_status(client):
    """Show the output of juju status without raising exceptions."""
    try:
        client.juju('status', ())
    except Exception as e:
        logging.exception(e)


def get_machine_dns_name(client, machine):
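    """Wait up to ten minutes for a usable dns-name and return it."""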
    timeout = 600
    for remaining in until_timeout(timeout):
        bootstrap = client.get_status(
            timeout=timeout).status['machines'][str(machine)]
        host = bootstrap.get('dns-name')
        if host is not None and not host.startswith('172.'):
            return host


def wait_for_state_server_to_shutdown(host, client, instance_id):
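    """Wait for the state-server port on host to close.

    On openstack, also wait for the instance to disappear from 'nova list'.
    """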
    print_now("Waiting for port to close on %s" % host)
    wait_for_port(host, 17070, closed=True)
    print_now("Closed.")
    provider_type = client.env.config.get('type')
    if provider_type == 'openstack':
        environ = dict(os.environ)
        environ.update(translate_to_env(client.env.config))
        for ignored in until_timeout(300):
            output = subprocess.check_output(['nova', 'list'], env=environ)
            if instance_id not in output:
                print_now('{} was removed from nova list'.format(instance_id))
                break
        else:
            raise Exception(
                '{} was not deleted:\n{}'.format(instance_id, output))