# Windows Azure Linux Agent
#
# Copyright 2014 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requires Python 2.4+ and OpenSSL 1.0+
32
import azurelinuxagent.common.conf as conf
33
import azurelinuxagent.common.logger as logger
34
import azurelinuxagent.common.utils.fileutil as fileutil
35
import azurelinuxagent.common.utils.restutil as restutil
36
import azurelinuxagent.common.utils.textutil as textutil
38
from azurelinuxagent.common.event import add_event, WALAEventOperation
39
from azurelinuxagent.common.exception import UpdateError, ProtocolError
40
from azurelinuxagent.common.future import ustr
41
from azurelinuxagent.common.osutil import get_osutil
42
from azurelinuxagent.common.protocol import get_protocol_util
43
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
44
from azurelinuxagent.common.version import AGENT_NAME, AGENT_VERSION, AGENT_LONG_VERSION, \
45
AGENT_DIR_GLOB, AGENT_PKG_GLOB, \
46
AGENT_PATTERN, AGENT_NAME_PATTERN, AGENT_DIR_PATTERN, \
47
CURRENT_AGENT, CURRENT_VERSION, \
48
is_current_agent_installed
50
from azurelinuxagent.ga.exthandlers import HandlerManifest
53
# File names used inside each agent's directory.
AGENT_ERROR_FILE = "error.json"  # File name for agent error record
AGENT_MANIFEST_FILE = "HandlerManifest.json"  # Joined onto the agent directory by get_agent_manifest_path()

# Child (goal state) agent supervision, all in seconds unless noted.
CHILD_HEALTH_INTERVAL = 15 * 60  # How long run_latest() polls a freshly launched child
CHILD_LAUNCH_INTERVAL = 5 * 60  # Window in which launch attempts are counted
CHILD_LAUNCH_RESTART_MAX = 3  # Launch attempts (a count) tolerated inside that window
CHILD_POLL_INTERVAL = 60  # Sleep between polls of the child process

# Failure bookkeeping.
MAX_FAILURE = 3  # Max failure allowed for agent before blacklisted
RETAIN_INTERVAL = 24 * 60 * 60  # Retain interval for black list (seconds)

# Main-loop pacing, in seconds.
GOAL_STATE_INTERVAL = 25  # Sleep between main-loop passes and orphan checks
REPORT_STATUS_INTERVAL = 15  # NOTE(review): not referenced in the visible code -- confirm usage

# Default budget for waiting on a previous agent process before it is killed.
# NOTE(review): 15 * 60 * 60 seconds is 15 hours -- confirm that is intended.
ORPHAN_WAIT_INTERVAL = 15 * 60 * 60

# Name of the sentinel file kept in the lib directory (see _sentinal_file_path()).
AGENT_SENTINAL_FILE = "current_version"
72
def get_update_handler():
    """Return a newly constructed UpdateHandler."""
    return UpdateHandler()
77
major_version = platform.python_version_tuple()[0]
78
return "python" if int(major_version) <= 2 else "python{0}".format(major_version)
81
class UpdateHandler(object):
84
self.osutil = get_osutil()
85
self.protocol_util = get_protocol_util()
89
self.last_attempt_time = None
93
self.child_agent = None
94
self.child_launch_time = None
95
self.child_launch_attempts = 0
96
self.child_process = None
98
self.signal_handler = None
101
def run_latest(self):
103
This method is called from the daemon to find and launch the most
104
current, downloaded agent.
107
- Most events should be tagged to the launched agent (agent_version)
110
if self.child_process is not None:
111
raise Exception("Illegal attempt to launch multiple goal state Agent processes")
113
if self.signal_handler is None:
114
self.signal_handler = signal.signal(signal.SIGTERM, self.forward_signal)
116
latest_agent = self.get_latest_agent()
117
if latest_agent is None:
118
logger.info(u"Installed Agent {0} is the most current agent", CURRENT_AGENT)
119
agent_cmd = "python -u {0} -run-exthandlers".format(sys.argv[0])
120
agent_dir = os.getcwd()
121
agent_name = CURRENT_AGENT
122
agent_version = CURRENT_VERSION
124
logger.info(u"Determined Agent {0} to be the latest agent", latest_agent.name)
125
agent_cmd = latest_agent.get_agent_cmd()
126
agent_dir = latest_agent.get_agent_dir()
127
agent_name = latest_agent.name
128
agent_version = latest_agent.version
132
# Launch the correct Python version for python-based agents
133
cmds = shlex.split(agent_cmd)
134
if cmds[0].lower() == "python":
135
cmds[0] = get_python_cmd()
136
agent_cmd = " ".join(cmds)
138
self._evaluate_agent_health(latest_agent)
140
self.child_process = subprocess.Popen(
146
logger.info(u"Agent {0} launched with command '{1}'", agent_name, agent_cmd)
149
start_time = time.time()
150
while (time.time() - start_time) < CHILD_HEALTH_INTERVAL:
151
time.sleep(CHILD_POLL_INTERVAL)
152
ret = self.child_process.poll()
156
if ret is None or ret <= 0:
157
msg = u"Agent {0} launched with command '{1}' is successfully running".format(
163
version=agent_version,
164
op=WALAEventOperation.Enable,
169
ret = self.child_process.wait()
172
msg = u"Agent {0} launched with command '{1}' failed with return code: {2}".format(
179
version=agent_version,
180
op=WALAEventOperation.Enable,
184
if ret is not None and ret > 0:
185
msg = u"Agent {0} launched with command '{1}' returned code: {2}".format(
190
if latest_agent is not None:
191
latest_agent.mark_failure()
193
except Exception as e:
194
msg = u"Agent {0} launched with command '{1}' failed with exception: {2}".format(
201
version=agent_version,
202
op=WALAEventOperation.Enable,
205
if latest_agent is not None:
206
latest_agent.mark_failure(is_fatal=True)
208
self.child_process = None
213
This is the main loop which watches for agent and extension updates.
216
logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)
218
# Launch monitoring threads
219
from azurelinuxagent.ga.monitor import get_monitor_handler
220
get_monitor_handler().run()
222
from azurelinuxagent.ga.env import get_env_handler
223
get_env_handler().run()
225
from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
226
exthandlers_handler = get_exthandlers_handler()
227
migrate_handler_state()
230
self._ensure_no_orphans()
231
self._emit_restart_event()
233
# TODO: Add means to stop running
235
if self._is_orphaned:
236
logger.info("Goal state agent {0} was orphaned -- exiting", CURRENT_AGENT)
239
if self._ensure_latest_agent():
240
if len(self.agents) > 0:
242
u"Agent {0} discovered {1} as an update and will exit",
247
exthandlers_handler.run()
249
time.sleep(GOAL_STATE_INTERVAL)
251
except Exception as e:
252
logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(e))
260
def forward_signal(self, signum, frame):
262
# - At present, the handler is registered only for SIGTERM.
263
# However, clean shutdown is both SIGTERM and SIGKILL.
264
# A SIGKILL handler is not being registered at this time to
265
# minimize perturbing the code.
266
if signum in (signal.SIGTERM, signal.SIGKILL):
269
if self.child_process is None:
273
u"Agent {0} forwarding signal {1} to {2}",
276
self.child_agent.name if self.child_agent is not None else CURRENT_AGENT)
277
self.child_process.send_signal(signum)
279
if self.signal_handler not in (None, signal.SIG_IGN, signal.SIG_DFL):
280
self.signal_handler(signum, frame)
281
elif self.signal_handler is signal.SIG_DFL:
282
if signum == signal.SIGTERM:
283
# TODO: This should set self.running to False vs. just exiting
287
def get_latest_agent(self):
289
If autoupdate is enabled, return the most current, downloaded,
290
non-blacklisted agent (if any).
291
Otherwise, return None (implying to use the installed agent).
294
if not conf.get_autoupdate_enabled():
298
available_agents = [agent for agent in self.agents if agent.is_available]
299
return available_agents[0] if len(available_agents) >= 1 else None
301
def _emit_restart_event(self):
302
if not self._is_clean_start:
303
msg = u"{0} unexpectedly restarted".format(CURRENT_AGENT)
307
version=CURRENT_VERSION,
308
op=WALAEventOperation.Restart,
315
def _ensure_latest_agent(self, base_version=CURRENT_VERSION):
316
# Ignore new agents if updating is disabled
317
if not conf.get_autoupdate_enabled():
321
if self.last_attempt_time is not None:
322
next_attempt_time = self.last_attempt_time + conf.get_autoupdate_frequency()
324
next_attempt_time = now
325
if next_attempt_time > now:
328
family = conf.get_autoupdate_gafamily()
329
logger.info("Checking for agent family {0} updates", family)
331
self.last_attempt_time = now
333
protocol = self.protocol_util.get_protocol()
334
manifest_list, etag = protocol.get_vmagent_manifests()
335
except Exception as e:
336
msg = u"Exception retrieving agent manifests: {0}".format(ustr(e))
340
op=WALAEventOperation.Download,
341
version=CURRENT_VERSION,
346
if self.last_etag is not None and self.last_etag == etag:
347
logger.info(u"Incarnation {0} has no agent updates", etag)
350
manifests = [m for m in manifest_list.vmAgentManifests if m.family == family]
351
if len(manifests) == 0:
352
logger.info(u"Incarnation {0} has no agent family {1} updates", etag, family)
356
pkg_list = protocol.get_vmagent_pkgs(manifests[0])
357
except ProtocolError as e:
358
msg= u"Incarnation {0} failed to get {1} package list: {2}".format(
365
op=WALAEventOperation.Download,
366
version=CURRENT_VERSION,
371
# Set the agents to those available for download at least as current as the existing agent
372
# and remove from disk any agent no longer reported to the VM.
374
# The code leaves on disk available, but blacklisted, agents so as to preserve the state.
375
# Otherwise, those agents could be again downloaded and inappropriately retried.
376
self._set_agents([GuestAgent(pkg=pkg) for pkg in pkg_list.versions])
378
self._filter_blacklisted_agents()
380
# Return True if agents more recent than the current are available
381
return len(self.agents) > 0 and self.agents[0].version > base_version
383
def _ensure_no_orphans(self, orphan_wait_interval=ORPHAN_WAIT_INTERVAL):
384
previous_pid_file, pid_file = self._write_pid_file()
385
if previous_pid_file is not None:
387
pid = fileutil.read_file(previous_pid_file)
388
wait_interval = orphan_wait_interval
389
while self.osutil.check_pid_alive(pid):
390
wait_interval -= GOAL_STATE_INTERVAL
391
if wait_interval <= 0:
393
u"{0} forcibly terminated orphan process {1}",
396
os.kill(pid, signal.SIGKILL)
400
u"{0} waiting for orphan process {1} to terminate",
403
time.sleep(GOAL_STATE_INTERVAL)
405
except Exception as e:
407
u"Exception occurred waiting for orphan agent to terminate: {0}",
411
def _evaluate_agent_health(self, latest_agent):
413
Evaluate the health of the selected agent: If it is restarting
414
too frequently, raise an Exception to force blacklisting.
416
if latest_agent is None:
417
self.child_agent = None
420
if self.child_agent is None or latest_agent.version != self.child_agent.version:
421
self.child_agent = latest_agent
422
self.child_launch_time = None
423
self.child_launch_attempts = 0
425
if self.child_launch_time is None:
426
self.child_launch_time = time.time()
428
self.child_launch_attempts += 1
430
if (time.time() - self.child_launch_time) <= CHILD_LAUNCH_INTERVAL \
431
and self.child_launch_attempts >= CHILD_LAUNCH_RESTART_MAX:
432
msg = u"Agent {0} restarted more than {1} times in {2} seconds".format(
433
self.child_agent.name,
434
CHILD_LAUNCH_RESTART_MAX,
435
CHILD_LAUNCH_INTERVAL)
439
def _filter_blacklisted_agents(self):
    """Drop blacklisted agents from ``self.agents``.

    Rebinds ``self.agents`` to a new list that keeps, in their existing
    order, only the agents whose ``is_blacklisted`` attribute is falsy.
    """
    self.agents = [agent for agent in self.agents if not agent.is_blacklisted]
443
def _get_pid_files(self):
444
pid_file = conf.get_agent_pid_file_path()
446
pid_dir = os.path.dirname(pid_file)
447
pid_name = os.path.basename(pid_file)
449
pid_re = re.compile("(\d+)_{0}".format(re.escape(pid_name)))
450
pid_files = [int(pid_re.match(f).group(1)) for f in os.listdir(pid_dir) if pid_re.match(f)]
453
pid_index = -1 if len(pid_files) <= 0 else pid_files[-1]
454
previous_pid_file = None \
456
else os.path.join(pid_dir, "{0}_{1}".format(pid_index, pid_name))
457
pid_file = os.path.join(pid_dir, "{0}_{1}".format(pid_index+1, pid_name))
458
return previous_pid_file, pid_file
461
def _is_clean_start(self):
462
if not os.path.isfile(self._sentinal_file_path()):
466
if fileutil.read_file(self._sentinal_file_path()) != CURRENT_AGENT:
468
except Exception as e:
470
u"Exception reading sentinal file {0}: {1}",
471
self._sentinal_file_path(),
477
def _is_orphaned(self):
478
parent_pid = os.getppid()
479
if parent_pid in (1, None):
482
if not os.path.isfile(conf.get_agent_pid_file_path()):
485
return fileutil.read_file(conf.get_agent_pid_file_path()) != ustr(parent_pid)
487
def _load_agents(self):
489
Load all non-blacklisted agents currently on disk.
491
if len(self.agents) <= 0:
493
path = os.path.join(conf.get_lib_dir(), "{0}-*".format(AGENT_NAME))
494
self._set_agents([GuestAgent(path=agent_dir)
495
for agent_dir in glob.iglob(path) if os.path.isdir(agent_dir)])
496
self._filter_blacklisted_agents()
497
except Exception as e:
498
logger.warn(u"Exception occurred loading available agents: {0}", ustr(e))
501
def _purge_agents(self):
503
Remove from disk all directories and .zip files of unknown agents
504
(without removing the current, running agent).
506
path = os.path.join(conf.get_lib_dir(), "{0}-*".format(AGENT_NAME))
508
known_versions = [agent.version for agent in self.agents]
509
if not is_current_agent_installed() and CURRENT_VERSION not in known_versions:
511
u"Running Agent {0} was not found in the agent manifest - adding to list",
513
known_versions.append(CURRENT_VERSION)
515
for agent_path in glob.iglob(path):
517
name = fileutil.trim_ext(agent_path, "zip")
518
m = AGENT_DIR_PATTERN.match(name)
519
if m is not None and FlexibleVersion(m.group(1)) not in known_versions:
520
if os.path.isfile(agent_path):
521
logger.info(u"Purging outdated Agent file {0}", agent_path)
522
os.remove(agent_path)
524
logger.info(u"Purging outdated Agent directory {0}", agent_path)
525
shutil.rmtree(agent_path)
526
except Exception as e:
527
logger.warn(u"Purging {0} raised exception: {1}", agent_path, ustr(e))
530
def _set_agents(self, agents=[]):
532
self.agents.sort(key=lambda agent: agent.version, reverse=True)
535
def _set_sentinal(self, agent=CURRENT_AGENT):
537
fileutil.write_file(self._sentinal_file_path(), agent)
538
except Exception as e:
540
u"Exception writing sentinal file {0}: {1}",
541
self._sentinal_file_path(),
545
def _sentinal_file_path(self):
    """Return the path of the sentinel file.

    The path is AGENT_SENTINAL_FILE joined onto the directory reported by
    ``conf.get_lib_dir()``.
    """
    return os.path.join(conf.get_lib_dir(), AGENT_SENTINAL_FILE)
549
if not os.path.isfile(self._sentinal_file_path()):
553
os.remove(self._sentinal_file_path())
554
except Exception as e:
556
u"Exception removing sentinal file {0}: {1}",
557
self._sentinal_file_path(),
561
def _write_pid_file(self):
562
previous_pid_file, pid_file = self._get_pid_files()
564
fileutil.write_file(pid_file, ustr(os.getpid()))
565
logger.info(u"{0} running as process {1}", CURRENT_AGENT, ustr(os.getpid()))
566
except Exception as e:
569
u"Expection writing goal state agent {0} pid to {1}: {2}",
573
return previous_pid_file, pid_file
576
class GuestAgent(object):
577
def __init__(self, path=None, pkg=None):
581
m = AGENT_DIR_PATTERN.match(path)
583
raise UpdateError(u"Illegal agent directory: {0}".format(path))
585
elif self.pkg is not None:
586
version = pkg.version
589
raise UpdateError(u"Illegal agent version: {0}".format(version))
590
self.version = FlexibleVersion(version)
592
location = u"disk" if path is not None else u"package"
593
logger.info(u"Instantiating Agent {0} from {1}", self.name, location)
597
self._ensure_downloaded()
602
return "{0}-{1}".format(AGENT_NAME, self.version)
604
def get_agent_cmd(self):
    """Return the enable command reported by this agent's manifest."""
    return self.manifest.get_enable_command()
607
def get_agent_dir(self):
    """Return this agent's directory: ``self.name`` under ``conf.get_lib_dir()``."""
    return os.path.join(conf.get_lib_dir(), self.name)
610
def get_agent_error_file(self):
    """Return the path of this agent's error record.

    The path is ``<lib dir>/<self.name>/<AGENT_ERROR_FILE>``, with the lib
    directory taken from ``conf.get_lib_dir()``.
    """
    return os.path.join(conf.get_lib_dir(), self.name, AGENT_ERROR_FILE)
613
def get_agent_manifest_path(self):
    """Return the path of AGENT_MANIFEST_FILE inside this agent's directory."""
    return os.path.join(self.get_agent_dir(), AGENT_MANIFEST_FILE)
616
def get_agent_pkg_path(self):
    """Return the path of this agent's package file.

    The path is ``<lib dir>/<self.name>`` with a ``.zip`` suffix appended,
    i.e. the package sits beside (not inside) the agent's directory.
    """
    return ".".join((os.path.join(conf.get_lib_dir(), self.name), "zip"))
619
def clear_error(self):
624
@property
def is_available(self):
    """Whether the agent is downloaded and not blacklisted.

    Exposed as a property because callers read it as a plain attribute
    (``agent.is_available``); as an ordinary method that expression would
    be a bound method and therefore always truthy.
    """
    return self.is_downloaded and not self.is_blacklisted
628
@property
def is_blacklisted(self):
    """Whether this agent's error record marks it as blacklisted.

    False when no error record is loaded (``self.error`` is None);
    otherwise the error record's own ``is_blacklisted`` value.

    Exposed as a property because callers read it as a plain attribute
    (``agent.is_blacklisted``, ``self.is_blacklisted``).
    """
    return self.error is not None and self.error.is_blacklisted
632
@property
def is_downloaded(self):
    """Whether the agent counts as downloaded.

    True when the agent is blacklisted (the manifest is then not checked),
    or when the file at ``get_agent_manifest_path()`` exists.

    Exposed as a property because callers read it as a plain attribute
    (``self.is_downloaded``).
    """
    return self.is_blacklisted or os.path.isfile(self.get_agent_manifest_path())
635
def mark_failure(self, is_fatal=False):
637
if not os.path.isdir(self.get_agent_dir()):
638
os.makedirs(self.get_agent_dir())
639
self.error.mark_failure(is_fatal=is_fatal)
642
logger.warn(u"Agent {0} is permanently blacklisted", self.name)
643
except Exception as e:
644
logger.warn(u"Agent {0} failed recording error state: {1}", self.name, ustr(e))
647
def _ensure_downloaded(self):
649
logger.info(u"Ensuring Agent {0} is downloaded", self.name)
651
if self.is_blacklisted:
652
logger.info(u"Agent {0} is blacklisted - skipping download", self.name)
655
if self.is_downloaded:
656
logger.info(u"Agent {0} was previously downloaded - skipping download", self.name)
657
self._load_manifest()
661
raise UpdateError(u"Agent {0} is missing package and download URIs".format(
666
self._load_manifest()
669
msg = u"Agent {0} downloaded successfully".format(self.name)
673
version=self.version,
674
op=WALAEventOperation.Install,
678
except Exception as e:
679
# Note the failure, blacklist the agent if the package downloaded
680
# - An exception with a downloaded package indicates the package
681
# is corrupt (e.g., missing the HandlerManifest.json file)
682
self.mark_failure(is_fatal=os.path.isfile(self.get_agent_pkg_path()))
684
msg = u"Agent {0} download failed with exception: {1}".format(self.name, ustr(e))
688
version=self.version,
689
op=WALAEventOperation.Install,
697
for uri in self.pkg.uris:
699
resp = restutil.http_get(uri.uri, chk_proxy=True)
700
if resp.status == restutil.httpclient.OK:
701
package = resp.read()
702
fileutil.write_file(self.get_agent_pkg_path(), bytearray(package), asbin=True)
703
logger.info(u"Agent {0} downloaded from {1}", self.name, uri.uri)
705
except restutil.HttpError as e:
706
logger.warn(u"Agent {0} download from {1} failed", self.name, uri.uri)
708
if not os.path.isfile(self.get_agent_pkg_path()):
709
msg = u"Unable to download Agent {0} from any URI".format(self.name)
712
op=WALAEventOperation.Download,
713
version=CURRENT_VERSION,
716
raise UpdateError(msg)
719
def _load_error(self):
721
if self.error is None:
722
self.error = GuestAgentError(self.get_agent_error_file())
724
logger.info(u"Agent {0} error state: {1}", self.name, ustr(self.error))
725
except Exception as e:
726
logger.warn(u"Agent {0} failed loading error state: {1}", self.name, ustr(e))
729
def _load_manifest(self):
730
path = self.get_agent_manifest_path()
731
if not os.path.isfile(path):
732
msg = u"Agent {0} is missing the {1} file".format(self.name, AGENT_MANIFEST_FILE)
733
raise UpdateError(msg)
735
with open(path, "r") as manifest_file:
737
manifests = json.load(manifest_file)
738
except Exception as e:
739
msg = u"Agent {0} has a malformed {1}".format(self.name, AGENT_MANIFEST_FILE)
740
raise UpdateError(msg)
741
if type(manifests) is list:
742
if len(manifests) <= 0:
743
msg = u"Agent {0} has an empty {1}".format(self.name, AGENT_MANIFEST_FILE)
744
raise UpdateError(msg)
745
manifest = manifests[0]
750
self.manifest = HandlerManifest(manifest)
751
if len(self.manifest.get_enable_command()) <= 0:
752
raise Exception(u"Manifest is missing the enable command")
753
except Exception as e:
754
msg = u"Agent {0} has an illegal {1}: {2}".format(
758
raise UpdateError(msg)
761
u"Agent {0} loaded manifest from {1}",
763
self.get_agent_manifest_path())
764
logger.verbose(u"Successfully loaded Agent {0} {1}: {2}",
767
ustr(self.manifest.data))
772
if os.path.isdir(self.get_agent_dir()):
773
shutil.rmtree(self.get_agent_dir())
775
zipfile.ZipFile(self.get_agent_pkg_path()).extractall(self.get_agent_dir())
777
except Exception as e:
778
msg = u"Exception unpacking Agent {0} from {1}: {2}".format(
780
self.get_agent_pkg_path(),
782
raise UpdateError(msg)
784
if not os.path.isdir(self.get_agent_dir()):
785
msg = u"Unpacking Agent {0} failed to create directory {1}".format(
787
self.get_agent_dir())
788
raise UpdateError(msg)
791
u"Agent {0} unpacked successfully to {1}",
793
self.get_agent_dir())
797
class GuestAgentError(object):
798
def __init__(self, path):
800
raise UpdateError(u"GuestAgentError requires a path")
807
def mark_failure(self, is_fatal=False):
    """Record one more failure on this error record.

    Stamps ``last_failure`` with the current time, increments
    ``failure_count`` and sets ``was_fatal`` to ``is_fatal``.

    :param is_fatal: whether this failure is fatal.
    """
    self.last_failure = time.time()
    self.failure_count += 1
    # NOTE(review): this overwrites rather than ORs the flag, so a later
    # non-fatal failure resets was_fatal to False -- confirm that is intended.
    self.was_fatal = is_fatal
814
self.last_failure = 0.0
815
self.failure_count = 0
816
self.was_fatal = False
819
def clear_old_failure(self):
820
if self.last_failure <= 0.0:
822
if self.last_failure < (time.time() - RETAIN_INTERVAL):
827
@property
def is_blacklisted(self):
    """Whether the recorded failures blacklist the agent.

    True when the last recorded failure was fatal, or when
    ``failure_count`` has reached MAX_FAILURE.

    Exposed as a property because the owning agent reads it as a plain
    attribute (``self.error.is_blacklisted``).
    """
    return self.was_fatal or self.failure_count >= MAX_FAILURE
831
if self.path is not None and os.path.isfile(self.path):
832
with open(self.path, 'r') as f:
833
self.from_json(json.load(f))
837
if os.path.isdir(os.path.dirname(self.path)):
838
with open(self.path, 'w') as f:
839
json.dump(self.to_json(), f)
842
def from_json(self, data):
843
self.last_failure = max(
845
data.get(u"last_failure", 0.0))
846
self.failure_count = max(
848
data.get(u"failure_count", 0))
849
self.was_fatal = self.was_fatal or data.get(u"was_fatal", False)
854
u"last_failure": self.last_failure,
855
u"failure_count": self.failure_count,
856
u"was_fatal" : self.was_fatal
861
return "Last Failure: {0}, Total Failures: {1}, Fatal: {2}".format(