3
juju status health check (not to be confused with juju health status)
5
A quick health check based on juju agent states reported in juju stat, plus
6
SSH and juju agent TCP port connection checks, and ping connectivity checks.
14
sys.path.insert(1, os.path.join(sys.path[0], '..'))
15
from common import osci_utils # noqa
18
USAGE = '''Usage: %prog [options]
20
juju status health check (not to be confused with juju health status)
22
A quick health check based on juju agent states reported in juju stat, plus
23
SSH and juju agent TCP port connection checks, and ping connectivity checks.
27
# NOTE(review): this listing of jshc_ssh_command_check is damaged - the bare
# integers between statements appear to be line numbers left behind by an
# extraction step, indentation is gone, and several lines are absent (the
# docstring terminator, the remaining keyword arguments of the func_timeout
# call and the success branch). Restore them from the real file before
# relying on this block.
#
# What the visible lines show: the juju SSH private key path is expanded from
# '~/.juju/ssh/juju_id_rsa', osci_utils.ssh_command_check is run through
# osci_utils.func_timeout with timeout=10, the outcome is logged, and a
# failure returns the tuple (False, 'host: <host>').
#
# NOTE(review): logging.warn is a deprecated alias of logging.warning -
# consider switching when this block is repaired.
def jshc_ssh_command_check(host='127.0.0.1'):
28
''' A locally reusable chunk to check SSH command success on a remote host.
31
pkey = os.path.expanduser('~/.juju/ssh/juju_id_rsa')
33
ret = osci_utils.func_timeout(timeout=10,
34
func_call=osci_utils.ssh_command_check,
40
logging.info('SSH command check OK for {}.'.format(host))
43
logging.warn('SSH command check FAIL for'
45
return False, 'host: {}'.format(host)
48
def jshc_tcp_socket_check(host="127.0.0.1", port=22, timeout=15):
    '''Check TCP socket connectivity to a host and log the outcome.

    :param host: address or hostname passed to osci_utils.port_knock.
    :param port: TCP port to connect to (defaults to 22).
    :param timeout: timeout value passed through to osci_utils.port_knock.
    :returns: (True, None) when port_knock reports success, otherwise
        (False, 'host: <host>').
    '''
    if osci_utils.port_knock(host=host, port=port, timeout=timeout):
        logging.info('Socket connect OK for {} port {}.'.format(host, port))
        # NOTE(review): the success return was not visible in the damaged
        # listing; (True, None) mirrors the two-element failure tuple below.
        # Confirm against the original file.
        return True, None

    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Socket connect FAIL for'
                    ' {} port {}'.format(host, port))
    return False, 'host: {}'.format(host)
61
def jshc_ping_check(host, check_hist):
    '''Ping a host unless it was already pinged, and log the outcome.

    :param host: address or hostname passed to osci_utils.ping_host.
    :param check_hist: container of check strings already performed. It is
        only read here (membership test); the caller is expected to record
        the returned check string.
    :returns: tuple (result, error, check_str). result is None when the
        host was already checked, True when the ping succeeded and False
        when it failed. error is 'host: <host>' on failure and None
        otherwise. check_str is the 'ping: <host>' history key.
    '''
    check_str = 'ping: {}'.format(host)

    # Skip hosts that were already pinged; no ping is attempted for them.
    if check_str in check_hist:
        logging.info('Already checked {}'.format(check_str))
        return None, None, check_str

    if osci_utils.ping_host(host=host):
        logging.info('Ping OK for: {}'.format(host))
        return True, None, check_str

    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Ping FAILED for: {}'.format(host))
    err_str = 'host: {}'.format(host)
    return False, err_str, check_str
82
_jstat = osci_utils.juju_stat()
83
except subprocess.CalledProcessError:
84
logging.error('Could not get juju status.')
87
if 'services' not in _jstat:
91
# NOTE(review): this listing of juju_status_health_check is damaged - the bare
# integers between statements appear to be line numbers left behind by an
# extraction step, indentation is gone, and many short lines are absent
# (for example the initialisation of `sick` and `check_hist`, the `else:` /
# `continue` lines and the tests applied to `ret`). The notes below describe
# only what the visible lines show.
#
# Visible behaviour: reads juju status via osci_utils.juju_stat(), walks
# j_stat['services'] (per unit) and j_stat['machines'] (per machine and per
# container), logs each agent-state, runs TCP socket, SSH command and ping
# checks against the reported addresses, collects failure labels in `sick`,
# and uses `check_hist` to avoid repeating a check for the same target.
def juju_status_health_check():
92
''' Check juju agent, instance, machine status and connectivity.
96
j_stat = osci_utils.juju_stat()
100
found_bootstrap_node = False
102
# per-service unit state check loop
103
logging.debug('Checking agents per service.')
104
# NOTE(review): dict.iteritems() exists only on Python 2; .items() works on
# both Python 2 and 3 (same for the other .iteritems() calls below).
for charm, services in j_stat['services'].iteritems():
105
logging.info('=== Checking service: {} ==='.format(charm))
107
# check for existing units
108
if 'units' not in services.keys():
109
logging.info('No unit(s) for {}'.format(charm))
112
# per-unit check loop
113
for unit, unit_data in services['units'].iteritems():
114
check_str = 'agent: {}'.format(unit)
115
if check_str in check_hist:
116
logging.info('Already checked {}'.format(check_str))
118
check_hist.add(check_str)
119
if unit_data['agent-state'] == 'started':
120
logging.info('Agent OK for unit: {}'.format(unit))
121
elif unit_data['agent-state'] == 'down':
122
logging.warn('Agent down for unit: {}'.format(unit))
123
sick.append('unit: {}'.format(unit))
125
logging.warn('Agent state unknown ({}) for unit: '
126
'{}'.format(unit_data['agent-state'],
128
sick.append('unit: {}'.format(unit))
130
# connectivity checks
131
if 'public-address' in unit_data.keys():
132
host = unit_data['public-address']
134
# SSH port connect check
135
check_str = 'ssh socket: {}'.format(host)
136
if check_str in check_hist:
137
logging.info('Already checked {}'.format(check_str))
139
check_hist.add(check_str)
140
ret = jshc_tcp_socket_check(host=host)
143
sick.append('unit: {}'.format(unit))
146
check_str = 'ssh cmd: {}'.format(host)
147
if check_str in check_hist:
148
logging.info('Already checked {}'.format(check_str))
150
check_hist.add(check_str)
151
ret = jshc_ssh_command_check(host=host)
154
sick.append('unit: {}'.format(unit))
157
# jshc_ping_check returns (result, error, check_str); ret[2] is the history
# key and ret[1] the error label.
ret = jshc_ping_check(host=host, check_hist=check_hist)
159
check_hist.add(ret[2])
161
sick.extend([ret[1], 'unit: {}'.format(unit)])
163
logging.info('No public-address for unit: {}'.format(unit))
165
# per-machine unit state check
166
logging.debug('Checking agents per machine.')
167
for mach, mach_data in j_stat['machines'].iteritems():
168
logging.info('=== Checking machine: {} ==='.format(mach))
171
if 'agent-state' not in mach_data.keys():
172
logging.warn('No agent-state reported for {}'.format(mach))
173
sick.append('machine: {}'.format(mach))
176
check_hist.add('agent: {}'.format(mach))
177
if mach_data['agent-state'] == 'started':
178
logging.info('Agent OK for machine: {}'.format(mach))
179
elif mach_data['agent-state'] == 'down':
180
logging.warn('Agent down for machine: {}'.format(mach))
181
sick.append('machine: {}'.format(mach))
183
logging.warn('Agent state unknown ({}) for '
184
'machine: {}'.format(mach_data['agent-state'],
186
sick.append('machine: {}'.format(mach))
188
# bootstrap node check
189
if 'state-server-member-status' in mach_data:
190
if mach_data['state-server-member-status'] == 'has-vote':
191
found_bootstrap_node = True
192
logging.info('Found bootstrap machine: {}'.format(mach))
193
if 'dns-name' in mach_data.keys():
194
host = mach_data['dns-name']
196
# juju agent port connect check
197
check_str = 'agent socket: {}'.format(host)
198
if check_str in check_hist:
199
logging.info('Already checked {}'.format(check_str))
201
check_hist.add(check_str)
202
ret = jshc_tcp_socket_check(host=host, port=17070)
205
sick.append('machine: {}'.format(mach))
208
check_str = 'ssh cmd: {}'.format(host)
209
if check_str in check_hist:
210
logging.info('Already checked {}'.format(check_str))
212
check_hist.add(check_str)
213
ret = jshc_ssh_command_check(host=host)
216
sick.append('machine: {}'.format(mach))
218
# connectivity checks
219
if 'dns-name' in mach_data.keys():
220
host = mach_data['dns-name']
222
# SSH port connect check
223
check_str = 'ssh socket: {}'.format(host)
224
if check_str in check_hist:
225
logging.info('Already checked {}'.format(check_str))
227
check_hist.add(check_str)
228
ret = jshc_tcp_socket_check(host=host)
231
sick.append('machine: {}'.format(mach))
234
check_str = 'ssh cmd: {}'.format(host)
235
if check_str in check_hist:
236
logging.info('Already checked {}'.format(check_str))
238
check_hist.add(check_str)
239
ret = jshc_ssh_command_check(host=host)
242
sick.append('machine: {}'.format(mach))
245
ret = jshc_ping_check(host=host, check_hist=check_hist)
247
check_hist.add(ret[2])
249
# NOTE(review): this formats `unit`, the variable left over from the
# per-service loop above, so the wrong name is reported (and `unit` is
# undefined if no service had units). Looks like it should be `mach`.
sick.extend([ret[1], 'machine: {}'.format(unit)])
251
# per-machine container state check
252
if 'containers' in mach_data:
253
for container in mach_data['containers']:
254
check_hist.add('agent: {}'.format(container))
255
# NOTE(review): the container branch tests mach_data['agent-state'], i.e. the
# hosting machine's state, not a state read from
# mach_data['containers'][container] - confirm whether the container's own
# agent-state was intended.
if mach_data['agent-state'] == 'started':
256
logging.info('Agent OK for container: '
257
'{}'.format(container))
258
elif mach_data['agent-state'] == 'down':
259
logging.warn('Agent down for container:'
260
' {}'.format(container))
261
sick.append('container: {}'.format(container))
263
logging.warn('Agent state unknown ({}) for container: '
264
'{}'.format(mach_data['agent-state'],
266
sick.append('container: {}'.format(container))
268
if 'dns-name' in mach_data['containers'][container]:
269
host = mach_data['containers'][container]['dns-name']
271
# SSH port connect check
272
check_str = 'ssh socket: {}'.format(host)
273
if check_str in check_hist:
274
logging.info('Already checked {}'.format(check_str))
276
check_hist.add(check_str)
277
ret = jshc_tcp_socket_check(host=host)
280
# NOTE(review): the format string has no '{}' placeholder, so the container
# name is silently dropped and the entry is just 'container: '.
sick.append('container: '.format(container))
283
check_str = 'ssh cmd: {}'.format(host)
284
if check_str in check_hist:
285
logging.info('Already checked {}'.format(check_str))
287
check_hist.add(check_str)
288
ret = jshc_ssh_command_check(host=host)
291
sick.append('container: {}'.format(container))
294
ret = jshc_ping_check(host=host, check_hist=check_hist)
296
check_hist.add(ret[2])
298
# NOTE(review): formats the stale `unit` variable again; looks like it
# should be `container`.
sick.extend([ret[1], 'container: {}'.format(unit)])
301
logging.debug('Check history: {}'.format(check_hist))
302
if not found_bootstrap_node:
303
sick.append('bootstrap node (not identified)')
305
logging.debug('Sick data: {}'.format(sick))
308
# set(sick) removes duplicate labels before they are joined for the report.
logging.error('One or more units are not ok:'
309
'\n {}'.format('\n '.join(list(set(sick)))))
312
logging.info(' OK: All units and machines have agents '
313
'running and they are reachable.')
317
'''Define and handle command line parameters
319
# Define command line options
320
parser = optparse.OptionParser(USAGE)
321
parser.add_option("-d", "--debug",
322
help="Enable debug output",
323
dest="debug", action="store_true", default=False)
325
parser.add_option("-q", "--quiet",
326
help="Less output (WARN and ERROR only)",
327
dest="quiet", action="store_true", default=False)
329
params = parser.parse_args()
330
(opts, args) = params
332
# Handle parameters, inform user
333
if opts.debug and not opts.quiet:
334
logging.basicConfig(level=logging.DEBUG)
335
logging.info('Logging level set to DEBUG!')
336
logging.debug('parse opts: \n{}'.format(
337
yaml.dump(vars(opts), default_flow_style=False)))
338
logging.debug('parse args: {}'.format(args))
340
logging.basicConfig(level=logging.WARN)
342
logging.basicConfig(level=logging.INFO)
344
logging.info('Ubuntu OSCI Juju Status Health Check')
345
juju_status_health_check()
348
if __name__ == '__main__':