1
# vim: tabstop=4 shiftwidth=4 softtabstop=4
3
# Copyright 2010 United States Government as represented by the
4
# Administrator of the National Aeronautics and Space Administration.
7
# Licensed under the Apache License, Version 2.0 (the "License"); you may
8
# not use this file except in compliance with the License. You may obtain
9
# a copy of the License at
11
# http://www.apache.org/licenses/LICENSE-2.0
13
# Unless required by applicable law or agreed to in writing, software
14
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
15
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
16
# License for the specific language governing permissions and limitations
22
Optionally may be run on each compute node. Provides RRD
23
based statistics and graphs and makes them internally available
34
from twisted.internet import task
35
from twisted.application import service
37
from nova import flags
38
from nova import log as logging
39
from nova import utils
40
from nova.virt import connection as virt_connection
44
# Seconds between runs of the monitoring loop; also used as the back-off
# sleep when the virt connection cannot be obtained.
flags.DEFINE_integer('monitoring_instances_delay', 5,
                     'Sleep time between updates')
# Seconds between RRD samples: handed to rrdtool as '--step' and used as the
# minimum age before an instance's statistics are refreshed.
flags.DEFINE_integer('monitoring_instances_step', 300,
                     'Interval of RRD updates')
# Base directory for RRD data; each instance uses a sub-directory named
# after its instance id.
flags.DEFINE_string('monitoring_rrd_path', '$state_path/monitor/instances',
                    'Location of RRD files')
54
'DS:cpu:GAUGE:600:0:100',
55
'RRA:AVERAGE:0.5:1:800',
56
'RRA:AVERAGE:0.5:6:800',
57
'RRA:AVERAGE:0.5:24:800',
58
'RRA:AVERAGE:0.5:288:800',
62
'RRA:MAX:0.5:288:800',
65
'DS:rx:COUNTER:600:0:1250000',
66
'DS:tx:COUNTER:600:0:1250000',
67
'RRA:AVERAGE:0.5:1:800',
68
'RRA:AVERAGE:0.5:6:800',
69
'RRA:AVERAGE:0.5:24:800',
70
'RRA:AVERAGE:0.5:288:800',
74
'RRA:MAX:0.5:288:800',
77
'DS:rd:COUNTER:600:U:U',
78
'DS:wr:COUNTER:600:U:U',
79
'RRA:AVERAGE:0.5:1:800',
80
'RRA:AVERAGE:0.5:6:800',
81
'RRA:AVERAGE:0.5:24:800',
82
'RRA:AVERAGE:0.5:288:800',
86
'RRA:MAX:0.5:444:800',
93
LOG = logging.getLogger('nova.compute.monitor')
96
def update_rrd(instance, name, data):
    """
    Updates the specified RRD file.

    instance -- object providing get_rrd_path(); it is also handed to
                init_rrd() when the RRD file does not exist yet.
    name     -- base name of the RRD file ('<name>.rrd').
    data     -- value string placed after the timestamp in the rrdtool
                update argument ('<timestamp>:<data>').
    """
    # Imported locally so this function does not rely on the module's
    # import block already providing calendar.
    import calendar

    filename = os.path.join(instance.get_rrd_path(), '%s.rrd' % name)

    # Create the RRD on demand so the update below has a file to write to.
    if not os.path.exists(filename):
        init_rrd(instance, name)

    # NOTE(review): utcnow() is presumably a naive UTC datetime -- confirm
    # against its definition. time.mktime() would interpret the tuple as
    # *local* time and skew every sample by the host's UTC offset (and be
    # ambiguous around DST changes); calendar.timegm() is the UTC
    # counterpart. On hosts west of UTC, existing RRD files may reject
    # updates until wall-clock time passes their last (skewed) sample.
    timestamp = calendar.timegm(utcnow().utctimetuple())
    rrdtool.update(filename, '%d:%s' % (timestamp, data))
109
def init_rrd(instance, name):
111
Initializes the specified RRD file.
113
path = os.path.join(FLAGS.monitoring_rrd_path, instance.instance_id)
115
if not os.path.exists(path):
118
filename = os.path.join(path, '%s.rrd' % name)
120
if not os.path.exists(filename):
123
'--step', '%d' % FLAGS.monitoring_instances_step,
128
def graph_cpu(instance, duration):
130
Creates a graph of cpu usage for the specified instance and duration.
132
path = instance.get_rrd_path()
133
filename = os.path.join(path, 'cpu-%s.png' % duration)
137
'--disable-rrdtool-tag',
138
'--imgformat', 'PNG',
141
'--start', 'now-%s' % duration,
142
'--vertical-label', '% cpu used',
145
'DEF:cpu=%s:cpu:AVERAGE' % os.path.join(path, 'cpu.rrd'),
146
'AREA:cpu#eacc00:% CPU',)
148
store_graph(instance.instance_id, filename)
151
def graph_net(instance, duration):
153
Creates a graph of network usage for the specified instance and duration.
155
path = instance.get_rrd_path()
156
filename = os.path.join(path, 'net-%s.png' % duration)
160
'--disable-rrdtool-tag',
161
'--imgformat', 'PNG',
164
'--start', 'now-%s' % duration,
165
'--vertical-label', 'bytes/s',
168
'--lower-limit', '1000',
170
'DEF:rx=%s:rx:AVERAGE' % os.path.join(path, 'net.rrd'),
171
'DEF:tx=%s:tx:AVERAGE' % os.path.join(path, 'net.rrd'),
172
'AREA:rx#00FF00:In traffic',
173
'LINE1:tx#0000FF:Out traffic',)
175
store_graph(instance.instance_id, filename)
178
def graph_disk(instance, duration):
180
Creates a graph of disk usage for the specified instance and duration.
182
path = instance.get_rrd_path()
183
filename = os.path.join(path, 'disk-%s.png' % duration)
187
'--disable-rrdtool-tag',
188
'--imgformat', 'PNG',
191
'--start', 'now-%s' % duration,
192
'--vertical-label', 'bytes/s',
195
'--lower-limit', '1000',
197
'DEF:rd=%s:rd:AVERAGE' % os.path.join(path, 'disk.rrd'),
198
'DEF:wr=%s:wr:AVERAGE' % os.path.join(path, 'disk.rrd'),
199
'AREA:rd#00FF00:Read',
200
'LINE1:wr#0000FF:Write',)
202
store_graph(instance.instance_id, filename)
205
def store_graph(instance_id, filename):
207
Transmits the specified graph file to internal object store on cloud
210
# TODO(devcamcar): Need to use an asynchronous method to make this
211
# connection. If boto has some separate method that generates
212
# the request it would like to make and another method to parse
213
# the response we can make our own client that does the actual
214
# request and hands it off to the response parser.
215
s3 = boto.s3.connection.S3Connection(
216
aws_access_key_id=FLAGS.aws_access_key_id,
217
aws_secret_access_key=FLAGS.aws_secret_access_key,
219
calling_format=boto.s3.connection.OrdinaryCallingFormat(),
222
bucket_name = '_%s.monitor' % instance_id
224
# Object store isn't creating the bucket like it should currently
225
# when it is first requested, so have to catch and create manually.
227
bucket = s3.get_bucket(bucket_name)
229
bucket = s3.create_bucket(bucket_name)
231
key = boto.s3.Key(bucket)
232
key.key = os.path.basename(filename)
233
key.set_contents_from_filename(filename)
236
class Instance(object):
237
def __init__(self, conn, instance_id):
239
self.instance_id = instance_id
240
self.last_updated = datetime.datetime.min
242
self.cputime_last_updated = None
244
init_rrd(self, 'cpu')
245
init_rrd(self, 'net')
246
init_rrd(self, 'disk')
248
def needs_update(self):
    """
    Indicates whether this instance is due to have its statistics updated.

    Returns True once at least FLAGS.monitoring_instances_step seconds
    have elapsed since self.last_updated.
    """
    delta = utcnow() - self.last_updated
    # timedelta.seconds is only the sub-day remainder (0..86399), so whole
    # days have to be added in; this is the same arithmetic
    # fetch_cpu_stats uses for its sample interval.
    elapsed = delta.days * 86400 + delta.seconds
    return elapsed >= FLAGS.monitoring_instances_step
257
Updates the instance's statistics and stores the resulting graphs
258
in the internal object store on the cloud controller.
260
LOG.debug(_('updating %s...'), self.instance_id)
263
data = self.fetch_cpu_stats()
265
LOG.debug('CPU: %s', data)
266
update_rrd(self, 'cpu', data)
268
data = self.fetch_net_stats()
269
LOG.debug('NET: %s', data)
270
update_rrd(self, 'net', data)
272
data = self.fetch_disk_stats()
273
LOG.debug('DISK: %s', data)
274
update_rrd(self, 'disk', data)
276
# TODO(devcamcar): Turn these into pool.ProcessPool.execute() calls
277
# and make the methods @defer.inlineCallbacks.
278
graph_cpu(self, '1d')
279
graph_cpu(self, '1w')
280
graph_cpu(self, '1m')
282
graph_net(self, '1d')
283
graph_net(self, '1w')
284
graph_net(self, '1m')
286
graph_disk(self, '1d')
287
graph_disk(self, '1w')
288
graph_disk(self, '1m')
290
LOG.exception(_('unexpected error during update'))
292
self.last_updated = utcnow()
294
def get_rrd_path(self):
    """
    Returns the directory holding this instance's RRD files: the
    configured monitoring_rrd_path joined with the instance id.
    """
    base_dir = FLAGS.monitoring_rrd_path
    return os.path.join(base_dir, self.instance_id)
300
def fetch_cpu_stats(self):
302
Returns cpu usage statistics for this instance.
304
info = self.conn.get_info(self.instance_id)
306
# Get the previous values.
307
cputime_last = self.cputime
308
cputime_last_updated = self.cputime_last_updated
310
# Get the raw CPU time used in nanoseconds.
311
self.cputime = float(info['cpu_time'])
312
self.cputime_last_updated = utcnow()
314
LOG.debug('CPU: %d', self.cputime)
316
# Skip calculation on first pass. Need delta to get a meaningful value.
317
if cputime_last_updated is None:
320
# Calculate the number of seconds between samples.
321
d = self.cputime_last_updated - cputime_last_updated
322
t = d.days * 86400 + d.seconds
324
LOG.debug('t = %d', t)
326
# Calculate change over time in number of nanoseconds of CPU time used.
327
cputime_delta = self.cputime - cputime_last
329
LOG.debug('cputime_delta = %s', cputime_delta)
331
# Get the number of virtual cpus in this domain.
332
vcpus = int(info['num_cpu'])
334
LOG.debug('vcpus = %d', vcpus)
336
# Calculate CPU % used and cap at 100.
337
return min(cputime_delta / (t * vcpus * 1.0e9) * 100, 100)
339
def fetch_disk_stats(self):
341
Returns disk usage statistics for this instance.
346
disks = self.conn.get_disks(self.instance_id)
348
# Aggregate the read and write totals.
351
rd_req, rd_bytes, wr_req, wr_bytes, errs = \
352
self.conn.block_stats(self.instance_id, disk)
356
iid = self.instance_id
357
LOG.error(_('Cannot get blockstats for "%(disk)s"'
358
' on "%(iid)s"') % locals())
361
return '%d:%d' % (rd, wr)
363
def fetch_net_stats(self):
365
Returns network usage statistics for this instance.
370
interfaces = self.conn.get_interfaces(self.instance_id)
372
# Aggregate the in and out totals.
373
for interface in interfaces:
375
stats = self.conn.interface_stats(self.instance_id, interface)
379
iid = self.instance_id
380
LOG.error(_('Cannot get ifstats for "%(interface)s"'
381
' on "%(iid)s"') % locals())
384
return '%d:%d' % (rx, tx)
387
class InstanceMonitor(object, service.Service):
389
Monitors the running instances of the current machine.
394
Initialize the monitoring loop.
397
self._loop = task.LoopingCall(self.updateInstances)
399
def startService(self):
401
self._loop.start(interval=FLAGS.monitoring_instances_delay)
402
service.Service.startService(self)
404
def stopService(self):
406
service.Service.stopService(self)
408
def updateInstances(self):
410
Update resource usage for all running instances.
413
conn = virt_connection.get_connection(read_only=True)
414
except Exception, exn:
415
LOG.exception(_('unexpected exception getting connection'))
416
time.sleep(FLAGS.monitoring_instances_delay)
419
domain_ids = conn.list_instances()
421
self.updateInstances_(conn, domain_ids)
422
except Exception, exn:
423
LOG.exception('updateInstances_')
425
def updateInstances_(self, conn, domain_ids):
426
for domain_id in domain_ids:
427
if not domain_id in self._instances:
428
instance = Instance(conn, domain_id)
429
self._instances[domain_id] = instance
430
LOG.debug(_('Found instance: %s'), domain_id)
432
for key in self._instances.keys():
433
instance = self._instances[key]
434
if instance.needs_update():