~0x44/nova/bug838466

Viewing changes to nova/compute/monitor.py

Committer: Tarmac
Author(s): Brian Waldon
Date: 2011-08-04 12:50:32 UTC
mfrom: (1343.2.12 update-HACKING)
Revision ID: tarmac-20110804125032-xox88sztymlp9dx9

Update HACKING:
- Make imports more explicit
- Add some dict/list formatting guidelines
- Add some long method signature/call guidelines
- Add explanation of i18n

files added:
nova/rpc

nova/rpc/__init__.py

nova/rpc/common.py

nova/tests/test_rpc_amqp.py

files removed:
bin/nova-instancemonitor

doc/source/api/nova..compute.monitor.rst

doc/source/api/nova..tests.test_twistd.rst

doc/source/api/nova..twistd.rst

nova/compute/monitor.py

nova/tests/test_twistd.py

nova/twistd.py

tools/eventlet-patch

files renamed:
nova/rpc.py => nova/rpc/amqp.py

nova/tests/api/openstack/extensions/test_flavors_extra_specs.py => nova/tests/api/openstack/test_flavors_extra_specs.py

files modified:
Authors

bin/nova-logspool

bin/nova-manage

bin/nova-objectstore

contrib/nova.sh

doc/source/api/autoindex.rst

doc/source/code.rst

doc/source/devref/architecture.rst

doc/source/devref/compute.rst

doc/source/devref/development.environment.rst

doc/source/devref/nova.rst

nova/api/direct.py

nova/api/ec2/__init__.py

nova/api/openstack/__init__.py

nova/api/openstack/auth.py

nova/api/openstack/contrib/floating_ips.py

nova/api/openstack/create_instance_helper.py

nova/api/openstack/images.py

nova/api/openstack/servers.py

nova/api/openstack/versions.py

nova/api/openstack/views/servers.py

nova/api/openstack/views/versions.py

nova/api/openstack/wsgi.py

nova/auth/manager.py

nova/cloudpipe/pipelib.py

nova/compute/api.py

nova/compute/instance_types.py

nova/compute/manager.py

nova/context.py

nova/db/sqlalchemy/api.py

nova/db/sqlalchemy/models.py

nova/exception.py

nova/image/glance.py

nova/image/s3.py

nova/log.py

nova/network/manager.py

nova/notifier/api.py

nova/scheduler/least_cost.py

nova/scheduler/zone_aware_scheduler.py

nova/service.py

nova/test.py

nova/tests/api/openstack/contrib/test_floating_ips.py

nova/tests/api/openstack/contrib/test_multinic_xs.py

nova/tests/api/openstack/fakes.py

nova/tests/api/openstack/test_accounts.py

nova/tests/api/openstack/test_adminapi.py

nova/tests/api/openstack/test_auth.py

nova/tests/api/openstack/test_extensions.py

nova/tests/api/openstack/test_flavors.py

nova/tests/api/openstack/test_image_metadata.py

nova/tests/api/openstack/test_images.py

nova/tests/api/openstack/test_limits.py

nova/tests/api/openstack/test_server_metadata.py

nova/tests/api/openstack/test_servers.py

nova/tests/api/openstack/test_shared_ip_groups.py

nova/tests/api/openstack/test_users.py

nova/tests/api/openstack/test_versions.py

nova/tests/api/openstack/test_zones.py

nova/tests/glance/stubs.py

nova/tests/hyperv_unittest.py

nova/tests/image/test_glance.py

nova/tests/scheduler/test_scheduler.py

nova/tests/scheduler/test_zone_aware_scheduler.py

nova/tests/test_access.py

nova/tests/test_adminapi.py

nova/tests/test_api.py

nova/tests/test_auth.py

nova/tests/test_cloud.py

nova/tests/test_compute.py

nova/tests/test_console.py

nova/tests/test_db_api.py

nova/tests/test_libvirt.py

nova/tests/test_objectstore.py

nova/tests/test_quota.py

nova/tests/test_rpc.py

nova/tests/test_service.py

nova/tests/test_test.py

nova/tests/test_utils.py

nova/tests/test_vmwareapi.py

nova/tests/test_xenapi.py

nova/tests/xenapi/stubs.py

nova/utils.py

nova/virt/driver.py

nova/virt/fake.py

nova/virt/hyperv.py

nova/virt/images.py

nova/virt/libvirt/connection.py

nova/virt/vmwareapi/vmops.py

nova/virt/vmwareapi_conn.py

nova/virt/xenapi/vm_utils.py

nova/virt/xenapi/vmops.py

nova/virt/xenapi_conn.py

nova/wsgi.py

plugins/xenserver/xenapi/etc/xapi.d/plugins/glance

po/ast.po

po/cs.po

po/da.po

po/de.po

po/en_AU.po

po/en_GB.po

po/es.po

po/fr.po

po/it.po

po/ja.po

po/nova.pot

po/pt_BR.po

po/ru.po

po/tl.po

po/uk.po

po/zh_CN.po

po/zh_TW.po

setup.py

tools/install_venv.py

tools/pip-requires

Show diffs side-by-side

added added

removed removed

nova/compute/monitor.py

# vim: tabstop=4 shiftwidth=4 softtabstop=4

# Administrator of the National Aeronautics and Space Administration.

# Licensed under the Apache License, Version 2.0 (the "License"); you may

# not use this file except in compliance with the License. You may obtain

# a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the

# License for the specific language governing permissions and limitations

# under the License.

"""

Instance Monitoring:

Optionally may be run on each compute node. Provides RRD

based statistics and graphs and makes them internally available

in the object store.

"""

import datetime

import os

import time

import boto

import boto.s3

import rrdtool

from twisted.internet import task

from twisted.application import service

from nova import flags

from nova import log as logging

from nova import utils

from nova.virt import connection as virt_connection

# Handle on the global flag registry; the monitoring settings registered
# below are read back through it as FLAGS.<name> by the rest of this module.
FLAGS = flags.FLAGS

# Used as the LoopingCall interval of InstanceMonitor and as the back-off
# sleep when the hypervisor connection cannot be obtained.
flags.DEFINE_integer('monitoring_instances_delay', 5,
                     'Sleep time between updates')
# Passed to rrdtool as '--step' when an RRD file is created, and used by
# Instance.needs_update() as the minimum gap between two updates.
flags.DEFINE_integer('monitoring_instances_step', 300,
                     'Interval of RRD updates')
# Base directory; each instance gets a sub-directory named after its id.
# NOTE(review): '$state_path' is presumably expanded by the flags machinery
# -- confirm.
flags.DEFINE_string('monitoring_rrd_path', '$state_path/monitor/instances',
                    'Location of RRD files')

# Arguments handed to rrdtool.create() for each kind of RRD file, keyed by
# the RRD name ('cpu', 'net', 'disk').  Every list starts with the data
# source (DS) definitions and is followed by the round-robin archive (RRA)
# definitions for the AVERAGE and MAX consolidation functions.
RRD_VALUES = {
    'cpu': [
        'DS:cpu:GAUGE:600:0:100',
        'RRA:AVERAGE:0.5:1:800',
        'RRA:AVERAGE:0.5:6:800',
        'RRA:AVERAGE:0.5:24:800',
        'RRA:AVERAGE:0.5:288:800',
        'RRA:MAX:0.5:1:800',
        'RRA:MAX:0.5:6:800',
        'RRA:MAX:0.5:24:800',
        'RRA:MAX:0.5:288:800',
        ],
    'net': [
        'DS:rx:COUNTER:600:0:1250000',
        'DS:tx:COUNTER:600:0:1250000',
        'RRA:AVERAGE:0.5:1:800',
        'RRA:AVERAGE:0.5:6:800',
        'RRA:AVERAGE:0.5:24:800',
        'RRA:AVERAGE:0.5:288:800',
        'RRA:MAX:0.5:1:800',
        'RRA:MAX:0.5:6:800',
        'RRA:MAX:0.5:24:800',
        'RRA:MAX:0.5:288:800',
        ],
    'disk': [
        'DS:rd:COUNTER:600:U:U',
        'DS:wr:COUNTER:600:U:U',
        'RRA:AVERAGE:0.5:1:800',
        'RRA:AVERAGE:0.5:6:800',
        'RRA:AVERAGE:0.5:24:800',
        'RRA:AVERAGE:0.5:288:800',
        'RRA:MAX:0.5:1:800',
        'RRA:MAX:0.5:6:800',
        'RRA:MAX:0.5:24:800',
        # NOTE(review): every other coarse archive consolidates 288 steps;
        # this one uses 444.  Kept as found -- confirm whether intentional.
        'RRA:MAX:0.5:444:800',
        ]}

# Module-level alias: the rest of this file obtains the current time by
# calling utcnow(), which is utils.utcnow.
utcnow = utils.utcnow

# Logger used by every function and class in this module.
LOG = logging.getLogger('nova.compute.monitor')

def update_rrd(instance, name, data):
    """
    Updates the specified RRD file.

    instance -- object providing get_rrd_path() (and whatever init_rrd
                needs) for the instance being recorded.
    name     -- RRD name; the file updated is '<rrd path>/<name>.rrd'.
    data     -- sample value(s), formatted with '%s' after the timestamp
                (colon-separated when the RRD has several data sources).

    The file is created through init_rrd() first when it does not exist.
    """
    # Imported locally so this function does not depend on the module's
    # import block being changed.
    import calendar

    filename = os.path.join(instance.get_rrd_path(), '%s.rrd' % name)

    if not os.path.exists(filename):
        init_rrd(instance, name)

    # utcnow() is a UTC wall-clock time, so it must be converted with
    # calendar.timegm(); time.mktime() would interpret the tuple as local
    # time and skew every sample by the host's UTC offset.
    # NOTE(review): on hosts west of UTC, existing RRD files carry
    # future-dated samples and will reject updates until real time passes
    # them.
    timestamp = calendar.timegm(utcnow().utctimetuple())
    rrdtool.update(filename, '%d:%s' % (timestamp, data))

107

108

109

def init_rrd(instance, name):
    """
    Initializes the specified RRD file.

    Ensures the per-instance directory under FLAGS.monitoring_rrd_path
    exists and creates '<name>.rrd' in it from RRD_VALUES[name], unless
    that file is already present.
    """
    rrd_dir = os.path.join(FLAGS.monitoring_rrd_path, instance.instance_id)
    if not os.path.exists(rrd_dir):
        os.makedirs(rrd_dir)

    rrd_file = os.path.join(rrd_dir, '%s.rrd' % name)
    if os.path.exists(rrd_file):
        # Never overwrite an existing database.
        return

    create_args = [
        rrd_file,
        '--step', '%d' % FLAGS.monitoring_instances_step,
        '--start', '0',
    ]
    create_args.extend(RRD_VALUES[name])
    rrdtool.create(*create_args)

126

127

128

def graph_cpu(instance, duration):
    """
    Creates a graph of cpu usage for the specified instance and duration.

    The PNG is written next to the instance's RRD files as
    'cpu-<duration>.png' and then handed to store_graph().
    """
    rrd_dir = instance.get_rrd_path()
    png_file = os.path.join(rrd_dir, 'cpu-%s.png' % duration)
    cpu_rrd = os.path.join(rrd_dir, 'cpu.rrd')

    graph_args = [
        png_file,
        '--disable-rrdtool-tag',
        '--imgformat', 'PNG',
        '--width', '400',
        '--height', '120',
        '--start', 'now-%s' % duration,
        '--vertical-label', '% cpu used',
        '-l', '0',
        '-u', '100',
        'DEF:cpu=%s:cpu:AVERAGE' % cpu_rrd,
        'AREA:cpu#eacc00:% CPU',
    ]
    rrdtool.graph(*graph_args)

    store_graph(instance.instance_id, png_file)

149

150

151

def graph_net(instance, duration):
    """
    Creates a graph of network usage for the specified instance and duration.

    The PNG is written next to the instance's RRD files as
    'net-<duration>.png' and then handed to store_graph().
    """
    rrd_dir = instance.get_rrd_path()
    png_file = os.path.join(rrd_dir, 'net-%s.png' % duration)
    net_rrd = os.path.join(rrd_dir, 'net.rrd')

    graph_args = [
        png_file,
        '--disable-rrdtool-tag',
        '--imgformat', 'PNG',
        '--width', '400',
        '--height', '120',
        '--start', 'now-%s' % duration,
        '--vertical-label', 'bytes/s',
        '--logarithmic',
        '--units', 'si',
        '--lower-limit', '1000',
        '--rigid',
        'DEF:rx=%s:rx:AVERAGE' % net_rrd,
        'DEF:tx=%s:tx:AVERAGE' % net_rrd,
        'AREA:rx#00FF00:In traffic',
        'LINE1:tx#0000FF:Out traffic',
    ]
    rrdtool.graph(*graph_args)

    store_graph(instance.instance_id, png_file)

176

177

178

def graph_disk(instance, duration):
    """
    Creates a graph of disk usage for the specified duration.

    The PNG is written next to the instance's RRD files as
    'disk-<duration>.png' and then handed to store_graph().
    """
    rrd_dir = instance.get_rrd_path()
    png_file = os.path.join(rrd_dir, 'disk-%s.png' % duration)
    disk_rrd = os.path.join(rrd_dir, 'disk.rrd')

    graph_args = [
        png_file,
        '--disable-rrdtool-tag',
        '--imgformat', 'PNG',
        '--width', '400',
        '--height', '120',
        '--start', 'now-%s' % duration,
        '--vertical-label', 'bytes/s',
        '--logarithmic',
        '--units', 'si',
        '--lower-limit', '1000',
        '--rigid',
        'DEF:rd=%s:rd:AVERAGE' % disk_rrd,
        'DEF:wr=%s:wr:AVERAGE' % disk_rrd,
        'AREA:rd#00FF00:Read',
        'LINE1:wr#0000FF:Write',
    ]
    rrdtool.graph(*graph_args)

    store_graph(instance.instance_id, png_file)

203

204

205

def store_graph(instance_id, filename):
    """
    Transmits the specified graph file to internal object store on cloud
    controller.

    The file is stored under its base name in the bucket
    '_<instance_id>.monitor'.
    """
    # TODO(devcamcar): Need to use an asynchronous method to make this
    #       connection. If boto has some separate method that generates
    #       the request it would like to make and another method to parse
    #       the response we can make our own client that does the actual
    #       request and hands it off to the response parser.
    connection = boto.s3.connection.S3Connection(
        aws_access_key_id=FLAGS.aws_access_key_id,
        aws_secret_access_key=FLAGS.aws_secret_access_key,
        is_secure=False,
        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
        port=FLAGS.s3_port,
        host=FLAGS.s3_host)
    monitor_bucket = '_%s.monitor' % instance_id

    # Object store isn't creating the bucket like it should currently
    # when it is first requested, so have to catch and create manually.
    try:
        target = connection.get_bucket(monitor_bucket)
    except Exception:
        target = connection.create_bucket(monitor_bucket)

    graph_key = boto.s3.Key(target)
    graph_key.key = os.path.basename(filename)
    graph_key.set_contents_from_filename(filename)

234

235

236

class Instance(object):
    """
    Collects and records resource usage for a single instance.

    Keeps the hypervisor connection, the instance id and the state needed
    to turn cumulative CPU time into a utilisation percentage, and drives
    the RRD updates and graph generation for that instance.
    """

    def __init__(self, conn, instance_id):
        """
        Stores the connection and instance id and makes sure the cpu, net
        and disk RRD files exist.
        """
        self.conn = conn
        self.instance_id = instance_id
        # datetime.min makes the very first needs_update() check succeed.
        self.last_updated = datetime.datetime.min
        self.cputime = 0
        # None marks "no CPU sample taken yet" for fetch_cpu_stats().
        self.cputime_last_updated = None

        init_rrd(self, 'cpu')
        init_rrd(self, 'net')
        init_rrd(self, 'disk')

    @staticmethod
    def _elapsed_seconds(delta):
        """
        Returns the whole number of seconds spanned by a timedelta,
        including its days component (microseconds are ignored).
        """
        return delta.days * 86400 + delta.seconds

    @staticmethod
    def _cpu_percent(cputime_delta, elapsed, vcpus):
        """
        Converts CPU time consumed (nanoseconds) over `elapsed` seconds on
        `vcpus` virtual cpus into a percentage capped at 100.

        Returns None when no meaningful value can be computed (no time has
        elapsed or the instance reports no virtual cpus).
        """
        if elapsed <= 0 or vcpus <= 0:
            return None
        return min(cputime_delta / (elapsed * vcpus * 1.0e9) * 100, 100)

    def needs_update(self):
        """
        Indicates whether this instance is due to have its statistics updated.
        """
        delta = utcnow() - self.last_updated
        # timedelta.seconds alone drops the days component, so the full
        # span has to be compared against the step.
        elapsed = self._elapsed_seconds(delta)
        return elapsed >= FLAGS.monitoring_instances_step

    def update(self):
        """
        Updates the instances statistics and stores the resulting graphs
        in the internal object store on the cloud controller.

        Any exception is logged and swallowed; last_updated is refreshed
        either way, so a failing instance is retried one step later.
        """
        LOG.debug(_('updating %s...'), self.instance_id)

        try:
            data = self.fetch_cpu_stats()
            # None means no CPU percentage is available for this pass.
            if data is not None:
                LOG.debug('CPU: %s', data)
                update_rrd(self, 'cpu', data)

            data = self.fetch_net_stats()
            LOG.debug('NET: %s', data)
            update_rrd(self, 'net', data)

            data = self.fetch_disk_stats()
            LOG.debug('DISK: %s', data)
            update_rrd(self, 'disk', data)

            # TODO(devcamcar): Turn these into pool.ProcessPool.execute() calls
            # and make the methods @defer.inlineCallbacks.
            graph_cpu(self, '1d')
            graph_cpu(self, '1w')
            graph_cpu(self, '1m')

            graph_net(self, '1d')
            graph_net(self, '1w')
            graph_net(self, '1m')

            graph_disk(self, '1d')
            graph_disk(self, '1w')
            graph_disk(self, '1m')
        except Exception:
            LOG.exception(_('unexpected error during update'))

        self.last_updated = utcnow()

    def get_rrd_path(self):
        """
        Returns the path to where RRD files are stored.
        """
        return os.path.join(FLAGS.monitoring_rrd_path, self.instance_id)

    def fetch_cpu_stats(self):
        """
        Returns cpu usage statistics for this instance.

        The result is the percentage of CPU used since the previous call,
        capped at 100, or None when it cannot be computed: on the first
        call, when no whole second has elapsed since the previous sample,
        or when the instance reports no virtual cpus.
        """
        info = self.conn.get_info(self.instance_id)

        # Get the previous values.
        cputime_last = self.cputime
        cputime_last_updated = self.cputime_last_updated

        # Get the raw CPU time used in nanoseconds.
        self.cputime = float(info['cpu_time'])
        self.cputime_last_updated = utcnow()

        LOG.debug('CPU: %d', self.cputime)

        # Skip calculation on first pass. Need delta to get a meaningful value.
        if cputime_last_updated is None:
            return None

        # Calculate the number of seconds between samples.
        d = self.cputime_last_updated - cputime_last_updated
        t = self._elapsed_seconds(d)

        LOG.debug('t = %d', t)

        # Calculate change over time in number of nanoseconds of CPU time used.
        cputime_delta = self.cputime - cputime_last

        LOG.debug('cputime_delta = %s', cputime_delta)

        # Get the number of virtual cpus in this domain.
        vcpus = int(info['num_cpu'])

        LOG.debug('vcpus = %d', vcpus)

        # Calculate CPU % used and cap at 100.  Returns None rather than
        # dividing by zero, which would abort the rest of update().
        return self._cpu_percent(cputime_delta, t, vcpus)

    def fetch_disk_stats(self):
        """
        Returns disk usage statistics for this instance.

        The result is the string '<bytes read>:<bytes written>' summed
        over all disks reported by the connection.
        """
        rd = 0
        wr = 0

        disks = self.conn.get_disks(self.instance_id)

        # Aggregate the read and write totals.
        for disk in disks:
            try:
                rd_req, rd_bytes, wr_req, wr_bytes, errs = \
                    self.conn.block_stats(self.instance_id, disk)
                rd += rd_bytes
                wr += wr_bytes
            except TypeError:
                LOG.error(_('Cannot get blockstats for "%(disk)s"'
                            ' on "%(iid)s"') %
                          {'disk': disk, 'iid': self.instance_id})
                raise

        return '%d:%d' % (rd, wr)

    def fetch_net_stats(self):
        """
        Returns network usage statistics for this instance.

        The result is the string '<rx>:<tx>' summed over all interfaces
        reported by the connection.
        """
        rx = 0
        tx = 0

        interfaces = self.conn.get_interfaces(self.instance_id)

        # Aggregate the in and out totals.
        for interface in interfaces:
            try:
                stats = self.conn.interface_stats(self.instance_id, interface)
                # NOTE(review): indexes 0 and 4 are presumably the received
                # and transmitted byte counters -- confirm against the virt
                # driver's interface_stats().
                rx += stats[0]
                tx += stats[4]
            except TypeError:
                LOG.error(_('Cannot get ifstats for "%(interface)s"'
                            ' on "%(iid)s"') %
                          {'interface': interface, 'iid': self.instance_id})
                raise

        return '%d:%d' % (rx, tx)

385

386

387

# NOTE(review): 'object' is presumably listed first to force a new-style
# class on top of twisted's Service -- confirm before reordering the bases.
class InstanceMonitor(object, service.Service):
    """
    Monitors the running instances of the current machine.

    A LoopingCall invokes updateInstances() every
    FLAGS.monitoring_instances_delay while the service is running.
    """

    def __init__(self):
        """
        Initialize the monitoring loop.
        """
        # Maps instance id -> Instance for every instance being tracked.
        self._instances = {}
        self._loop = task.LoopingCall(self.updateInstances)

    def startService(self):
        """
        Forgets previously tracked instances and starts the polling loop.
        """
        self._instances = {}
        self._loop.start(interval=FLAGS.monitoring_instances_delay)
        service.Service.startService(self)

    def stopService(self):
        """
        Stops the polling loop.
        """
        self._loop.stop()
        service.Service.stopService(self)

    def updateInstances(self):
        """
        Update resource usage for all running instances.

        Never raises: an exception escaping from here would stop the
        LoopingCall and silently end all monitoring.
        """
        try:
            conn = virt_connection.get_connection(read_only=True)
        except Exception:
            LOG.exception(_('unexpected exception getting connection'))
            # NOTE(review): time.sleep() blocks the calling thread; if this
            # runs on the reactor thread it stalls it -- confirm intended.
            time.sleep(FLAGS.monitoring_instances_delay)
            return

        try:
            # Listing the instances is inside the guard as well, so a
            # hypervisor error cannot escape into the LoopingCall.
            domain_ids = conn.list_instances()
            self.updateInstances_(conn, domain_ids)
        except Exception:
            LOG.exception('updateInstances_')

    def updateInstances_(self, conn, domain_ids):
        """
        Starts tracking newly seen instances, stops tracking the ones that
        are no longer listed, and updates every instance that is due.
        """
        domain_ids = list(domain_ids)

        for domain_id in domain_ids:
            if domain_id not in self._instances:
                instance = Instance(conn, domain_id)
                self._instances[domain_id] = instance
                LOG.debug(_('Found instance: %s'), domain_id)

        # Drop instances the hypervisor no longer reports; otherwise they
        # accumulate forever and fail on every update attempt.
        listed = set(domain_ids)
        for stale_id in [key for key in self._instances
                         if key not in listed]:
            del self._instances[stale_id]

        # Iterate over a snapshot so the dict may change during updates.
        for instance in list(self._instances.values()):
            if instance.needs_update():
                instance.update()

Older »