# Source: ~registry/uec-testing-scripts/trunk : revision 56

1

#!/usr/bin/python

2

3

import datetime

4

import logging, logging.handlers

5

import os, os.path

6

import random

7

import re

8

import string

9

import sys

10

import yaml

11

import time

12

13

from twisted.internet import reactor, task

14

from twisted.python import usage

15

from zipfile import ZipFile, is_zipfile

16

from shutil import rmtree

17

from collections import defaultdict, namedtuple

18

19

import uec_utils

20

21

class Options(usage.Options):
    """Command-line options of the test script.

    Flags (boolean switches):
        --cleanup            only run the clean-up step
        --write_log / -w     log to a file instead of the console
        --max_instances / -m run the maximum number of instances

    Parameters (take a value):
        --log / -l           log level name, defaults to "info"
        --start_stop / -s    duration, in minutes, of a start/stop run
    """

    # Each entry is [long name, short name, help text].
    optFlags = [
        ["cleanup", None, "Clean up"],
        ["write_log", 'w', "Write log to file"],
        ["max_instances", 'm', "Run the maximum number of instances"],
    ]

    # Each entry is [long name, short name, default value, help text].
    optParameters = [
        ["log", "l", "info",
         "log level wanted (debug, info, warning, error, critical)"],
        ["start_stop", "s", None,
         "start and stop instances over a certain period of time (x minutes)"],
    ]

33

34

# Names of the command-line tools the script shells out to.
# Zone and image queries.
euca_describe_availability_zones = "euca-describe-availability-zones"
euca_describe_images = "euca-describe-images"

# Instance life cycle.
euca_run_instances = "euca-run-instances"
euca_describe_instances = "euca-describe-instances"
euca_terminate_instances = "euca-terminate-instances"
euca_get_console_output = "euca-get-console-output"

# Keypairs and security groups.
euca_add_keypair = "euca-add-keypair"
euca_delete_keypair = "euca-delete-keypair"
euca_add_group = "euca-add-group"
euca_delete_group = "euca-delete-group"
euca_authorize = "euca-authorize"

# Credentials download (note the underscore in this tool's name).
euca_conf = "euca_conf"

# Per-VM-type figures parsed from the availability-zone listing.
zone = namedtuple("zone", ["max", "cpu", "ram", "disk"])

# Module-wide state; main() rebinds all three.
admin_user = None
opts = None
overtime = False

51

52

class Instance(object):
    """One cloud instance that the run starts, polls, tests over SSH and
    terminates.

    The class-level counters and zone fields are shared by every instance;
    they are filled in by the module-level resource-checking functions.
    """

    # Every value the test_state property accepts.
    TEST_STATES = ['not-tested', 'being-tested', 'success', 'failed',
                   'rescheduled', 'lost']
    TEST_MAX_RETRIES = 12
    # Usage counters, keyed by EMI id / VM type / group name / keypair name.
    EMI_USED = defaultdict(int)
    VMTYPE_USED = defaultdict(int)
    GROUP_USED = defaultdict(int)
    KEYPAIR_USED = defaultdict(int)
    MAX_INSTANCES = 0
    ZONE_CPU = []
    ZONE_CPU_DICT = dict()
    CLUSTERS = defaultdict(int)

    # Set once terminate() has asked for the console output, so that the
    # consoleOutput() -> terminate() round trip cannot repeat forever.
    _console_dumped = False

    def __init__(self, emi, vmtype, user, ssh_key, group):
        """Record the launch parameters; nothing is started yet.

        `user` must provide getProcessOutput() and private_keys (see User).
        """
        self.emi = emi
        self.vmtype = vmtype
        self.user = user
        self.ssh_key = ssh_key
        self.group = group
        # self.id must exist before test_state is assigned: the setter logs
        # through self.logger, which reads self.id.
        self.id = None
        self.pub_ip = None
        self.test_state = "not-tested"
        self._test_retries = 0
        self._state = None
        self.req_shutdown = False
        self.zone = None
        self.cluster = None
        self._console_dumped = False

    def __repr__(self):
        return "<Instance: %s - %s - %s - %s - %s - %s - %s>" % (
            self.id, self.emi, self.vmtype, self.user, self.ssh_key,
            self.group, self.state)

    @property
    def logger(self):
        """Logger named after the instance id, or the root logger before
        an id is known."""
        if self.id:
            return logging.getLogger("INSTANCE %s" % (self.id))
        return logging.getLogger()

    @property
    def private_key(self):
        """Path of the private key file matching this instance's keypair."""
        return self.user.private_keys[self.ssh_key]

    @property
    def state(self):
        """Last instance state reported for this instance (None at first)."""
        return self._state

    @state.setter
    def state(self, value):
        self.logger.debug("State: %s", value)
        if self._state == value:
            return
        # The state has changed
        self.logger.debug("New state: %s => %s", self._state, value)
        self._state = value
        # Only trigger a test when the state has been changed to running
        if self._state == "running":
            self.logger.debug("Scheduling test")
            reactor.callLater(random.randint(5, 10) + random.random(),
                              self.test)
        # The instance went away without us asking for it.
        if self._state in ["terminated", "shutting-down"] \
                and not self.req_shutdown:
            self.test_state = 'lost'

    @property
    def test_state(self):
        """Current test outcome; always one of TEST_STATES."""
        return self._test_state

    @test_state.setter
    def test_state(self, value):
        """Raises ValueError when `value` is not one of TEST_STATES."""
        self.logger.debug("Test state: %s", value)
        if value not in self.TEST_STATES:
            raise ValueError("Unknown test state: %s" % (value))
        self._test_state = value

    def getProcessOutput(self, executable, args=(), timeout=60):
        """Run `executable` through the owning user and return whatever the
        user's getProcessOutput() returns (callbacks are added to it)."""
        self.logger.debug("Executing: %s - %s", executable, args)
        return self.user.getProcessOutput(executable, args, timeout)

    def start(self):
        """Ask the cloud to start the instance and arm a 45 minute
        termination timer."""
        self.logger.info("Starting instance")
        launch = self.getProcessOutput(euca_run_instances,
                                       args=[self.emi,
                                             '-k', self.ssh_key,
                                             '-g', self.group,
                                             '-t', self.vmtype])
        launch.addCallbacks(self.started, self.errStarted)
        # Terminate the instance no matter what
        reactor.callLater(45 * 60, self.terminate)

    def started(self, output):
        """Parse the run-instances output for the instance id and cluster."""
        self.logger.debug("Instance start output: %s", output)
        started = False
        for line in output.split("\n"):
            if not line.startswith('INSTANCE'):
                continue
            fields = line.split()
            if len(fields) < 2:
                # No id on the line: nothing usable.
                continue
            self.id = fields[1]
            # Field 10 is stored as the cluster; tolerate shorter lines
            # instead of raising from inside the callback.
            self.cluster = fields[10] if len(fields) > 10 else None
            self.logger.info("Started (%s/%s)", self.emi, self.vmtype)
            started = True
        if not started:
            self.errStarted(output)

    def errStarted(self, output):
        """Log a failed start; `output` is whatever the command produced."""
        self.logger.warning("Instance failed to start: %s", output)

    def test(self):
        """SSH into the running instance and echo a marker string."""
        if self.state != "running":
            self.logger.debug("Not running - aborting scheduled test.")
            return
        self.logger.info("Testing instance")
        self.test_state = "being-tested"
        self._test_retries += 1
        args = ['-o', 'UserKnownHostsFile=/dev/null',
                '-o', 'StrictHostKeyChecking=no',
                '-o', 'ConnectTimeout=5',
                '-o', 'Batchmode=yes',
                '-i', self.private_key,
                "ubuntu@%s" % (self.pub_ip),
                'echo TEST_SUCCESS']
        self.logger.debug("Ssh args: %s", args)
        result = uec_utils.getProcessOutput('ssh', args, errortoo=True)
        result.addCallback(self.tested)

    def tested(self, output):
        """Evaluate the SSH output: success, retry in 30 s, or give up."""
        if 'TEST_SUCCESS' in output:
            self.logger.info("Test successful [%s]", self._test_retries)
            self.test_state = "success"
            self.logger.debug("Test output: %s", output)
            # `overtime` is the module-level start/stop setting.
            if overtime:
                self.terminate()
        elif self._test_retries <= self.TEST_MAX_RETRIES:
            self.logger.info("Rescheduling test (%s/%s)",
                             self._test_retries, self.TEST_MAX_RETRIES)
            self.test_state = "rescheduled"
            self.logger.debug("Test output: %s", output)
            reactor.callLater(30, self.test)
        else:
            self.logger.warning("Test output: %s", output)
            self.logger.info("Test failed")
            self.test_state = "failed"
            console = self.getProcessOutput(euca_get_console_output,
                                            args=[self.id], timeout=300)
            console.addCallback(self.consoleOutput)

    def consoleOutput(self, output):
        """Log the console output line by line, then terminate."""
        # caution -- only used when a test failed
        self.logger.warning("Console output: START")
        for aLine in re.split("\n", output):
            self.logger.warning(aLine)
        self.logger.warning("Console output: END")
        self.terminate()

    def terminate(self):
        """Terminate the instance unless it is already terminated.

        An instance that never received an id is only marked terminated
        locally. An instance whose test has not reached success/failed gets
        its console output logged once before the terminate command is sent.
        """
        if self.state == "terminated":
            return
        self.logger.info("Terminating")
        if self.id is None:
            self.state = "terminated"
            self.logger.info(
                "instance has already terminated, last test state=%s"
                % self.test_state)
            return
        if self.test_state not in ("success", "failed") \
                and not self._console_dumped:
            # consoleOutput() calls terminate() again; the flag makes that
            # second call fall through to the real termination below.
            self._console_dumped = True
            console = self.getProcessOutput(euca_get_console_output,
                                            args=[self.id], timeout=300)
            console.addCallback(self.consoleOutput)
            return
        self.req_shutdown = True
        stop = self.getProcessOutput(euca_terminate_instances,
                                     args=[self.id])
        stop.addCallback(self.terminated)

    def terminated(self, output):
        """Record the termination; an unfinished test counts as failed."""
        self.logger.info("Terminated")
        if self.test_state not in ("success", "failed"):
            self.logger.info("terminated state found while test_state = %s",
                             self.test_state)
            self.test_state = "failed"
        self.state = "terminated"
        self.logger.debug("terminate output: %s", output)

231

232

class User(object):
    """The 'admin' cloud account: its credentials, keypairs and groups."""

    def __init__(self):
        self.user_id = 'admin'
        self.logger = logging.getLogger("USER %s" % (self.user_id))
        # Names of the keypairs / groups confirmed by the cloud so far.
        self.keypairs = []
        self.groups = []
        # Working directory for the credentials; note the trailing slash.
        self.cred_dir = '%s/cred/%s/' % (os.getcwd(), self.user_id)
        self.cred_zip = 'cred.zip'
        self.nb_groups = 2
        self.nb_keypairs = 2

    def __repr__(self):
        return "<User: %s>" % (self.user_id)

    def setup(self):
        """Recreate the credentials directory and download the credentials."""
        rmtree(self.cred_dir, ignore_errors=True)
        os.makedirs(self.cred_dir)
        download = self.getProcessOutput(euca_conf,
                                         args=['--get-credentials',
                                               self.cred_dir + self.cred_zip,
                                               r'2>&1'],
                                         eucarc=False)
        download.addCallback(self.credentialsAdded)

    def credentialsAdded(self, output):
        """Unpack the credentials, then create the keypairs and groups.

        Stops the reactor, and does nothing else, when the downloaded file
        is not a zip archive.
        """
        self.logger.debug("Credentials added output: %s", output)
        archive = self.cred_dir + self.cred_zip
        if not is_zipfile(archive):
            self.logger.error("Credential file is not a valid zip")
            reactor.stop()
            # Without this return the extraction below would raise on the
            # file that was just rejected.
            return
        bundle = ZipFile(archive)
        try:
            bundle.extractall(self.cred_dir)
        finally:
            bundle.close()
        self.logger.info("Credentials downloaded")
        for k in self.keypair_names:
            self.logger.debug("Adding keypair %s", k)
            added = self.getProcessOutput(euca_add_keypair, args=[k])
            added.addCallback(self.keypairAdded)
        for g in self.group_names:
            self.logger.debug("Adding group %s", g)
            added = self.getProcessOutput(euca_add_group,
                                          args=['-d', 'UEC-test', g])
            added.addCallback(self.groupAdded)

    def keypairAdded(self, output):
        """Register the new keypair and save its private key to disk.

        The first line starting with KEYPAIR names the key; every following
        line is written to the private key file.
        """
        self.logger.debug("Keypair added output: %s", output)
        key = None
        privkey_fh = None
        try:
            for line in output.split("\n"):
                if not key and line.startswith("KEYPAIR"):
                    key = line.split()[1]
                    self.logger.info("Keypair (%s) added", key)
                    # Reading the defaultdict creates the zero usage counter.
                    Instance.KEYPAIR_USED[key]
                    # private_keys is derived from self.keypairs, so the key
                    # must be appended before its path is looked up.
                    self.keypairs.append(key)
                    path = self.private_keys[key]
                    # Create the file owner-only from the start rather than
                    # world-readable until the chmod.
                    fd = os.open(path,
                                 os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
                                 0o600)
                    privkey_fh = os.fdopen(fd, 'w')
                    # Also covers a pre-existing file with wider permissions.
                    os.chmod(path, 0o600)
                    continue
                if privkey_fh:
                    privkey_fh.write("%s\n" % (line))
        finally:
            if privkey_fh:
                privkey_fh.close()

    def groupAdded(self, output):
        """Register each new group and authorize TCP ports 22 and 80 on it."""
        self.logger.debug("Group added output: %s", output)
        for line in output.split("\n"):
            if not line.startswith("GROUP"):
                continue
            group = line.split()[1]
            self.logger.info("Group (%s) added", group)
            # Reading the defaultdict creates the zero usage counter.
            Instance.GROUP_USED[group]
            self.logger.debug("Authorizing group %s", group)
            for port in ['22', '80']:
                # FIXME
                # NOTE(review): this sleep blocks the whole reactor for
                # 3 seconds per port; kept because the required spacing
                # between authorize calls is not visible from here.
                time.sleep(3)  # Dummy delay see LP #592641
                authorized = self.getProcessOutput(euca_authorize,
                                                   args=[group,
                                                         '-P', 'tcp',
                                                         '-p', port,
                                                         '-s', '0.0.0.0/0'])
                authorized.addCallback(self.groupAuthorized)

    def groupAuthorized(self, output):
        """Record each authorized group once in self.groups."""
        self.logger.debug("Group authorize output: %s", output)
        for line in output.split("\n"):
            if line.startswith("GROUP"):
                group = line.split()[1]
                self.logger.info("Group %s authorized", group)
                if group not in self.groups:
                    self.groups.append(group)

    @property
    def ready(self):
        """True once all expected keypairs and groups are registered."""
        return len(self.keypairs) == self.nb_keypairs and \
            len(self.groups) == self.nb_groups

    @property
    def private_keys(self):
        """Map each registered keypair name to its private key file path."""
        return dict((k, os.path.join(self.cred_dir, "%s.priv" % (k)))
                    for k in self.keypairs)

    @property
    def keypair_names(self):
        """Names of the keypairs the run creates and later deletes."""
        return ["uectest-k%s" % (k) for k in range(self.nb_keypairs)]

    @property
    def group_names(self):
        """Names of the groups the run creates and later deletes."""
        return ["uectest-g%s" % (g) for g in range(self.nb_groups)]

    def getProcessOutput(self, executable, args=(), timeout=60, eucarc=True):
        """Run `executable args...` through `bash -c`.

        When `eucarc` is true the user's eucarc file is sourced first.
        Returns what uec_utils.getProcessOutput returns.

        The arguments are joined into one shell string on purpose (setup()
        passes a shell redirection as an argument), so they must never come
        from untrusted input.
        """
        prefix = "source %s/eucarc ; " % self.cred_dir if eucarc else ""
        myargs = ['-c', "%s%s %s" % (prefix, executable, " ".join(args))]
        self.logger.debug("Running bash command: %s ", myargs)
        return uec_utils.getProcessOutput('bash', myargs, timeout=timeout,
                                          env=os.environ)

    def cleanup(self):
        """Delete every keypair and group the run may have created."""
        for k in self.keypair_names:
            self.logger.debug("Deleting keypair %s", k)
            deleted = self.getProcessOutput(euca_delete_keypair, args=[k])
            deleted.addCallback(self.keypairDeleted, k)
        for g in self.group_names:
            self.logger.debug("Deleting group %s", g)
            deleted = self.getProcessOutput(euca_delete_group, args=[g])
            deleted.addCallback(self.groupDeleted, g)

    def keypairDeleted(self, output, key):
        self.logger.info("Keypair (%s) deleted", key)
        self.logger.debug("Keypair deleted output: %s", output)

    def groupDeleted(self, output, group):
        self.logger.info("Group (%s) deleted", group)
        self.logger.debug("Group deleted output: %s", output)

363

364

def checkRessources(instances, admin_user, first_call=False):
    """Query the availability zones and hand the listing to availableEMI.

    While the user is not ready, the check is retried every 10 seconds.
    `first_call` is forwarded unchanged, both on retry and to availableEMI.
    """
    if not admin_user.ready:
        # Forward the caller's value: forcing True here would repeat the
        # first-call-only work for a call that was not the first one.
        reactor.callLater(10, checkRessources, instances, admin_user,
                          first_call=first_call)
        return
    logging.debug("Checking ressource availability with %s", admin_user)
    logging.debug("Known instances: %s", instances)
    zones = admin_user.getProcessOutput(euca_describe_availability_zones,
                                        args=['verbose'])
    zones.addCallback(availableEMI, instances, admin_user, first_call)

374

375

def availableEMI(output, instances, admin_user, first_call):
    """Parse the availability-zone listing, then request the image list.

    Lines of the form `<name> <n> / <max> <cpu> <ram> <disk>` describe one
    VM type each. A type counts as available when <n> is non-zero
    (presumably the number of free slots - confirm against the tool output).

    On the first call only, the per-type figures are stored in
    Instance.ZONE_CPU / Instance.ZONE_CPU_DICT, Instance.MAX_INSTANCES is
    computed, and the start/stop termination timer is armed.
    """
    logging.debug("describe-availability-zones: %s", output)
    # Get a list of types which have available ressource
    available_types = []
    zone_cpu = []
    for line in output.split("\n"):
        m = re.search(r"(\S+)\s+(\d+)\s/\s(\d+)\s+(\d+)\s+(\d+)\s+(\d+)",
                      line)
        if not m:
            continue
        vmtype = m.group(1)
        if int(m.group(2)) != 0:
            available_types.append(vmtype)
        if first_call:
            zone_cpu.append(vmtype)
            Instance.ZONE_CPU_DICT[vmtype] = zone._make(
                [int(m.group(3)), int(m.group(4)),
                 int(m.group(5)), int(m.group(6))])

    if first_call:
        Instance.ZONE_CPU = zone_cpu
        if zone_cpu:
            remaining = Instance.ZONE_CPU_DICT[zone_cpu[0]].max
            if opts["max_instances"]:
                Instance.MAX_INSTANCES = remaining
            # NOTE(review): the loop below also runs in max_instances mode
            # and adds to the value just assigned, as the original did;
            # confirm that this is intended.
            # Hand out the capacity round-robin over the VM types, one
            # instance per type per pass, skipping types that do not fit.
            while remaining > 0:
                placed = False
                # The loop variable must not be called `zone`: that would
                # overwrite the module-level namedtuple used above.
                for vmtype in zone_cpu:
                    cpu = Instance.ZONE_CPU_DICT[vmtype].cpu
                    if 0 < cpu <= remaining:
                        remaining -= cpu
                        Instance.MAX_INSTANCES += 1
                        placed = True
                if not placed:
                    # Nothing fits any more; stop instead of spinning.
                    break
        else:
            logging.warning("No VM type found in availability zones output")
        # NOTE(review): arming the timer once, on the first call, is the
        # reading chosen for the original's lost indentation.
        if overtime:
            reactor.callLater(int(overtime) * 60, TerminaTestedInstances,
                              instances)

    images = admin_user.getProcessOutput(euca_describe_images, args=[])
    images.addCallback(availableRessource, instances,
                       admin_user, available_types)

407

408

def _leastUsed(candidates, usage):
    """Return the first candidate with the lowest usage count, or None when
    there is no candidate."""
    best = None
    for candidate in candidates:
        if best is None or usage[candidate] < usage[best]:
            best = candidate
    return best


def availableRessource(output, instances, admin_user, available_types):
    """Start one new instance if possible, then schedule the next step.

    `output` is the image listing; `available_types` are the VM types that
    currently have room. The least used EMI, VM type, keypair and group are
    combined into the new instance.

    Without --start_stop the resource check is repeated until the resources
    run out or Instance.MAX_INSTANCES is reached, after which termination of
    the tested instances is scheduled. With --start_stop the check repeats
    for as long as the module-level `overtime` is set.
    """
    # Get a list of available EMI
    available_emi = []
    for line in output.split("\n"):
        m = re.search(r"IMAGE\s+(emi.*?)\s+", line)
        if m:
            available_emi.append(m.group(1))
    logging.debug("Available VM types: %s", available_types)
    logging.debug("Available EMI: %s", available_emi)
    # Reading the defaultdicts creates a zero counter for every new name.
    for t in available_types:
        Instance.VMTYPE_USED[t]
    for i in available_emi:
        Instance.EMI_USED[i]

    # Start one instance of an emi that has ressource available.
    # Candidates are compared with each other only: comparing with the
    # lowest counter overall would give up whenever the least used type
    # happens to be unavailable.
    emi = _leastUsed(available_emi, Instance.EMI_USED)
    vmtype = _leastUsed(available_types, Instance.VMTYPE_USED)
    key = _leastUsed(admin_user.keypairs, Instance.KEYPAIR_USED)
    group = _leastUsed(admin_user.groups, Instance.GROUP_USED)
    ressources_ok = None not in (emi, vmtype, key, group)
    if not ressources_ok:
        logging.info("Not enough ressource available to start a new EMI")

    # In max_instances mode always use the first listed VM type when it has
    # room.
    if opts["max_instances"] and Instance.ZONE_CPU \
            and Instance.ZONE_CPU[0] in available_types:
        vmtype = Instance.ZONE_CPU[0]

    if ressources_ok:
        logging.info("Creating new instance")
        new_instance = Instance(emi=emi, vmtype=vmtype,
                                user=admin_user, ssh_key=key, group=group)
        instances.append(new_instance)
        Instance.EMI_USED[emi] += 1
        Instance.VMTYPE_USED[vmtype] += 1
        Instance.KEYPAIR_USED[key] += 1
        Instance.GROUP_USED[group] += 1
        new_instance.start()

    if not opts['start_stop']:
        if ressources_ok and len(instances) < Instance.MAX_INSTANCES:
            reactor.callLater(random.randint(20, 30) + random.random(),
                              checkRessources, instances, admin_user)
        else:
            logging.info("Max instances reached. Stop checking for %s" %
                         "available ressource.")
            reactor.callLater(30, TerminaTestedInstances, instances)
    elif overtime:
        reactor.callLater(random.randint(20, 30) + random.random(),
                          checkRessources, instances, admin_user)

461

462

def TerminaTestedInstances(instances):
    """Terminate the instances once all are tested; stop when all are gone.

    Clears the module-level `overtime` flag. When every instance has a
    final test state (success, failed or lost), the ones not yet terminated
    are terminated. The reactor is stopped once every instance is in state
    'terminated'; until then the function reschedules itself every 10 to 15
    seconds.
    """
    global overtime
    overtime = False
    logging.debug("Check if all the instances have been tested")
    final_states = ("success", "failed", "lost")
    if all(i.test_state in final_states for i in instances):
        logging.info("Terminate all tested instances")
        for i in instances:
            # Plain comparison: `not in 'terminated'` was a substring test
            # and raised TypeError for a state of None.
            if i.state != 'terminated':
                i.terminate()
    if all(i.state == 'terminated' for i in instances):
        logging.info("Test run completed")
        reactor.stop()
    else:
        reactor.callLater(random.randint(10, 15) + random.random(),
                          TerminaTestedInstances, instances)

479

480

def checkInstancesState(instances, admin_user):
    """Request the instance listing and pass it to instancesState.

    While the user is not ready the call is retried after 15 seconds.
    """
    if admin_user.ready:
        logging.debug("Checking instances state")
        listing = admin_user.getProcessOutput(euca_describe_instances)
        listing.addCallback(instancesState, instances, admin_user)
    else:
        reactor.callLater(15, checkInstancesState, instances, admin_user)

487

488

def instancesState(output, instances, admin_user):
    """Update the known instances from the instance listing.

    For every INSTANCE line whose id belongs to one of `instances`, the
    state (field 5), the public IP (field 3, unless it is 0.0.0.0) and the
    zone (the word in front of the `eki-` field) are stored on the
    instance. The next poll is scheduled 10 to 20 seconds later.
    """
    logging.debug("describe instance: %s", output)
    for line in output.split("\n"):
        if not line.startswith('INSTANCE'):
            continue
        fields = line.split()
        if len(fields) < 6:
            # A truncated line must not raise here: the exception would
            # skip the rescheduling below and end the polling for good.
            logging.debug("Skipping short instance line: %s", line)
            continue
        i_id = fields[1]
        known = next((i for i in instances if i.id == i_id), None)
        if known is None:
            # Not one of ours.
            continue
        known.state = fields[5]
        pub_ip = fields[3]
        if pub_ip != '0.0.0.0':
            known.logger.debug("Setting pub ip (%s)", pub_ip)
            known.pub_ip = pub_ip
        m = re.search(r"\s+(\w+)\s+eki-", line)
        if m:
            known.zone = m.group(1)
    # Keep polling; stopping the reactor is done in TerminaTestedInstances.
    reactor.callLater(random.randint(10, 20) + random.random(),
                      checkInstancesState, instances, admin_user)

515

516

def cleanUp(instances):
    """Terminate every given instance, then clean up the admin user.

    Uses the module-level `admin_user`.
    """
    logging.info("Cleaning up")
    logging.debug("Cleaning instances: %s", instances)
    for instance in instances:
        instance.terminate()
    logging.debug("Cleaning user: %s", admin_user.user_id)
    admin_user.cleanup()

522

523

def printStats(instances, once=False):
    """Log a YAML dump of the per-test-state statistics.

    For each state in Instance.TEST_STATES the dump holds the number of
    instances in that state, its share of all started instances (two
    decimals) and the ids of those instances. Unless `once` is true, the
    function reschedules itself every 60 seconds.
    """
    stats = {'started': len(instances)}
    for s in Instance.TEST_STATES:
        # One pass per state gives both the ids and the count.
        ids = [i.id for i in instances if i.test_state == s]
        stats[s] = len(ids)
        try:
            stats["%s_rate" % (s)] = float(
                "%.2f" % (float(stats[s]) / stats['started']))
        except ZeroDivisionError:
            # No instance started yet.
            stats["%s_rate" % (s)] = 0.00
        stats["%s_instances" % (s)] = ids
    logging.getLogger('STATS').info("Stats: %s" % (yaml.dump(stats)))
    if not once:
        reactor.callLater(60, printStats, instances)

537

538

def printFinalStats(instances):
    """Print the final statistics to stdout, one `key : value` per line.

    For each state in Instance.TEST_STATES the number of instances in that
    state and its share of all started instances (two decimals) are
    printed, sorted by key, together with the `started` total.
    """
    stats = {'started': len(instances)}
    for s in Instance.TEST_STATES:
        stats[s] = sum(1 for i in instances if i.test_state == s)
        try:
            stats["%s_rate" % (s)] = float(
                "%.2f" % (float(stats[s]) / stats['started']))
        except ZeroDivisionError:
            # No instance was started.
            stats["%s_rate" % (s)] = 0.00
    for s in sorted(stats.keys()):
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        print("%s : %s" % (s, stats[s]))

549

550

def main():
    """Parse the options, configure logging and run the test.

    Returns the process exit status: 0 when every started instance passed
    its test (or after a --cleanup run), 1 on a usage error, when not run
    as root, or when at least one instance did not reach 'success'.
    """
    LEVELS = {'debug': logging.DEBUG,
              'info': logging.INFO,
              'warning': logging.WARNING,
              'error': logging.ERROR,
              'critical': logging.CRITICAL}
    global opts
    global admin_user
    global overtime
    opts = Options()

    try:
        opts.parseOptions()  # When given no argument, parses sys.argv[1:]
    except usage.UsageError as errortext:
        print('%s: %s' % (sys.argv[0], errortext))
        print('%s: Try --help for usage details.' % (sys.argv[0]))
        return 1

    # Verify that script is run as root
    if os.getuid():
        sys.stderr.write(
            "This script needs superuser permissions to run correctly\n")
        return 1

    # The value is converted with int() much later, inside a reactor
    # callback; reject a bad value here where it can still be reported.
    if opts["start_stop"]:
        try:
            int(opts["start_stop"])
        except ValueError:
            sys.stderr.write(
                "%s: start_stop must be a whole number of minutes\n"
                % (sys.argv[0]))
            return 1

    # An unknown level name falls back to NOTSET.
    level = LEVELS.get(opts['log'], logging.NOTSET)

    if opts['write_log']:
        if opts["max_instances"]:
            suffix = "max_instances"
        elif opts["start_stop"]:
            suffix = "start_stop"
        else:
            suffix = "all_vmtypes"
        LOG_FILENAME = "/var/log/uec_instances.%s.log" % suffix
        handler = logging.handlers.RotatingFileHandler(LOG_FILENAME,
                                                       backupCount=2)
        logging.getLogger().addHandler(handler)
        handler.setFormatter(
            logging.Formatter("[%(asctime)s] %(levelname)s %(message)s"))
        logging.getLogger().setLevel(level)
        # Rotate right away so that this run starts with an empty file.
        handler.doRollover()
    else:
        logging.basicConfig(level=level)
    logging.info("log level set to " + opts['log'])

    instances = []
    admin_user = User()  # Create admin user
    if opts["start_stop"]:
        overtime = opts["start_stop"]

    if opts["cleanup"]:
        cleanUp(instances)
        return 0

    reactor.callWhenRunning(admin_user.setup)  # setup admin user
    reactor.callWhenRunning(checkRessources, instances, admin_user,
                            first_call=True)
    reactor.callWhenRunning(checkInstancesState, instances, admin_user)
    printStats(instances)
    reactor.run()

    # The reactor has stopped: clean up and report.
    cleanUp(instances)
    printStats(instances, once=True)
    printFinalStats(instances)
    if all(i.test_state == 'success' for i in instances):
        return 0
    return 1

614

615

# Script entry point: the value returned by main() is the exit status.
if __name__ == "__main__":
    raise SystemExit(main())

617