~certify-web-dev/twisted/certify-trunk

« back to all changes in this revision

Viewing changes to twisted/web/sux.py

Committer: Marc Tardif
Date: 2010-05-20 19:56:06 UTC
Revision ID: marc.tardif@canonical.com-20100520195606-xdrf0ztlxhvwmmzb

Added twisted-web.

files added:
twisted/plugins/twisted_web.py

twisted/web

twisted/web/__init__.py

twisted/web/_auth

twisted/web/_auth/__init__.py

twisted/web/_auth/basic.py

twisted/web/_auth/digest.py

twisted/web/_auth/wrapper.py

twisted/web/_newclient.py

twisted/web/_version.py

twisted/web/client.py

twisted/web/demo.py

twisted/web/distrib.py

twisted/web/domhelpers.py

twisted/web/error.py

twisted/web/google.py

twisted/web/guard.py

twisted/web/html.py

twisted/web/http.py

twisted/web/http_headers.py

twisted/web/iweb.py

twisted/web/microdom.py

twisted/web/proxy.py

twisted/web/resource.py

twisted/web/rewrite.py

twisted/web/script.py

twisted/web/server.py

twisted/web/soap.py

twisted/web/static.py

twisted/web/sux.py

twisted/web/tap.py

twisted/web/test

twisted/web/test/__init__.py

twisted/web/test/_util.py

twisted/web/test/test_cgi.py

twisted/web/test/test_distrib.py

twisted/web/test/test_domhelpers.py

twisted/web/test/test_error.py

twisted/web/test/test_http.py

twisted/web/test/test_http_headers.py

twisted/web/test/test_httpauth.py

twisted/web/test/test_newclient.py

twisted/web/test/test_proxy.py

twisted/web/test/test_resource.py

twisted/web/test/test_script.py

twisted/web/test/test_soap.py

twisted/web/test/test_static.py

twisted/web/test/test_tap.py

twisted/web/test/test_vhost.py

twisted/web/test/test_web.py

twisted/web/test/test_webclient.py

twisted/web/test/test_wsgi.py

twisted/web/test/test_xml.py

twisted/web/test/test_xmlrpc.py

twisted/web/trp.py

twisted/web/twcgi.py

twisted/web/util.py

twisted/web/vhost.py

twisted/web/wsgi.py

twisted/web/xmlrpc.py

Show diffs side-by-side

added added

removed removed

twisted/web/sux.py

# -*- test-case-name: twisted.web.test.test_xml -*-

# See LICENSE for details.

"""

*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network

protocol. It is not at all clever. Its main features are that it

does not:

- support namespaces

- mung mnemonic entity references

- validate

- perform *any* external actions (such as fetching URLs or writing files)

under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken

HTML (as an option; they are not on by default).

"""

from twisted.internet.protocol import Protocol, FileWrapper

from twisted.python.reflect import prefixedMethodNames

# Indices into the (begin, do, end) handler three-tuples that make up the
# values of the state table (see zipfndict and XMLParser._buildStateTable).

BEGIN_HANDLER = 0

DO_HANDLER = 1

END_HANDLER = 2

# Punctuation accepted, in addition to alphanumerics, in tag names and
# attribute names.

identChars = '.-_:'

# Wider set of punctuation tolerated in attribute names and unquoted attribute
# values when the parser runs with beExtremelyLenient set.

lenientIdentChars = identChars + ';+#/%~'

def nop(*args, **kw):
    """
    Accept any positional and keyword arguments, do nothing, return None.

    Used as the stand-in for state handlers that a state does not define.
    """
    return None

def unionlist(*args):
    """
    Return the distinct items found across all of the given sequences.

    Items must be hashable: they are de-duplicated by using them as
    dictionary keys, and the keys of that dictionary are what is returned.
    """
    seen = {}
    for sequence in args:
        for item in sequence:
            seen[item] = 1
    return seen.keys()

def zipfndict(*args, **kw):
    """
    Zip several dictionaries together, key by key.

    The result maps every key that appears in any of the given dictionaries
    to a tuple holding, for each dictionary in argument order, that
    dictionary's value for the key.  Where a dictionary lacks the key, the
    'default' keyword argument (nop if not supplied) is used instead.
    """
    fallback = kw.get('default', nop)
    keyLists = [fndict.keys() for fndict in args]
    combined = {}
    for key in unionlist(*keyLists):
        entries = []
        for fndict in args:
            entries.append(fndict.get(key, fallback))
        combined[key] = tuple(entries)
    return combined

def prefixedMethodClassDict(clazz, prefix):
    """
    Map each name reported by prefixedMethodNames(clazz, prefix) to the
    attribute of clazz called prefix + name.
    """
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods

def prefixedMethodObjDict(obj, prefix):
    """
    Map each name reported by prefixedMethodNames for obj's class to the
    attribute of obj called prefix + name.

    The names are discovered on obj.__class__ but looked up on obj itself,
    so the values are whatever attribute access on the instance yields.
    """
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods

class ParseError(Exception):
    """
    Error raised by XMLParser._parseError, carrying the position at which
    parsing failed.

    Attributes:
      filename -- name of the document being parsed
      line     -- line number of the offending character
      col      -- column number of the offending character
      message  -- human-readable description of the problem
    """

    def __init__(self, filename, line, col, message):
        # Hand the arguments to Exception as well, so that e.args and
        # repr(e) are populated and the exception can be re-created from its
        # args (which is what pickling and copying rely on).
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        """Format the error as 'filename:line:col: message'."""
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)

class XMLParser(Protocol):
    """
    Incremental, callback-driven XML/HTML tokenizer.

    Input is consumed one character at a time by a state machine.  A state
    named X is implemented by up to three methods:

      begin_X(byte) -- run when X is entered; it is passed the character
                       that triggered the transition.
      do_X(byte)    -- run for every following character; returns the name
                       of the next state, or None to stay in X.
      end_X()       -- run when X is left.

    Any of the three that a state does not define is replaced by nop.

    What the parser finds is reported through the got* methods at the bottom
    of the class, whose default implementations mostly just print.
    """

    # Name of the current state; None until dataReceived is first called.
    state = None
    # Encodings used to decode incoming data; connectionMade resets it to [].
    encodings = None
    # Document name reported in ParseError.
    filename = "<xml />"
    # When true, many kinds of malformed markup are tolerated instead of
    # raising ParseError.
    beExtremelyLenient = 0
    # Byte-order mark stripped from the first chunk; _decode puts it back in
    # front of every chunk before decoding.
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    # Per-instance cache filled in by _buildStateTable.
    _stateTable = None

    # Name and value of the attribute currently being parsed.
    attrname = ''
    attrval = ''

    def connectionMade(self):
        """Reset the position counters and the list of encodings."""
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """Raise a ParseError for the current file name and position."""
        line, col = self.saveMark()
        raise ParseError(self.filename, line, col, message)

    def _buildStateTable(self):
        """
        Return a dictionary of begin, do, end state function tuples.

        The table is keyed by state name and built from every begin_*, do_*
        and end_* method; handlers a state lacks default to nop.
        """
        # The handlers are looked up on this instance, so the table is cached
        # on the instance rather than on the class: a class-level table would
        # hand one parser's handlers to every other parser and keep that
        # parser alive for as long as the class exists.
        stateTable = self._stateTable
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode a chunk of input using self.encodings and return the result.

        The saved byte-order mark, if any, is put back in front of the chunk
        first.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            # unicode() is the Python 2 builtin.
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Return the name of the state to enter once a tag has been read:
        'waitforendscript' after a <script> start tag that has no src
        attribute, 'bodydata' otherwise.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def dataReceived(self, data):
        """
        Run a chunk of the document through the state machine, one character
        at a time.

        On the first chunk a leading UTF-16 byte-order mark is detected and
        stripped.  While the chunk is being processed, saveMark is replaced by
        a closure over the local line/column counters; it is restored, and
        the counters written back, even if a handler raises.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with one of these strings
            for bom in ('\xff\xfe', '\xfe\xff'):
                if data.startswith(bom):
                    self._prepend = bom
                    self.encodings.append('UTF-16')
                    data = data[2:]
                    break
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.

        Does nothing if no data was ever received, since no state has been
        entered in that case.
        """
        if not self.state:
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """Skip leading whitespace and expect the document to open with '<'."""
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        self.commentbuf = ''

    def do_comment(self, byte):
        """Accumulate comment text until the closing '-->' is seen."""
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        self.tagName = ''        # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0         # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """Read the characters that follow '<' up to the end of the tag name."""
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                if self.beExtremelyLenient:
                    return self.maybeBodyData()
                return 'bodydata'
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # Treat the '<' as literal text.
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r' % byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """Check, character by character, that '<!' is followed by '[CDATA['."""
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Accumulate CDATA content until the closing ']]>' is seen."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """Between attributes: find the start of the next one or the tag end."""
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if self.beExtremelyLenient:
                return self.maybeBodyData()
            return 'bodydata'
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        """Accumulate everything up to the '>' that closes the DOCTYPE."""
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Ignore everything up to the next '>'."""
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """Accumulate an attribute name."""
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): '/' is a member of lenientIdentChars, so it is
            # consumed by the test above and this branch looks unreachable;
            # left as is because reordering would change lenient parse
            # results.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                # Attribute without a value.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """After '=': find the opening quote of the attribute value."""
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    def begin_beforeeq(self, byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """After an attribute name and whitespace: expect '='."""
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # The previous attribute had no value; a new one starts here.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        # byte is the opening quote character.
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        """Accumulate a quoted attribute value up to the matching quote."""
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        """Accumulate an unquoted attribute value."""
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                # The trailing '/' closes the tag; it is not part of the value.
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance attribute so the class default (None) shows
            # through again.
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        """Accumulate text until a tag or entity reference starts."""
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        """Accumulate script text; a '<' may be the start of </script>."""
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        self.erefbuf = ''
        self.erefextra = ''  # extra bit for lenient mode

    def do_entityref(self, byte):
        """Accumulate an entity reference name up to the closing ';'."""
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        print('begin %s %s' % (name, attributes))

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print('text: %s' % (repr(data),))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE %s' % (repr(doctype),))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print('end %s' % (name,))

638

639

# Manual smoke test, run only when the module is executed as a script: it
# creates an XMLParser, connects it to a FileWrapper around an empty StringIO,
# and feeds it the inline testDocument in a single dataReceived() call.  The
# parser's default got* handlers print what they are given.
# Note that fn is assigned but only referenced by the commented-out line that
# would load the test document from disk instead.
if __name__ == '__main__':

640

from cStringIO import StringIO

641

testDocument = '''

642

643

<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>

644

<?xml version="suck it"?>

645

<foo>

646

647

<bar />

648

649

<![CDATA[ foo bar baz ]]>

650

</foo>

651

'''

652

x = XMLParser()

653

x.makeConnection(FileWrapper(StringIO()))

654

# fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html"

655

fn = "/home/glyph/gruesome.xml"

656

# testDocument = open(fn).read()

657

x.dataReceived(testDocument)

Older »