~excid3/keryx/devel : revision 11

1

# This library is free software; you can redistribute it and/or

2

# modify it under the terms of the GNU Lesser General Public

3

# License as published by the Free Software Foundation; either

4

# version 2.1 of the License, or (at your option) any later version.

5

#

6

# This library is distributed in the hope that it will be useful,

7

# but WITHOUT ANY WARRANTY; without even the implied warranty of

8

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

9

# Lesser General Public License for more details.

10

#

11

# You should have received a copy of the GNU Lesser General Public

12

# License along with this library; if not, write to the

13

# Free Software Foundation, Inc.,

14

# 59 Temple Place, Suite 330,

15

# Boston, MA 02111-1307 USA

16

17

# This file is part of urlgrabber, a high-level cross-protocol url-grabber

18

19

20

"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

21

22

>>> import urllib2

23

>>> from keepalive import HTTPHandler

24

>>> keepalive_handler = HTTPHandler()

25

>>> opener = urllib2.build_opener(keepalive_handler)

26

>>> urllib2.install_opener(opener)

27

>>>

28

>>> fo = urllib2.urlopen('http://www.python.org')

29

30

If a connection to a given host is requested, and all of the existing

31

connections are still in use, another connection will be opened. If

32

the handler tries to use an existing connection but it fails in some

33

way, it will be closed and removed from the pool.

34

35

To remove the handler, simply re-run build_opener with no arguments, and

36

install that opener.

37

38

You can explicitly close connections by using the close_connection()

39

method of the returned file-like object (described below) or you can

40

use the handler methods:

41

42

close_connection(host)

43

close_all()

44

open_connections()

45

46

NOTE: using the close_connection and close_all methods of the handler

47

should be done with care when using multiple threads.

48

* there is nothing that prevents another thread from creating new

49

connections immediately after connections are closed

50

* no checks are done to prevent in-use connections from being closed

51

52

>>> keepalive_handler.close_all()

53

54

EXTRA ATTRIBUTES AND METHODS

55

56

Upon a status of 200, the object returned has a few additional

57

attributes and methods, which should not be used if you want to

58

remain consistent with the normal urllib2-returned objects:

59

60

close_connection() - close the connection to the host

61

readlines() - you know, readlines()

62

status - the return status (e.g. 404)

63

reason - English translation of status (e.g. 'File not found')

64

65

If you want the best of both worlds, use this inside an

66

AttributeError-catching try:

67

68

>>> try: status = fo.status

69

>>> except AttributeError: status = None

70

71

Unfortunately, these are ONLY there if status == 200, so it's not

72

easy to distinguish between non-200 responses. The reason is that

73

urllib2 tries to do clever things with error codes 301, 302, 401,

74

and 407, and it wraps the object upon return.

75

76

For python versions earlier than 2.4, you can avoid this fancy error

77

handling by setting the module-level global HANDLE_ERRORS to zero.

78

You see, prior to 2.4, it's the HTTP Handler's job to determine what

79

to handle specially, and what to just pass up. HANDLE_ERRORS == 0

80

means "pass everything up". In python 2.4, however, this job no

81

longer belongs to the HTTP Handler and is now done by a NEW handler,

82

HTTPErrorProcessor. Here's the bottom line:

83

84

python version < 2.4

85

HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as

86

errors

87

HANDLE_ERRORS == 0 pass everything up, error processing is

88

left to the calling code

89

python version >= 2.4

90

HANDLE_ERRORS == 1 pass up 200, treat the rest as errors

91

HANDLE_ERRORS == 0 (default) pass everything up, let the

92

other handlers (specifically,

93

HTTPErrorProcessor) decide what to do

94

95

In practice, setting the variable either way makes little difference

96

in python 2.4, so for the most consistent behavior across versions,

97

you probably just want to use the defaults, which will give you

98

exceptions on errors.

99

100

"""

101

102

# $Id: keepalive.py,v 1.17 2006/12/08 00:14:16 mstenner Exp $

103

104

import urllib2

105

import httplib

106

import socket

107

import thread

108

109

# Optional logger-like object.  While it is None (the default) no debug
# output is produced; when set, the handlers call its info() and error()
# methods (test_timeout() installs a simple printing logger here).
DEBUG = None

110

111

import sslfactory

112

113

import sys

114

# Default error-handling mode (see the module docstring): interpreters
# older than 2.4 default to 1, everything from 2.4 on defaults to 0.
if sys.version_info >= (2, 4):
    HANDLE_ERRORS = 0
else:
    HANDLE_ERRORS = 1

116

117

class ConnectionManager:
    """
    Thread-safe registry of pooled connections.

    The connection manager must be able to:
      * keep track of all existing connections, grouped by host
      * remember which host each connection belongs to
      * remember whether each connection is ready (idle) or in use
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """Register *connection* for *host* with the given ready state."""
        self._lock.acquire()
        try:
            if host not in self._hostmap:
                self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget *connection*; unknown connections are silently ignored."""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Update the ready state of a connection that is still tracked.

        A connection that has already been removed is ignored.  (A plain
        dict assignment never raises KeyError, so the state must be checked
        explicitly; otherwise removed connections would be re-inserted
        into the ready map and never released.)
        """
        self._lock.acquire()
        try:
            if connection in self._readymap:
                self._readymap[connection] = ready
        finally:
            self._lock.release()

    def get_ready_conn(self, host):
        """Return an idle connection to *host*, marking it as in use.

        Returns None when no tracked connection to *host* is ready.
        """
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a snapshot of the tracked connections.

        With *host*, a list of that host's connections (empty if none);
        without, a dict mapping every host to a list of its connections.
        All returned containers are copies, so callers may call remove()
        while iterating over them.
        """
        self._lock.acquire()
        try:
            if host:
                return list(self._hostmap.get(host, []))
            else:
                snapshot = {}
                for name, conns in self._hostmap.items():
                    snapshot[name] = list(conns)
                return snapshot
        finally:
            self._lock.release()

176

177

class KeepAliveHandler:
    """Keep-alive logic shared by the HTTP and HTTPS handlers.

    Connections are pooled per host in a ConnectionManager.  Subclasses
    must override _get_connection() to create a new connection object;
    the methods below also use ``self.parent`` (addheaders, error).
    """
    def __init__(self):
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        for host, conns in self._cm.get_all().items():
            # iterate over a copy: remove() may mutate the very list we
            # were handed, which would make the loop skip connections
            for h in list(conns):
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        """drop <connection> from the pool, closing it first if <close>"""
        if close: connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def do_open(self, req):
        """Perform <req> over a pooled connection (or a new one).

        Returns the response object when its status is 200 or when
        HANDLE_ERRORS is false; otherwise returns whatever
        self.parent.error() returns.  Raises urllib2.URLError when no host
        is given or when a socket/httplib error occurs.
        """
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r: break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = self._get_connection(host)
                if DEBUG: DEBUG.info("creating new connection to %s (%d)",
                                     host, id(h))
                self._cm.add(host, h, 0)
                try:
                    self._start_transaction(h, req)
                    r = h.getresponse()
                except:
                    # the connection was registered as "in use"; if its
                    # first transaction fails nobody would ever release
                    # it, so drop it from the pool before re-raising
                    self._cm.remove(h)
                    h.close()
                    raise
        except (socket.error, httplib.HTTPException):
            raise urllib2.URLError(sys.exc_info()[1])

        if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            if DEBUG: DEBUG.info('server will close connection, discarding')
            self._cm.remove(h)

        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except:
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The tradeoff is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG: DEBUG.error("unexpected exception - closing " + \
                                  "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
                                 host, id(h))
            r = None
        else:
            if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        """send the request line, headers and (for POST) body of <req> on h

        Errors raised while building the request line and entity headers
        are re-raised as urllib2.URLError.
        """
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector())
                if 'Content-type' not in req.headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'Content-length' not in req.headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector())
        except (socket.error, httplib.HTTPException):
            raise urllib2.URLError(sys.exc_info()[1])

        for args in self.parent.addheaders:
            h.putheader(*args)
        for k, v in req.headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)

    def _get_connection(self, host):
        """create a new connection to <host>; subclasses must override"""
        raise NotImplementedError

331

332

class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """Keep-alive handler for plain HTTP requests."""

    def __init__(self):
        # only the keep-alive base class is initialised explicitly
        KeepAliveHandler.__init__(self)

    def http_open(self, req):
        """Open <req> by delegating to the shared do_open()."""
        return self.do_open(req)

    def _get_connection(self, host):
        """Build a new (not yet pooled) HTTPConnection to <host>."""
        connection = HTTPConnection(host)
        return connection

341

342

class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
    """Keep-alive handler for HTTPS requests."""

    def __init__(self, ssl_factory=None):
        KeepAliveHandler.__init__(self)
        # fall back to the factory provided by sslfactory when the caller
        # did not supply one (or supplied a false value)
        self._ssl_factory = ssl_factory or sslfactory.get_factory()

    def https_open(self, req):
        """Open <req> by delegating to the shared do_open()."""
        return self.do_open(req)

    def _get_connection(self, host):
        """Ask the SSL factory for a connection to <host>.

        If that attempt raises AttributeError, a plain HTTPSConnection is
        created instead.
        """
        try:
            return self._ssl_factory.get_https_connection(host)
        except AttributeError:
            return HTTPSConnection(host)

355

356

class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accommodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py


    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            # pass strict explicitly: with only three positional arguments
            # the method would land in httplib's "strict" parameter and
            # the real method would be lost
            httplib.HTTPResponse.__init__(self, sock, debuglevel,
                                          strict, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''
        self._rbufsize = 8096
        self._handler = None # inserted by the handler later
        self._host = None    # (same)
        self._url = None     # (same)
        self._connection = None # (same)

    _raw_read = httplib.HTTPResponse.read

    def close(self):
        """close the response and tell the handler (if any) that the
        underlying connection may be used for another request"""
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        """close the underlying connection, remove it from the handler's
        pool, then close this response"""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        """return the response headers"""
        return self.headers

    def geturl(self):
        """return the URL recorded on this response by the handler"""
        return self._url

    def read(self, amt=None):
        """read up to <amt> bytes (everything if <amt> is None), serving
        data left in the readline buffer first"""
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and amt is not None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        """return the next line (including its newline), or at most
        <limit> characters of it when <limit> is non-negative; returns ''
        at end of data"""
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        # only shorten the result: comparing limit with the buffer length
        # instead of the line length would return data past the newline
        if 0 <= limit < i: i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint = 0):
        """return the remaining lines as a list, stopping early once at
        least <sizehint> characters have been collected (if non-zero)"""
        total = 0
        lines = []
        while 1:
            line = self.readline()
            if not line: break
            lines.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return lines

452

453

454

class HTTPConnection(httplib.HTTPConnection):
    """httplib.HTTPConnection whose responses are keepalive HTTPResponses."""
    # use the modified response class
    response_class = HTTPResponse

457

458

class HTTPSConnection(httplib.HTTPSConnection):
    """httplib.HTTPSConnection whose responses are keepalive HTTPResponses."""
    # use the modified response class
    response_class = HTTPResponse

460

461

#########################################################################

462

##### TEST FUNCTIONS

463

#########################################################################

464

465

def error_handler(url):

466

global HANDLE_ERRORS

467

orig = HANDLE_ERRORS

468

keepalive_handler = HTTPHandler()

469

opener = urllib2.build_opener(keepalive_handler)

470

urllib2.install_opener(opener)

471

pos = {0: 'off', 1: 'on'}

472

for i in (0, 1):

473

print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)

474

HANDLE_ERRORS = i

475

try:

476

fo = urllib2.urlopen(url)

477

foo = fo.read()

478

fo.close()

479

try: status, reason = fo.status, fo.reason

480

except AttributeError: status, reason = None, None

481

except IOError, e:

482

print " EXCEPTION: %s" % e

483

raise

484

else:

485

print " status = %s, reason = %s" % (status, reason)

486

HANDLE_ERRORS = orig

487

hosts = keepalive_handler.open_connections()

488

print "open connections:", hosts

489

keepalive_handler.close_all()

490

491

def continuity(url):

492

import md5

493

format = '%25s: %s'

494

495

# first fetch the file with the normal http handler

496

opener = urllib2.build_opener()

497

urllib2.install_opener(opener)

498

fo = urllib2.urlopen(url)

499

foo = fo.read()

500

fo.close()

501

m = md5.new(foo)

502

print format % ('normal urllib', m.hexdigest())

503

504

# now install the keepalive handler and try again

505

opener = urllib2.build_opener(HTTPHandler())

506

urllib2.install_opener(opener)

507

508

fo = urllib2.urlopen(url)

509

foo = fo.read()

510

fo.close()

511

m = md5.new(foo)

512

print format % ('keepalive read', m.hexdigest())

513

514

fo = urllib2.urlopen(url)

515

foo = ''

516

while 1:

517

f = fo.readline()

518

if f: foo = foo + f

519

else: break

520

fo.close()

521

m = md5.new(foo)

522

print format % ('keepalive readline', m.hexdigest())

523

524

def comp(N, url):

525

print ' making %i connections to:\n %s' % (N, url)

526

527

sys.stdout.write(' first using the normal urllib handlers')

528

# first use normal opener

529

opener = urllib2.build_opener()

530

urllib2.install_opener(opener)

531

t1 = fetch(N, url)

532

print ' TIME: %.3f s' % t1

533

534

sys.stdout.write(' now using the keepalive handler ')

535

# now install the keepalive handler and try again

536

opener = urllib2.build_opener(HTTPHandler())

537

urllib2.install_opener(opener)

538

t2 = fetch(N, url)

539

print ' TIME: %.3f s' % t2

540

print ' improvement factor: %.2f' % (t1/t2, )

541

542

def fetch(N, url, delay=0):

543

import time

544

lens = []

545

starttime = time.time()

546

for i in range(N):

547

if delay and i > 0: time.sleep(delay)

548

fo = urllib2.urlopen(url)

549

foo = fo.read()

550

fo.close()

551

lens.append(len(foo))

552

diff = time.time() - starttime

553

554

j = 0

555

for i in lens[1:]:

556

j = j + 1

557

if not i == lens[0]:

558

print "WARNING: inconsistent length on read %i: %i" % (j, i)

559

560

return diff

561

562

def test_timeout(url):

563

global DEBUG

564

dbbackup = DEBUG

565

class FakeLogger:

566

def debug(self, msg, *args): print msg % args

567

info = warning = error = debug

568

DEBUG = FakeLogger()

569

print " fetching the file to establish a connection"

570

fo = urllib2.urlopen(url)

571

data1 = fo.read()

572

fo.close()

573

574

i = 20

575

print " waiting %i seconds for the server to close the connection" % i

576

while i > 0:

577

sys.stdout.write('\r %2i' % i)

578

sys.stdout.flush()

579

time.sleep(1)

580

i -= 1

581

sys.stderr.write('\r')

582

583

print " fetching the file a second time"

584

fo = urllib2.urlopen(url)

585

data2 = fo.read()

586

fo.close()

587

588

if data1 == data2:

589

print ' data are identical'

590

else:

591

print ' ERROR: DATA DIFFER'

592

593

DEBUG = dbbackup

594

595

596

def test(url, N=10):

597

print "checking error hander (do this on a non-200)"

598

try: error_handler(url)

599

except IOError, e:

600

print "exiting - exception will prevent further tests"

601

sys.exit()

602

print

603

print "performing continuity test (making sure stuff isn't corrupted)"

604

continuity(url)

605

print

606

print "performing speed comparison"

607

comp(N, url)

608

print

609

print "performing dropped-connection check"

610

test_timeout(url)

611

612

if __name__ == '__main__':

613

import time

614

import sys

615

try:

616

N = int(sys.argv[1])

617

url = sys.argv[2]

618

except:

619

print "%s <integer> <url>" % sys.argv[0]

620

else:

621

test(url, N)