~wafaa-mohamed/python-ox-image/dev

« back to all changes in this revision

Viewing changes to ox/html.py

Committer: j
Date: 2013-10-24 16:40:04 UTC
Revision ID: j-20131024164004-u7vo5c3q0jcp0nn5

allow iframes in sanitize_html

files modified:
ox/html.py

Show diffs side-by-side

added

removed

ox/html.py

234

'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',

235

# other

236

'a', 'br', 'img', 'figure', 'figcaption',

237

# iframe

238

'iframe',

237

239

# special

238

240

'rtl', '[]'

239

241

]

240

242

parse = {

241

'a': {

242

'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>': '<a href="{1}">',

243

'<\/a>': '</a>'

244

245

'img': {

246

'<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">'

247

248

'rtl': {

249

'<rtl>': '<div style="direction: rtl">',

250

'<\/rtl>': '</div>'

251

252

'*': lambda tag: {'<(/?' + tag + ') ?/?>':'<{1}>'}

243

'a': [

244

[

245

'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>',

246

'<a href="{1}">'

247

248

['<\/a>', '</a>']

249

250

'img': [

251

[

252

'<img [^<>]*?src="((https?:\/\/|\/)[^"]+?)".*?>',

253

'<img src="{1}">'

254

]

255

256

'iframe': [

257

[

258

'<iframe [^<>]*?width="(\d+)" height="(\d+)"[^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',

259

'<iframe width="{1}" height="{2}" src="{3}">'

260

261

[

262

'<iframe [^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',

263

'<iframe src="{1}">'

264

265

[

266

'<\/iframe>',

267

'</iframe>'

268

]

269

270

'rtl': [

271

[

272

'<rtl>',

273

'<div style="direction: rtl">'

274

275

['<\/rtl>', '</div>']

276

277

'*': lambda tag: [['<(/?' + tag + ') ?/?>', '<{1}>']]

253

278

}

254

279

matches = []

255

280

262

287

'<a href="\\1">\\3</a>', html);

263

288

tags = filter(lambda tag: tag != '[]', tags)

264

289

265

def replace_match(match, value, replace):

290

def replace_match(match, value, regexp):

266

291

i = 1

267

292

for m in match.groups():

268

293

value = value.replace('{%d}'%i, m)

272

297

273

298

for tag in tags:

274

299

p = parse.get(tag, parse['*'](tag))

275

for replace in p:

300

for regexp, value in p:

276

301

html = re.sub(

277

re.compile(replace, re.IGNORECASE),

278

lambda match: replace_match(match, p[replace][:], replace),

302

re.compile(regexp, re.IGNORECASE),

303

lambda match: replace_match(match, value[:], regexp),

279

304

html

280

305

)

281

306

html = escape(html)

283

308

html = html.replace('\t%d\t'%(i+1), matches[i])

284

309

html = html.replace('\n\n', '<br/><br/>')

285

310

html = add_links(html)

286

return sanitize_fragment(html)

311

return sanitize_fragment(html)

287

312

288

313

def sanitize_fragment(html):

314

'''

315

#html5lib reorders arguments, so not usable

289

316

import html5lib

290

317

return html5lib.parseFragment(html).toxml().decode('utf-8')

318

'''

319

import lxml.html

320

body = lxml.html.document_fromstring(html).find('body')

321

return lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')

291

322

Older »