3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
20
* chardet, for auto-detecting character encodings
21
http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
24
http://cjkpython.i18n.org/
26
Beautiful Soup defines classes for two main parsing strategies:
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
language that kind of looks like XML.
31
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
or invalid. This class has web browser-like heuristics for
33
obtaining a sensible parse tree in the face of common HTML errors.
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39
For more than you ever wanted to know about Beautiful Soup, see the
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
43
Here, have some legalese:
45
Copyright (c) 2004-2007, Leonard Richardson
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
53
* Redistributions of source code must retain the above copyright
54
notice, this list of conditions and the following disclaimer.
56
* Redistributions in binary form must reproduce the above
57
copyright notice, this list of conditions and the following
58
disclaimer in the documentation and/or other materials provided
59
with the distribution.
61
* Neither the name of the the Beautiful Soup Consortium and All
62
Night Kosher Bakery nor the names of its contributors may be
63
used to endorse or promote products derived from this software
64
without specific prior written permission.
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79
from __future__ import generators
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
83
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84
__license__ = "New-style BSD"
86
from calibre.ebooks.sgmllib import SGMLParser, SGMLParseError
90
import calibre.ebooks.sgmllib as sgmllib
91
from htmlentitydefs import name2codepoint
93
#This hack makes Beautiful Soup able to parse XML with namespaces
94
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
96
DEFAULT_OUTPUT_ENCODING = "utf-8"
98
# First, the classes that represent markup elements.
101
"""Contains the navigational information for some part of the page
102
(either a tag or a piece of text)"""
104
def setup(self, parent=None, previous=None):
105
"""Sets up the initial relations between this element and
108
self.previous = previous
110
self.previousSibling = None
111
self.nextSibling = None
112
if self.parent and self.parent.contents:
113
self.previousSibling = self.parent.contents[-1]
114
self.previousSibling.nextSibling = self
116
def replaceWith(self, replaceWith):
117
oldParent = self.parent
118
myIndex = self.parent.contents.index(self)
119
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
120
# We're replacing this element with one of its siblings.
121
index = self.parent.contents.index(replaceWith)
122
if index and index < myIndex:
123
# Furthermore, it comes before this element. That
124
# means that when we extract it, the index of this
125
# element will change.
126
myIndex = myIndex - 1
128
oldParent.insert(myIndex, replaceWith)
131
"""Destructively rips this element out of the tree."""
134
self.parent.contents.remove(self)
138
#Find the two elements that would be next to each other if
139
#this element (and any children) hadn't been parsed. Connect
141
lastChild = self._lastRecursiveChild()
142
nextElement = lastChild.next
145
self.previous.next = nextElement
147
nextElement.previous = self.previous
149
lastChild.next = None
152
if self.previousSibling:
153
self.previousSibling.nextSibling = self.nextSibling
155
self.nextSibling.previousSibling = self.previousSibling
156
self.previousSibling = self.nextSibling = None
158
def _lastRecursiveChild(self):
159
"Finds the last element beneath this object to be parsed."
161
while hasattr(lastChild, 'contents') and lastChild.contents:
162
lastChild = lastChild.contents[-1]
165
def insert(self, position, newChild):
166
if (isinstance(newChild, basestring)
167
or isinstance(newChild, unicode)) \
168
and not isinstance(newChild, NavigableString):
169
newChild = NavigableString(newChild)
171
position = min(position, len(self.contents))
172
if hasattr(newChild, 'parent') and newChild.parent != None:
173
# We're 'inserting' an element that's already one
174
# of this object's children.
175
if newChild.parent == self:
176
index = self.find(newChild)
177
if index and index < position:
178
# Furthermore we're moving it further down the
179
# list of this object's children. That means that
180
# when we extract this element, our target index
181
# will jump down one.
182
position = position - 1
185
newChild.parent = self
188
newChild.previousSibling = None
189
newChild.previous = self
191
previousChild = self.contents[position-1]
192
newChild.previousSibling = previousChild
193
newChild.previousSibling.nextSibling = newChild
194
newChild.previous = previousChild._lastRecursiveChild()
195
if newChild.previous:
196
newChild.previous.next = newChild
198
newChildsLastElement = newChild._lastRecursiveChild()
200
if position >= len(self.contents):
201
newChild.nextSibling = None
204
parentsNextSibling = None
205
while not parentsNextSibling:
206
parentsNextSibling = parent.nextSibling
207
parent = parent.parent
208
if not parent: # This is the last element in the document.
210
if parentsNextSibling:
211
newChildsLastElement.next = parentsNextSibling
213
newChildsLastElement.next = None
215
nextChild = self.contents[position]
216
newChild.nextSibling = nextChild
217
if newChild.nextSibling:
218
newChild.nextSibling.previousSibling = newChild
219
newChildsLastElement.next = nextChild
221
if newChildsLastElement.next:
222
newChildsLastElement.next.previous = newChildsLastElement
223
self.contents.insert(position, newChild)
225
def append(self, tag):
    """Add *tag* as the last child of this tag.

    Equivalent to inserting at the end of self.contents.
    """
    end = len(self.contents)
    self.insert(end, tag)
229
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that
    occurs later in the document than this element."""
    finder = self.findAllNext
    return self._findOne(finder, name, attrs, text, **kwargs)
234
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
236
"""Returns all items that match the given criteria and appear
237
before after Tag in the document."""
238
return self._findAll(name, attrs, text, limit, self.nextGenerator)
240
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
241
"""Returns the closest sibling to this Tag that matches the
242
given criteria and appears after this Tag in the document."""
243
return self._findOne(self.findNextSiblings, name, attrs, text,
246
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
248
"""Returns the siblings of this Tag that match the given
249
criteria and appear after this Tag in the document."""
250
return self._findAll(name, attrs, text, limit,
251
self.nextSiblingGenerator, **kwargs)
252
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
254
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that
    occurs earlier in the document than this element."""
    finder = self.findAllPrevious
    return self._findOne(finder, name, attrs, text, **kwargs)
259
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
261
"""Returns all items that match the given criteria and appear
262
before this Tag in the document."""
263
return self._findAll(name, attrs, text, limit, self.previousGenerator,
265
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
267
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
268
"""Returns the closest sibling to this Tag that matches the
269
given criteria and appears before this Tag in the document."""
270
return self._findOne(self.findPreviousSiblings, name, attrs, text,
273
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return the siblings of this element that match the given
    criteria and precede it in the document."""
    gen = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, gen, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
281
def findParent(self, name=None, attrs={}, **kwargs):
282
"""Returns the closest parent of this Tag that matches the given
284
# NOTE: We can't use _findOne because findParents takes a different
287
l = self.findParents(name, attrs, 1)
292
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
293
"""Returns the parents of this Tag that match the given
296
return self._findAll(name, attrs, None, limit, self.parentGenerator,
298
fetchParents = findParents # Compatibility with pre-3.x
300
#These methods do the real heavy lifting.
302
def _findOne(self, method, name, attrs, text, **kwargs):
304
l = method(name, attrs, text, 1, **kwargs)
309
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
310
"Iterates over a generator looking for things that match."
312
if isinstance(name, SoupStrainer):
315
# Build a SoupStrainer
316
strainer = SoupStrainer(name, attrs, text, **kwargs)
317
results = ResultSet(strainer)
322
except StopIteration:
325
found = strainer.search(i)
327
results.append(found)
328
if limit and len(results) >= limit:
332
#These Generators can be used to navigate starting from both
333
#NavigableStrings and Tags.
334
def nextGenerator(self):
340
def nextSiblingGenerator(self):
346
def previousGenerator(self):
352
def previousSiblingGenerator(self):
355
i = i.previousSibling
358
def parentGenerator(self):
365
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in *str* with the
    given encoding name, defaulting to utf-8 when none is given."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
369
def toEncoding(self, s, encoding=None):
370
"""Encodes an object to a string in some encoding, or to Unicode.
372
if isinstance(s, unicode):
374
s = s.encode(encoding)
375
elif isinstance(s, str):
377
s = s.encode(encoding)
382
s = self.toEncoding(str(s), encoding)
387
class NavigableString(unicode, PageElement):
389
def __getnewargs__(self):
    """Support copy/pickle: rebuild this object from its plain
    string value."""
    value = NavigableString.__str__(self)
    return (value,)
392
def __getattr__(self, attr):
393
"""text.string gives you text. This is for backwards
394
compatibility for Navigable*String, but for CData* it lets you
395
get the string without the CData wrapper."""
399
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
401
def __unicode__(self):
    # Decode the encoded byte-string form back to Unicode using the
    # module default encoding. (Changed by Kovid.)
    encoded = str(self)
    return unicode(encoded, DEFAULT_OUTPUT_ENCODING)
404
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
406
return self.encode(encoding)
410
class CData(NavigableString):
    """A navigable string that renders wrapped in a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + text + "]]>"
415
class ProcessingInstruction(NavigableString):
416
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
418
if "%SOUP-ENCODING%" in output:
419
output = self.substituteEncoding(output, encoding)
420
return "<?%s?>" % self.toEncoding(output, encoding)
422
class Comment(NavigableString):
    """A navigable string that renders as an HTML/XML comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = NavigableString.__str__(self, encoding)
        return "<!--" + text + "-->"
426
class Declaration(NavigableString):
    """A navigable string that renders as an SGML declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = NavigableString.__str__(self, encoding)
        return "<!" + text + ">"
430
class Tag(PageElement):
432
"""Represents a found HTML tag with its attributes and contents."""
435
"Cheap function to invert a hash."
437
for k,v in h.items():
441
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
447
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
449
def _convertEntities(self, match):
450
"""Used in a call to re.sub to replace HTML, XML, and numeric
451
entities with the appropriate Unicode characters. If HTML
452
entities are being converted, any unrecognized entities are
455
if self.convertHTMLEntities and x in name2codepoint:
456
return unichr(name2codepoint[x])
457
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
458
if self.convertXMLEntities:
459
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
462
elif len(x) > 0 and x[0] == '#':
463
# Handle numeric entities
464
if len(x) > 1 and x[1] == 'x':
465
return unichr(int(x[2:], 16))
467
return unichr(int(x[1:]))
469
elif self.escapeUnrecognizedEntities:
470
return u'&%s;' % x
474
def __init__(self, parser, name, attrs=None, parent=None,
478
# We don't actually store the parser object: that lets extracted
479
# chunks be garbage-collected
480
self.parserClass = parser.__class__
481
self.isSelfClosing = parser.isSelfClosingTag(name)
487
self.setup(parent, previous)
489
self.containsSubstitutions = False
490
self.convertHTMLEntities = parser.convertHTMLEntities
491
self.convertXMLEntities = parser.convertXMLEntities
492
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
494
# Convert any HTML, XML, or numeric entities in the attribute values.
495
convert = lambda(k, val): (k,
496
re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
497
self._convertEntities,
499
self.attrs = map(convert, self.attrs)
501
def get(self, key, default=None):
502
"""Returns the value of the 'key' attribute for the tag, or
503
the value given for 'default' if it doesn't have that
505
return self._getAttrMap().get(key, default)
507
def has_key(self, key):
    """Return True if this tag has an attribute named *key*.

    Kept under its dict-like pre-3.x name for compatibility; the
    body uses the `in` operator because dict.has_key() is deprecated
    (and removed in Python 3) -- membership semantics are identical.
    """
    return key in self._getAttrMap()
510
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    raising KeyError (via the underlying dict) if it's absent."""
    attr_map = self._getAttrMap()
    return attr_map[key]
516
"Iterating over a tag iterates over its contents."
517
return iter(self.contents)
520
"The length of a tag is the length of its list of contents."
521
return len(self.contents)
523
def __contains__(self, x):
    """Support `x in tag`: membership in the direct contents list."""
    children = self.contents
    return x in children
526
def __nonzero__(self):
527
"A tag is non-None even if it has no contents."
530
def __setitem__(self, key, value):
531
"""Setting tag[key] sets the value of the 'key' attribute for the
534
self.attrMap[key] = value
536
for i in range(0, len(self.attrs)):
537
if self.attrs[i][0] == key:
538
self.attrs[i] = (key, value)
541
self.attrs.append((key, value))
542
self._getAttrMap()[key] = value
544
def __delitem__(self, key):
545
"Deleting tag[key] deletes all 'key' attributes for the tag."
546
for item in self.attrs:
548
self.attrs.remove(item)
549
#We don't break because bad HTML can define the same
550
#attribute multiple times.
552
if self.attrMap.has_key(key):
553
del self.attrMap[key]
555
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # Direct argument unpacking replaces the deprecated builtin
    # apply() (removed in Python 3); behavior is identical.
    return self.findAll(*args, **kwargs)
561
def __getattr__(self, tag):
    """Attribute access as tag search: soup.fooTag (old spelling) and
    soup.foo both return the first 'foo' tag found, except for names
    starting with '__', which raise AttributeError."""
    #print "Getattr %s.%s" % (self.__class__, tag)
    if len(tag) > 3 and tag.endswith('Tag'):
        # 'fooTag' is the unambiguous pre-3.x spelling of '.foo'.
        return self.find(tag[:-3])
    elif not tag.startswith('__'):
        return self.find(tag)
    # raise E(msg): the "raise E, msg" form is Python-2-only syntax.
    raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
569
def __eq__(self, other):
570
"""Returns true iff this tag has the same name, the same attributes,
571
and the same contents (recursively) as the given tag.
573
NOTE: right now this will return false if two tags have the
574
same attributes in a different order. Should this be fixed?"""
575
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
577
for i in range(0, len(self.contents)):
578
if self.contents[i] != other.contents[i]:
582
def __ne__(self, other):
    """True iff this tag differs from *other*, where sameness is
    whatever __eq__ defines."""
    return not (self == other)
587
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag and its contents as an encoded string."""
    return self.__str__(encoding)
591
def __unicode__(self):
    """Render this tag as Unicode (encoding=None selects Unicode
    output in __str__)."""
    return self.__str__(None)
594
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
595
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
598
def _sub_entity(self, x):
    """re.sub callback: map a matched XML special character to its
    entity reference."""
    char = x.group(0)[0]
    entity = self.XML_SPECIAL_CHARS_TO_ENTITIES[char]
    return "&%s;" % entity
603
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
604
prettyPrint=False, indentLevel=0):
605
"""Returns a string or Unicode representation of this tag and
606
its contents. To get Unicode, pass None for encoding.
608
NOTE: since Python's HTML parser consumes whitespace, this
609
method is not certain to reproduce the whitespace present in
610
the original string."""
612
encodedName = self.toEncoding(self.name, encoding)
616
for key, val in self.attrs:
619
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
620
val = self.substituteEncoding(val, encoding)
622
# The attribute value either:
624
# * Contains no embedded double quotes or single quotes.
625
# No problem: we enclose it in double quotes.
626
# * Contains embedded single quotes. No problem:
627
# double quotes work here too.
628
# * Contains embedded double quotes. No problem:
629
# we enclose it in single quotes.
630
# * Embeds both single _and_ double quotes. This
631
# can't happen naturally, but it can happen if
632
# you modify an attribute value after parsing
633
# the document. Now we have a bit of a
634
# problem. We solve it by enclosing the
635
# attribute in single quotes, and escaping any
636
# embedded single quotes to XML entities.
640
# TODO: replace with apos when
642
val = val.replace("'", "&squot;")
644
# Now we're okay w/r/t quotes. But the attribute
645
# value might also contain angle brackets, or
646
# ampersands that aren't part of entities. We need
647
# to escape those to XML entities too.
648
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
650
attrs.append(fmt % (self.toEncoding(key, encoding),
651
self.toEncoding(val, encoding)))
654
if self.isSelfClosing:
657
closeTag = '</%s>' % encodedName
659
indentTag, indentContents = 0, 0
661
indentTag = indentLevel
662
space = (' ' * (indentTag-1))
663
indentContents = indentTag + 1
664
contents = self.renderContents(encoding, prettyPrint, indentContents)
671
attributeString = ' ' + ' '.join(attrs)
674
s.append('<%s%s%s>' % (encodedName, attributeString, close))
678
if prettyPrint and contents and contents[-1] != "\n":
680
if prettyPrint and closeTag:
683
if prettyPrint and closeTag and self.nextSibling:
688
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag with pretty-printing (indentation) enabled."""
    pretty = True
    return self.__str__(encoding, pretty)
691
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
692
prettyPrint=False, indentLevel=0):
693
"""Renders the contents of this tag as a string in the given
694
encoding. If encoding is None, returns a Unicode string.."""
698
if isinstance(c, NavigableString):
699
text = c.__str__(encoding)
700
elif isinstance(c, Tag):
701
s.append(c.__str__(encoding, prettyPrint, indentLevel))
702
if text and prettyPrint:
706
s.append(" " * (indentLevel-1))
714
def find(self, name=None, attrs={}, recursive=True, text=None,
716
"""Return only the first child of this Tag matching the given
719
l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
725
def findAll(self, name=None, attrs={}, recursive=True, text=None,
726
limit=None, **kwargs):
727
"""Extracts a list of Tag objects that match the given
728
criteria. You can specify the name of the Tag and any
729
attributes you want the Tag to have.
731
The value of a key-value pair in the 'attrs' map can be a
732
string, a list of strings, a regular expression object, or a
733
callable that takes a string and returns whether or not the
734
string matches for some custom definition of 'matches'. The
735
same is true of the tag name."""
736
generator = self.recursiveChildGenerator
738
generator = self.childGenerator
739
return self._findAll(name, attrs, text, limit, generator, **kwargs)
740
findChildren = findAll
742
# Pre-3.x compatibility methods
746
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x alias: collect matching text nodes via findAll."""
    criteria = {'text': text, 'recursive': recursive, 'limit': limit}
    return self.findAll(**criteria)
749
def firstText(self, text=None, recursive=True):
    """Pre-3.x alias: return the first matching text node via find."""
    criteria = {'text': text, 'recursive': recursive}
    return self.find(**criteria)
754
def _getAttrMap(self):
755
"""Initializes a map representation of this tag's attributes,
756
if not already initialized."""
757
if not getattr(self, 'attrMap'):
759
for (key, value) in self.attrs:
760
self.attrMap[key] = value
764
def childGenerator(self):
    """Yield each direct child of this tag, in document order."""
    # Iterate the contents list directly instead of indexing it with
    # a manual range() loop -- same traversal, idiomatic and clearer.
    for child in self.contents:
        yield child
769
def recursiveChildGenerator(self):
772
tag, start = stack.pop()
773
if isinstance(tag, Tag):
774
for i in range(start, len(tag.contents)):
777
if isinstance(a, Tag) and tag.contents:
778
if i < len(tag.contents) - 1:
779
stack.append((tag, i+1))
784
# Next, a couple classes to represent queries and their results.
786
"""Encapsulates a number of ways of matching a markup element (tag or
789
def __init__(self, name=None, attrs={}, text=None, **kwargs):
792
kwargs['class'] = attrs
807
return "%s|%s" % (self.name, self.attrs)
809
def searchTag(self, markupName=None, markupAttrs={}):
812
if isinstance(markupName, Tag):
815
callFunctionWithTagData = callable(self.name) \
816
and not isinstance(markupName, Tag)
819
or callFunctionWithTagData \
820
or (markup and self._matches(markup, self.name)) \
821
or (not markup and self._matches(markupName, self.name)):
822
if callFunctionWithTagData:
823
match = self.name(markupName, markupAttrs)
827
for attr, matchAgainst in self.attrs.items():
828
if not markupAttrMap:
829
if hasattr(markupAttrs, 'get'):
830
markupAttrMap = markupAttrs
833
for k,v in markupAttrs:
835
attrValue = markupAttrMap.get(attr)
836
if not self._matches(attrValue, matchAgainst):
846
def search(self, markup):
847
#print 'looking for %s in %s' % (self, markup)
849
# If given a list of items, scan it for a text element that
851
if isList(markup) and not isinstance(markup, Tag):
852
for element in markup:
853
if isinstance(element, NavigableString) \
854
and self.search(element):
857
# If it's a Tag, make sure its name or attributes match.
858
# Don't bother with Tags if we're searching for text.
859
elif isinstance(markup, Tag):
861
found = self.searchTag(markup)
862
# If it's text, make sure the text matches.
863
elif isinstance(markup, NavigableString) or \
865
if self._matches(markup, self.text):
868
raise Exception, "I don't know how to match against a %s" \
872
def _matches(self, markup, matchAgainst):
873
#print "Matching %s against %s" % (markup, matchAgainst)
875
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
876
result = markup != None
877
elif callable(matchAgainst):
878
result = matchAgainst(markup)
880
#Custom match methods take the tag as an argument, but all
881
#other ways of matching match the tag name as a string.
882
if isinstance(markup, Tag):
884
if markup and not isString(markup):
885
markup = unicode(markup)
886
#Now we know that chunk is either a string, or None.
887
if hasattr(matchAgainst, 'match'):
888
# It's a regexp object.
889
result = markup and matchAgainst.search(markup)
890
elif isList(matchAgainst):
891
result = markup in matchAgainst
892
elif hasattr(matchAgainst, 'items'):
893
result = markup.has_key(matchAgainst)
894
elif matchAgainst and isString(markup):
895
if isinstance(markup, unicode):
896
matchAgainst = unicode(matchAgainst)
898
matchAgainst = str(matchAgainst)
901
result = matchAgainst == markup
904
class ResultSet(list):
905
"""A ResultSet is just a list that keeps track of the SoupStrainer
907
def __init__(self, source):
911
# Now, some helper functions.
914
"""Convenience method that works with all 2.x versions of Python
915
to determine whether or not something is listlike."""
916
return hasattr(l, '__iter__') \
917
or (type(l) in (types.ListType, types.TupleType))
920
"""Convenience method that works with all 2.x versions of Python
921
to determine whether or not something is stringlike."""
923
return isinstance(s, unicode) or isinstance(s, basestring)
925
return isinstance(s, str)
927
def buildTagMap(default, *args):
928
"""Turns a list of maps, lists, or scalars into a single map.
929
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
930
NESTING_RESET_TAGS maps out of lists and partial maps."""
933
if hasattr(portion, 'items'):
934
#It's a map. Merge it.
935
for k,v in portion.items():
937
elif isList(portion):
938
#It's a list. Map each item to the default.
942
#It's a scalar. Map it to the default.
943
built[portion] = default
946
# Now, the parser classes.
948
class BeautifulStoneSoup(Tag, SGMLParser):
950
"""This class contains the basic parser and search code. It defines
951
a parser that knows nothing about tag behavior except for the
954
You can't close a tag without closing all the tags it encloses.
955
That is, "<foo><bar></foo>" actually means
956
"<foo><bar></bar></foo>".
958
[Another possible explanation is "<foo><bar /></foo>", but since
959
this class defines no SELF_CLOSING_TAGS, it will never use that
962
This class is useful for parsing XML or made-up markup languages,
963
or when BeautifulSoup makes an assumption counter to what you were
966
SELF_CLOSING_TAGS = {}
968
RESET_NESTING_TAGS = {}
971
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
972
lambda x: x.group(1) + ' />'),
973
(re.compile('<!\s+([^<>]*)>'),
974
lambda x: '<!' + x.group(1) + '>')
977
ROOT_TAG_NAME = u'[document]'
979
HTML_ENTITIES = "html"
981
XHTML_ENTITIES = "xhtml"
982
# TODO: This only exists for backwards-compatibility
983
ALL_ENTITIES = XHTML_ENTITIES
985
# Used when determining whether a text node is all whitespace and
986
# can be replaced with a single space. A text node that contains
987
# fancy Unicode spaces (usually non-breaking) should be left
989
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
991
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
992
markupMassage=True, smartQuotesTo=XML_ENTITIES,
993
convertEntities=None, selfClosingTags=None):
994
"""The Soup object is initialized as the 'root tag', and the
995
provided markup (which can be a string or a file-like object)
996
is fed into the underlying parser.
998
sgmllib will process most bad HTML, and the BeautifulSoup
999
class has some tricks for dealing with some HTML that kills
1000
sgmllib, but Beautiful Soup can nonetheless choke or lose data
1001
if your data uses self-closing tags or declarations
1004
By default, Beautiful Soup uses regexes to sanitize input,
1005
avoiding the vast majority of these problems. If the problems
1006
don't apply to you, pass in False for markupMassage, and
1007
you'll get better performance.
1009
The default parser massage techniques fix the two most common
1010
instances of invalid HTML that choke sgmllib:
1012
<br/> (No space between name of closing tag and tag close)
1013
<! --Comment--> (Extraneous whitespace in declaration)
1015
You can pass in a custom list of (RE object, replace method)
1016
tuples to get Beautiful Soup to scrub your input the way you
1019
self.parseOnlyThese = parseOnlyThese
1020
self.fromEncoding = fromEncoding
1021
self.smartQuotesTo = smartQuotesTo
1022
self.convertEntities = convertEntities
1023
# Set the rules for how we'll deal with the entities we
1025
if self.convertEntities:
1026
# It doesn't make sense to convert encoded characters to
1027
# entities even while you're converting entities to Unicode.
1028
# Just convert it all to Unicode.
1029
self.smartQuotesTo = None
1030
if convertEntities == self.HTML_ENTITIES:
1031
self.convertXMLEntities = False
1032
self.convertHTMLEntities = True
1033
self.escapeUnrecognizedEntities = True
1034
elif convertEntities == self.XHTML_ENTITIES:
1035
self.convertXMLEntities = True
1036
self.convertHTMLEntities = True
1037
self.escapeUnrecognizedEntities = False
1038
elif convertEntities == self.XML_ENTITIES:
1039
self.convertXMLEntities = True
1040
self.convertHTMLEntities = False
1041
self.escapeUnrecognizedEntities = False
1043
self.convertXMLEntities = False
1044
self.convertHTMLEntities = False
1045
self.escapeUnrecognizedEntities = False
1047
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1048
SGMLParser.__init__(self)
1050
if hasattr(markup, 'read'): # It's a file-type object.
1051
markup = markup.read()
1052
self.markup = markup
1053
self.markupMassage = markupMassage
1058
self.markup = None # The markup can now be GCed
1060
def convert_charref(self, name):
1061
"""This method fixes a bug in Python's SGMLParser."""
1066
if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1068
return self.convert_codepoint(n)
1070
def _feed(self, inDocumentEncoding=None):
1071
# Convert the document to Unicode.
1072
markup = self.markup
1073
if isinstance(markup, unicode):
1074
if not hasattr(self, 'originalEncoding'):
1075
self.originalEncoding = None
1077
# Changed detection by Kovid
1078
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
1080
if self.markupMassage:
1081
if not isList(self.markupMassage):
1082
self.markupMassage = self.MARKUP_MASSAGE
1083
for fix, m in self.markupMassage:
1084
markup = fix.sub(m, markup)
1085
# TODO: We get rid of markupMassage so that the
1086
# soup object can be deepcopied later on. Some
1087
# Python installations can't copy regexes. If anyone
1088
# was relying on the existence of markupMassage, this
1089
# might cause problems.
1090
del(self.markupMassage)
1091
self.markup = markup
1094
SGMLParser.feed(self, markup)
1095
# Close out any unfinished strings and close all the open tags.
1097
while self.currentTag.name != self.ROOT_TAG_NAME:
1100
def __getattr__(self, methodName):
1101
"""This method routes method call requests to either the SGMLParser
1102
superclass or the Tag superclass, depending on the method name."""
1103
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1105
if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1106
or methodName.find('do_') == 0:
1107
return SGMLParser.__getattr__(self, methodName)
1108
elif methodName.find('__') != 0:
1109
return Tag.__getattr__(self, methodName)
1111
raise AttributeError
1113
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser."""
    # 'in' replaces the deprecated dict.has_key(); same truth value,
    # and checks the class-wide table first, then per-instance ones.
    return name in self.SELF_CLOSING_TAGS \
        or name in self.instanceSelfClosingTags
1120
Tag.__init__(self, self, self.ROOT_TAG_NAME)
1122
SGMLParser.reset(self)
1123
self.currentData = []
1124
self.currentTag = None
1126
self.quoteStack = []
1130
tag = self.tagStack.pop()
1131
# Tags with just one string-owning child get the child as a
1132
# 'string' property, so that soup.tag.string is shorthand for
1133
# soup.tag.contents[0]
1134
if len(self.currentTag.contents) == 1 and \
1135
isinstance(self.currentTag.contents[0], NavigableString):
1136
self.currentTag.string = self.currentTag.contents[0]
1138
#print "Pop", tag.name
1140
self.currentTag = self.tagStack[-1]
1141
return self.currentTag
1143
def pushTag(self, tag):
1144
#print "Push", tag.name
1146
self.currentTag.contents.append(tag)
1147
self.tagStack.append(tag)
1148
self.currentTag = self.tagStack[-1]
1150
def endData(self, containerClass=NavigableString):
1151
if self.currentData:
1152
currentData = ''.join(self.currentData)
1153
if not currentData.translate(self.STRIP_ASCII_SPACES):
1154
if '\n' in currentData:
1158
self.currentData = []
1159
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1160
(not self.parseOnlyThese.text or \
1161
not self.parseOnlyThese.search(currentData)):
1163
o = containerClass(currentData)
1164
o.setup(self.currentTag, self.previous)
1166
self.previous.next = o
1168
self.currentTag.contents.append(o)
1171
def _popToTag(self, name, inclusivePop=True):
1172
"""Pops the tag stack up to and including the most recent
1173
instance of the given tag. If inclusivePop is false, pops the tag
1174
stack up to but *not* including the most recent instqance of
1176
#print "Popping to %s" % name
1177
if name == self.ROOT_TAG_NAME:
1181
mostRecentTag = None
1182
for i in range(len(self.tagStack)-1, 0, -1):
1183
if name == self.tagStack[i].name:
1184
numPops = len(self.tagStack)-i
1186
if not inclusivePop:
1187
numPops = numPops - 1
1189
for i in range(0, numPops):
1190
mostRecentTag = self.popTag()
1191
return mostRecentTag
1193
def _smartPop(self, name):
1195
"""We need to pop up to the previous tag of this type, unless
1196
one of this tag's nesting reset triggers comes between this
1197
tag and the previous tag of this type, OR unless this tag is a
1198
generic nesting trigger and another generic nesting trigger
1199
comes between this tag and the previous tag of this type.
1202
<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1203
<p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1204
<p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1206
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1207
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1208
<td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1211
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1212
isNestable = nestingResetTriggers != None
1213
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1216
for i in range(len(self.tagStack)-1, 0, -1):
1217
p = self.tagStack[i]
1218
if (not p or p.name == name) and not isNestable:
1219
#Non-nestable tags get popped to the top or to their
1223
if (nestingResetTriggers != None
1224
and p.name in nestingResetTriggers) \
1225
or (nestingResetTriggers == None and isResetNesting
1226
and self.RESET_NESTING_TAGS.has_key(p.name)):
1228
#If we encounter one of the nesting reset triggers
1229
#peculiar to this tag, or we encounter another tag
1230
#that causes nesting to reset, pop up to but not
1231
#including that tag.
1237
self._popToTag(popTo, inclusive)
1239
def unknown_starttag(self, name, attrs, selfClosing=0):
1240
#print "Start tag %s: %s" % (name, attrs)
1242
#This is not a real tag.
1243
#print "<%s> is not real!" % name
1244
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1245
self.handle_data('<%s%s>' % (name, attrs))
1249
if not self.isSelfClosingTag(name) and not selfClosing:
1250
self._smartPop(name)
1252
if self.parseOnlyThese and len(self.tagStack) <= 1 \
1253
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1256
tag = Tag(self, name, attrs, self.currentTag, self.previous)
1258
self.previous.next = tag
1261
if selfClosing or self.isSelfClosingTag(name):
1263
if name in self.QUOTE_TAGS:
1264
#print "Beginning quote (%s)" % name
1265
self.quoteStack.append(name)
1269
def unknown_endtag(self, name):
1270
#print "End tag %s" % name
1271
if self.quoteStack and self.quoteStack[-1] != name:
1272
#This is not a real end tag.
1273
#print "</%s> is not real!" % name
1274
self.handle_data('</%s>' % name)
1277
self._popToTag(name)
1278
if self.quoteStack and self.quoteStack[-1] == name:
1279
self.quoteStack.pop()
1280
self.literal = (len(self.quoteStack) > 0)
1282
def handle_data(self, data):
    """Called by the parser with a run of character data; buffered in
    self.currentData until endData() joins and flushes it."""
    self.currentData.append(data)
1285
def _toStringSubclass(self, text, subclass):
1286
"""Adds a certain piece of text to the tree as a NavigableString
1289
self.handle_data(text)
1290
self.endData(subclass)
1292
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later."""
    if text[:3] == "xml":
        # Rewrite the XML declaration so the eventual output encoding
        # can be substituted in at render time.
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
1300
def handle_comment(self, text):
    "Handle comments as Comment objects."
    self._toStringSubclass(text, Comment)
1304
def handle_charref(self, ref):
    "Handle character references as data."
    if self.convertEntities:
        if ref.lower().startswith('x'):
            # Hex numeric entity (&#x...;). Added by Kovid.
            ref = int(ref[1:], 16)
        data = unichr(int(ref))
    else:
        # Not converting entities: pass the reference through verbatim.
        data = '&#%s;' % ref
    self.handle_data(data)
1314
def handle_entityref(self, ref):
1315
"""Handle entity references as data, possibly converting known
1316
HTML and/or XML entity references to the corresponding Unicode
1319
if self.convertHTMLEntities:
1321
data = unichr(name2codepoint[ref])
1325
if not data and self.convertXMLEntities:
1326
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1328
if not data and self.convertHTMLEntities and \
1329
not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1330
# TODO: We've got a problem here. We're told this is
1331
# an entity reference, but it's not an XML entity
1332
# reference or an HTML entity reference. Nonetheless,
1333
# the logical thing to do is to pass it through as an
1334
# unrecognized entity reference.
1336
# Except: when the input is "&carol;" this function
1337
# will be called with input "carol". When the input is
1338
# "AT&T", this function will be called with input
1339
# "T". We have no way of knowing whether a semicolon
1340
# was present originally, so we don't know whether
1341
# this is an unknown entity or just a misplaced
1344
# The more common case is a misplaced ampersand, so I
1345
# escape the ampersand and omit the trailing semicolon.
1346
data = "&%s" % ref
1348
# This case is different from the one above, because we
1349
# haven't already gone through a supposedly comprehensive
1350
# mapping of entities to Unicode characters. We might not
1351
# have gone through any mapping at all. So the chances are
1352
# very high that this is a real entity, and not a
1353
# misplaced ampersand.
1355
self.handle_data(data)
1357
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    self._toStringSubclass(data, Declaration)
1361
def parse_declaration(self, i):
1362
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
1363
declaration as a CData object."""
1365
if self.rawdata[i:i+9] == '<![CDATA[':
1366
k = self.rawdata.find(']]>', i)
1368
k = len(self.rawdata)
1369
data = self.rawdata[i+9:k]
1371
self._toStringSubclass(data, CData)
1374
j = SGMLParser.parse_declaration(self, i)
1375
except SGMLParseError:
1376
toHandle = self.rawdata[i:]
1377
self.handle_data(toHandle)
1378
j = i + len(toHandle)
1381
class BeautifulSoup(BeautifulStoneSoup):
1383
"""This parser knows the following facts about HTML:
1385
* Some tags have no closing tag and should be interpreted as being
1386
closed as soon as they are encountered.
1388
* The text inside some tags (ie. 'script') may contain tags which
1389
are not really part of the document and which should be parsed
1390
as text, not tags. If you want to parse the text as tags, you can
1391
always fetch it and parse it explicitly.
1393
* Tag nesting rules:
1395
Most tags can't be nested at all. For instance, the occurance of
1396
a <p> tag should implicitly close the previous <p> tag.
1399
should be transformed into:
1400
<p>Para1</p><p>Para2
1402
Some tags can be nested arbitrarily. For instance, the occurance
1403
of a <blockquote> tag should _not_ implicitly close the previous
1406
Alice said: <blockquote>Bob said: <blockquote>Blah
1407
should NOT be transformed into:
1408
Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1410
Some tags can be nested, but the nesting is reset by the
1411
interposition of other tags. For instance, a <tr> tag should
1412
implicitly close the previous <tr> tag within the same <table>,
1413
but not close a <tr> tag in another table.
1415
<table><tr>Blah<tr>Blah
1416
should be transformed into:
1417
<table><tr>Blah</tr><tr>Blah
1419
<tr>Blah<table><tr>Blah
1420
should NOT be transformed into
1421
<tr>Blah<table></tr><tr>Blah
1423
Differing assumptions about tag nesting rules are a major source
1424
of problems with the BeautifulSoup class. If BeautifulSoup is not
1425
treating as nestable a tag your page author treats as nestable,
1426
try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1427
BeautifulStoneSoup before writing your own subclass."""
1429
def __init__(self, *args, **kwargs):
    """Like BeautifulStoneSoup.__init__, but smartQuotesTo defaults to
    HTML entity conversion, which is appropriate for HTML documents."""
    # 'not in' replaces the deprecated dict.has_key(); caller-supplied
    # values still win.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
1434
SELF_CLOSING_TAGS = buildTagMap(None,
1435
['br' , 'hr', 'input', 'img', 'meta',
1436
'spacer', 'link', 'frame', 'base'])
1438
QUOTE_TAGS = {'script' : None, 'textarea' : None}
1440
#According to the HTML standard, each of these inline tags can
1441
#contain another tag of the same type. Furthermore, it's common
1442
#to actually use these tags this way.
1443
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1446
#According to the HTML standard, these block tags can contain
1447
#another tag of the same type. Furthermore, it's common
1448
#to actually use these tags this way.
1449
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1451
#Lists can contain other lists, but there are restrictions.
1452
NESTABLE_LIST_TAGS = { 'ol' : [],
1454
'li' : ['ul', 'ol'],
1459
#Tables can contain other tables, but there are restrictions.
1460
NESTABLE_TABLE_TAGS = {'table' : [],
1461
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1464
'thead' : ['table'],
1465
'tbody' : ['table'],
1466
'tfoot' : ['table'],
1469
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1471
#If one of these tags is encountered, all tags up to the next tag of
1472
#this type are popped.
1473
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1474
NON_NESTABLE_BLOCK_TAGS,
1476
NESTABLE_TABLE_TAGS)
1478
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1479
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1481
# Used to detect the charset in a META tag; see start_meta
1482
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1484
def start_meta(self, attrs):
1485
"""Beautiful Soup can detect a charset included in a META tag,
1486
try to convert the document to that charset, and re-parse the
1487
document from the beginning."""
1490
contentTypeIndex = None
1491
tagNeedsEncodingSubstitution = False
1493
for i in range(0, len(attrs)):
1494
key, value = attrs[i]
1496
if key == 'http-equiv':
1498
elif key == 'content':
1500
contentTypeIndex = i
1502
if httpEquiv and contentType: # It's an interesting meta tag.
1503
match = self.CHARSET_RE.search(contentType)
1505
if getattr(self, 'declaredHTMLEncoding') or \
1506
(self.originalEncoding == self.fromEncoding):
1507
# This is our second pass through the document, or
1508
# else an encoding was specified explicitly and it
1509
# worked. Rewrite the meta tag.
1510
newAttr = self.CHARSET_RE.sub\
1511
(lambda(match):match.group(1) +
1512
"%SOUP-ENCODING%", value)
1513
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1515
tagNeedsEncodingSubstitution = True
1517
# This is our first pass through the document.
1518
# Go through it again with the new information.
1519
newCharset = match.group(3)
1520
if newCharset and newCharset != self.originalEncoding:
1521
self.declaredHTMLEncoding = newCharset
1522
self._feed(self.declaredHTMLEncoding)
1524
tag = self.unknown_starttag("meta", attrs)
1525
if tag and tagNeedsEncodingSubstitution:
1526
tag.containsSubstitutions = True
1528
class StopParsing(Exception):
1531
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1533
"""The BeautifulSoup class is oriented towards skipping over
1534
common HTML errors like unclosed tags. However, sometimes it makes
1535
errors of its own. For instance, consider this fragment:
1537
<b>Foo<b>Bar</b></b>
1539
This is perfectly valid (if bizarre) HTML. However, the
1540
BeautifulSoup class will implicitly close the first b tag when it
1541
encounters the second 'b'. It will think the author wrote
1542
"<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1543
there's no real-world reason to bold something that's already
1544
bold. When it encounters '</b></b>' it will close two more 'b'
1545
tags, for a grand total of three tags closed instead of two. This
1546
can throw off the rest of your document structure. The same is
1547
true of a number of other tags, listed below.
1549
It's much more common for someone to forget to close a 'b' tag
1550
than to actually use nested 'b' tags, and the BeautifulSoup class
1551
handles the common case. This class handles the not-co-common
1552
case: where you can't believe someone wrote what they did, but
1553
it's valid HTML and BeautifulSoup screwed up by assuming it
1556
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1557
['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1558
'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1561
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1563
NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1564
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1565
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1567
class MinimalSoup(BeautifulSoup):
1568
"""The MinimalSoup class is for parsing HTML that contains
1569
pathologically bad markup. It makes no assumptions about tag
1570
nesting, but it does know which tags are self-closing, that
1571
<script> tags contain Javascript and should not be parsed, that
1572
META tags may contain encoding information, and so on.
1574
This also makes it better for subclassing than BeautifulStoneSoup
1575
or BeautifulSoup."""
1577
RESET_NESTING_TAGS = buildTagMap('noscript')
1580
class BeautifulSOAP(BeautifulStoneSoup):
1581
"""This class will push a tag with only a single string child into
1582
the tag's parent as an attribute. The attribute's name is the tag
1583
name, and the value is the string child. An example should give
1584
the flavor of the change:
1586
<foo><bar>baz</bar></foo>
1588
<foo bar="baz"><bar>baz</bar></foo>
1590
You can then access fooTag['bar'] instead of fooTag.barTag.string.
1592
This is, of course, useful for scraping structures that tend to
1593
use subelements instead of attributes, such as SOAP messages. Note
1594
that it modifies its input, so don't print the modified version
1597
I'm not sure how many people really want to use this class; let me
1598
know if you do. Mainly I like the name."""
1601
if len(self.tagStack) > 1:
1602
tag = self.tagStack[-1]
1603
parent = self.tagStack[-2]
1604
parent._getAttrMap()
1605
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1606
isinstance(tag.contents[0], NavigableString) and
1607
not parent.attrMap.has_key(tag.name)):
1608
parent[tag.name] = tag.contents[0]
1609
BeautifulStoneSoup.popTag(self)
1611
#Enterprise class names! It has come to our attention that some people
1612
#think the names of the Beautiful Soup parser classes are too silly
1613
#and "unprofessional" for use in enterprise screen-scraping. We feel
1614
#your pain! For such-minded folk, the Beautiful Soup Consortium And
1615
#All-Night Kosher Bakery recommends renaming this file to
1616
#"RobustParser.py" (or, in cases of extreme enterprisiness,
1617
#"RobustParserBeanInterface.class") and using the following
1618
#enterprise-friendly class aliases:
1619
class RobustXMLParser(BeautifulStoneSoup):
1621
class RobustHTMLParser(BeautifulSoup):
1623
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1625
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1627
class SimplifyingSOAPParser(BeautifulSOAP):
1630
######################################################
1632
# Bonus library: Unicode, Dammit
1634
# This class forces XML data into a standard format (usually to UTF-8
1635
# or Unicode). It is heavily based on code from Mark Pilgrim's
1636
# Universal Feed Parser. It does not rewrite the XML or HTML to
1637
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1638
# (XML) and BeautifulSoup.start_meta (HTML).
1640
# Autodetects character encodings.
1641
# Download from http://chardet.feedparser.org/
1642
import calibre.ebooks.chardet as chardet
1644
class UnicodeDammit:
1645
"""A class for detecting the encoding of a *ML document and
1646
converting it to a Unicode string. If the source encoding is
1647
windows-1252, can replace MS smart quotes with their HTML or XML
1650
# This dictionary maps commonly seen values for "charset" in HTML
1651
# meta tags to the corresponding Python codec names. It only covers
1652
# values that aren't in Python's aliases and can't be determined
1653
# by the heuristics in find_codec.
1654
CHARSET_ALIASES = { "macintosh" : "mac-roman",
1655
"x-sjis" : "shift-jis" }
1657
def __init__(self, markup, overrideEncodings=[],
1658
smartQuotesTo='xml'):
1659
self.markup, documentEncoding, sniffedEncoding = \
1660
self._detectEncoding(markup)
1661
self.smartQuotesTo = smartQuotesTo
1662
self.triedEncodings = []
1664
if markup == '' or isinstance(markup, unicode):
1665
self.originalEncoding = None
1666
self.unicode = unicode(markup)
1670
for proposedEncoding in overrideEncodings:
1671
u = self._convertFrom(proposedEncoding)
1674
for proposedEncoding in (documentEncoding, sniffedEncoding):
1675
u = self._convertFrom(proposedEncoding)
1678
# If no luck and we have auto-detection library, try that:
1679
if not u and chardet and not isinstance(self.markup, unicode):
1680
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1682
# As a last resort, try utf-8 and windows-1252:
1684
for proposed_encoding in ("utf-8", "windows-1252"):
1685
u = self._convertFrom(proposed_encoding)
1688
if not u: self.originalEncoding = None
1690
def _subMSChar(self, orig):
1691
"""Changes a MS smart quote character to an XML or HTML
1693
sub = self.MS_CHARS.get(orig)
1694
if type(sub) == types.TupleType:
1695
if self.smartQuotesTo == 'xml':
1696
sub = '&#x%s;' % sub[1]
1698
sub = '&%s;' % sub[0]
1701
def _convertFrom(self, proposed):
1702
proposed = self.find_codec(proposed)
1703
if not proposed or proposed in self.triedEncodings:
1705
self.triedEncodings.append(proposed)
1706
markup = self.markup
1708
# Convert smart quotes to HTML if coming from an encoding
1709
# that might have them.
1710
if self.smartQuotesTo and proposed.lower() in("windows-1252",
1713
markup = re.compile("([\x80-\x9f])").sub \
1714
(lambda(x): self._subMSChar(x.group(1)),
1718
# print "Trying to convert document to %s" % proposed
1719
u = self._toUnicode(markup, proposed)
1721
self.originalEncoding = proposed
1722
except Exception, e:
1723
#print "That didn't work!"
1726
#print "Correct encoding: %s" % proposed
1729
def _toUnicode(self, data, encoding):
1730
'''Given a string and its encoding, decodes the string into Unicode.
1731
%encoding is a string recognized by encodings.aliases'''
1733
# strip Byte Order Mark (if present)
1734
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1735
and (data[2:4] != '\x00\x00'):
1736
encoding = 'utf-16be'
1738
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1739
and (data[2:4] != '\x00\x00'):
1740
encoding = 'utf-16le'
1742
elif data[:3] == '\xef\xbb\xbf':
1745
elif data[:4] == '\x00\x00\xfe\xff':
1746
encoding = 'utf-32be'
1748
elif data[:4] == '\xff\xfe\x00\x00':
1749
encoding = 'utf-32le'
1752
newdata = unicode(data, encoding)
1756
def _detectEncoding(self, xml_data):
1757
"""Given a document, tries to detect its XML encoding."""
1758
xml_encoding = sniffed_xml_encoding = None
1760
if xml_data[:4] == '\x4c\x6f\xa7\x94':
1762
xml_data = self._ebcdic_to_ascii(xml_data)
1764
# By Kovid commented out all the recoding to UTF-8 of UTF-16 and UTF-32
1765
# as this doesn't make sense and doesn't work for the test case
1766
# BeautifulSoup.UnicodeDammit(u'abcd'.encode('utf-16')).unicode
1767
elif xml_data[:4] == '\x00\x3c\x00\x3f':
1769
sniffed_xml_encoding = 'utf-16be'
1770
#xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1771
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1772
and (xml_data[2:4] != '\x00\x00'):
1774
sniffed_xml_encoding = 'utf-16be'
1775
#xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1776
elif xml_data[:4] == '\x3c\x00\x3f\x00':
1778
sniffed_xml_encoding = 'utf-16le'
1779
#xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1780
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1781
(xml_data[2:4] != '\x00\x00'):
1783
sniffed_xml_encoding = 'utf-16le'
1784
#xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1785
elif xml_data[:4] == '\x00\x00\x00\x3c':
1787
sniffed_xml_encoding = 'utf-32be'
1788
#xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1789
elif xml_data[:4] == '\x3c\x00\x00\x00':
1791
sniffed_xml_encoding = 'utf-32le'
1792
#xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1793
elif xml_data[:4] == '\x00\x00\xfe\xff':
1795
sniffed_xml_encoding = 'utf-32be'
1796
#xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1797
elif xml_data[:4] == '\xff\xfe\x00\x00':
1799
sniffed_xml_encoding = 'utf-32le'
1800
#xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1801
elif xml_data[:3] == '\xef\xbb\xbf':
1803
sniffed_xml_encoding = 'utf-8'
1804
#xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1806
sniffed_xml_encoding = 'ascii'
1808
xml_encoding_match = re.compile \
1809
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1811
if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
1812
xml_encoding_match = re.compile(r'<meta.*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
1814
xml_encoding_match = None
1815
if xml_encoding_match:
1816
xml_encoding = xml_encoding_match.groups()[0].lower()
1818
if sniffed_xml_encoding and \
1819
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1820
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1821
'utf-16', 'utf-32', 'utf_16', 'utf_32',
1823
xml_encoding = sniffed_xml_encoding
1825
return xml_data, xml_encoding, sniffed_xml_encoding
1828
def find_codec(self, charset):
1829
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1830
or (charset and self._codec(charset.replace("-", ""))) \
1831
or (charset and self._codec(charset.replace("-", "_"))) \
1834
def _codec(self, charset):
1835
if not charset: return charset
1838
codecs.lookup(charset)
1840
except (LookupError, ValueError):
1844
EBCDIC_TO_ASCII_MAP = None
1845
def _ebcdic_to_ascii(self, s):
1847
if not c.EBCDIC_TO_ASCII_MAP:
1848
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1849
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1850
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1851
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1852
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1853
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1854
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1855
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1856
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1857
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1858
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1859
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1860
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1861
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1862
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1863
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1864
250,251,252,253,254,255)
1866
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1867
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1868
return s.translate(c.EBCDIC_TO_ASCII_MAP)
1870
MS_CHARS = { '\x80' : ('euro', '20AC'),
1872
'\x82' : ('sbquo', '201A'),
1873
'\x83' : ('fnof', '192'),
1874
'\x84' : ('bdquo', '201E'),
1875
'\x85' : ('hellip', '2026'),
1876
'\x86' : ('dagger', '2020'),
1877
'\x87' : ('Dagger', '2021'),
1878
'\x88' : ('circ', '2C6'),
1879
'\x89' : ('permil', '2030'),
1880
'\x8A' : ('Scaron', '160'),
1881
'\x8B' : ('lsaquo', '2039'),
1882
'\x8C' : ('OElig', '152'),
1884
'\x8E' : ('#x17D', '17D'),
1887
'\x91' : ('lsquo', '2018'),
1888
'\x92' : ('rsquo', '2019'),
1889
'\x93' : ('ldquo', '201C'),
1890
'\x94' : ('rdquo', '201D'),
1891
'\x95' : ('bull', '2022'),
1892
'\x96' : ('ndash', '2013'),
1893
'\x97' : ('mdash', '2014'),
1894
'\x98' : ('tilde', '2DC'),
1895
'\x99' : ('trade', '2122'),
1896
'\x9a' : ('scaron', '161'),
1897
'\x9b' : ('rsaquo', '203A'),
1898
'\x9c' : ('oelig', '153'),
1900
'\x9e' : ('#x17E', '17E'),
1901
'\x9f' : ('Yuml', ''),}
1903
#######################################################################
1906
#By default, act as an HTML pretty-printer.
1907
if __name__ == '__main__':
1909
soup = BeautifulSoup(sys.stdin.read())
1910
print soup.prettify()