~leonardr/beautifulsoup/3.2

« back to all changes in this revision

Viewing changes to BeautifulSoup.py

  • Committer: Leonard Richardson
  • Date: 2012-02-16 13:13:20 UTC
  • Revision ID: leonard.richardson@canonical.com-20120216131320-77f51kgu9sd27av3
Substitute XML entities for bare ampersands and angle brackets within strings, not just within attribute values.

Show diffs side-by-side

added

removed

Lines of Context:
114
114
    """Contains the navigational information for some part of the page
115
115
    (either a tag or a piece of text)"""
116
116
 
 
117
    def _invert(h):
 
118
        "Cheap function to invert a hash."
 
119
        i = {}
 
120
        for k,v in h.items():
 
121
            i[v] = k
 
122
        return i
 
123
 
 
124
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
 
125
                                      "quot" : '"',
 
126
                                      "amp" : "&",
 
127
                                      "lt" : "<",
 
128
                                      "gt" : ">" }
 
129
 
 
130
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
 
131
 
117
132
    def setup(self, parent=None, previous=None):
118
133
        """Sets up the initial relations between this element and
119
134
        other elements."""
421
436
                s = unicode(s)
422
437
        return s
423
438
 
 
439
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
 
440
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
 
441
                                           + ")")
 
442
 
 
443
    def _sub_entity(self, x):
 
444
        """Used with a regular expression to substitute the
 
445
        appropriate XML entity for an XML special character."""
 
446
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
 
447
 
 
448
 
424
449
class NavigableString(unicode, PageElement):
425
450
 
426
451
    def __new__(cls, value):
451
476
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
452
477
 
453
478
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 
479
        # Substitute outgoing XML entities.
 
480
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
454
481
        if encoding:
455
 
            return self.encode(encoding)
 
482
            return data.encode(encoding)
456
483
        else:
457
 
            return self
 
484
            return data
458
485
 
459
486
class CData(NavigableString):
460
487
 
480
507
 
481
508
    """Represents a found HTML tag with its attributes and contents."""
482
509
 
483
 
    def _invert(h):
484
 
        "Cheap function to invert a hash."
485
 
        i = {}
486
 
        for k,v in h.items():
487
 
            i[v] = k
488
 
        return i
489
 
 
490
 
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
491
 
                                      "quot" : '"',
492
 
                                      "amp" : "&",
493
 
                                      "lt" : "<",
494
 
                                      "gt" : ">" }
495
 
 
496
 
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497
 
 
498
510
    def _convertEntities(self, match):
499
511
        """Used in a call to re.sub to replace HTML, XML, and numeric
500
512
        entities with the appropriate Unicode characters. If HTML
681
693
    def __unicode__(self):
682
694
        return self.__str__(None)
683
695
 
684
 
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685
 
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686
 
                                           + ")")
687
 
 
688
 
    def _sub_entity(self, x):
689
 
        """Used with a regular expression to substitute the
690
 
        appropriate XML entity for an XML special character."""
691
 
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
692
 
 
693
696
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694
697
                prettyPrint=False, indentLevel=0):
695
698
        """Returns a string or Unicode representation of this tag and