~gnome-el-l10n/gnome.gr-website/gnomegr-venus

« back to all changes in this revision

Viewing changes to planet/vendor/html5lib/tokenizer.py

  • Committer: Sam Ruby
  • Date: 2007-08-12 19:08:29 UTC
  • Revision ID: rubys@intertwingly.net-20070812190829-8ypg0qvs2rsohcim
Upgrade to the latest html5lib
Fixes the following error:
http://lists.planetplanet.org/archives/devel/2007-August/001638.html

Show diffs side-by-side

added added

removed removed

Lines of Context:
32
32
 
33
33
    # XXX need to fix documentation
34
34
 
35
 
    def __init__(self, stream, encoding=None, parseMeta=True):
 
35
    def __init__(self, stream, encoding=None, parseMeta=True,
 
36
                 lowercaseElementName=True, lowercaseAttrName=True,):
36
37
        self.stream = HTMLInputStream(stream, encoding, parseMeta)
37
 
 
 
38
        
 
39
        #Perform case conversions?
 
40
        self.lowercaseElementName = lowercaseElementName
 
41
        self.lowercaseAttrName = lowercaseAttrName
 
42
        
38
43
        self.states = {
39
44
            "data":self.dataState,
40
45
            "entityData":self.entityDataState,
111
116
            self.currentToken["type"] = "EmptyTag"
112
117
        else:
113
118
            self.tokenQueue.append({"type": "ParseError", "data":
114
 
              _("Solidus (/) incorrectly placed in tag.")})
 
119
              _(u"Solidus (/) incorrectly placed in tag.")})
115
120
 
116
121
        # The character we just consumed need to be put back on the stack so it
117
122
        # doesn't get lost...
146
151
 
147
152
        if charAsInt == 13:
148
153
            self.tokenQueue.append({"type": "ParseError", "data":
149
 
              _("Incorrect CR newline entity. Replaced with LF.")})
 
154
              _(u"Incorrect CR newline entity. Replaced with LF.")})
150
155
            charAsInt = 10
151
156
        elif 127 < charAsInt < 160:
152
157
            # If the integer is between 127 and 160 (so 128 and bigger and 159
153
158
            # and smaller) we need to do the "windows trick".
154
159
            self.tokenQueue.append({"type": "ParseError", "data":
155
 
              _("Entity used with illegal number (windows-1252 reference).")})
 
160
              _(u"Entity used with illegal number (windows-1252 reference).")})
156
161
 
157
162
            charAsInt = entitiesWindows1252[charAsInt - 128]
158
163
 
168
173
                    char = eval("u'\\U%08x'" % charAsInt)
169
174
                except:
170
175
                    self.tokenQueue.append({"type": "ParseError", "data":
171
 
                      _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
 
176
                      _(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
172
177
        else:
173
178
            char = u"\uFFFD"
174
179
            self.tokenQueue.append({"type": "ParseError", "data":
175
 
              _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
 
180
              _(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
176
181
 
177
182
        # Discard the ; if present. Otherwise, put it back on the queue and
178
183
        # invoke parseError on parser.
179
184
        if c != u";":
180
185
            self.tokenQueue.append({"type": "ParseError", "data":
181
 
              _("Numeric entity didn't end with ';'.")})
 
186
              _(u"Numeric entity didn't end with ';'.")})
182
187
            self.stream.unget(c)
183
188
 
184
189
        return char
191
196
        elif charStack[0] == u"#":
192
197
            # We might have a number entity here.
193
198
            charStack.extend([self.stream.char(), self.stream.char()])
194
 
            if EOF in charStack:
 
199
            if EOF in charStack[:2]:
195
200
                # If we reach the end of the file put everything up to EOF
196
201
                # back in the queue
197
202
                charStack = charStack[:charStack.index(EOF)]
198
203
                self.stream.unget(charStack)
199
204
                self.tokenQueue.append({"type": "ParseError", "data":
200
 
                  _("Numeric entity expected. Got end of file instead.")})
 
205
                  _(u"Numeric entity expected. Got end of file instead.")})
201
206
            else:
202
207
                if charStack[1].lower() == u"x" \
203
208
                  and charStack[2] in hexDigits:
212
217
                    # No number entity detected.
213
218
                    self.stream.unget(charStack)
214
219
                    self.tokenQueue.append({"type": "ParseError", "data":
215
 
                      _("Numeric entity expected but none found.")})
 
220
                      _(u"Numeric entity expected but none found.")})
216
221
        else:
217
222
            # At this point in the process might have named entity. Entities
218
223
            # are stored in the global variable "entities".
244
249
            if entityName is not None:
245
250
                if entityName[-1] != ";":
246
251
                    self.tokenQueue.append({"type": "ParseError", "data":
247
 
                      _("Named entity didn't end with ';'.")})
 
252
                      _(u"Named entity didn't end with ';'.")})
248
253
                if entityName[-1] != ";" and fromAttribute and \
249
254
                  (charStack[entityLength] in asciiLetters
250
255
                  or charStack[entityLength] in digits):
254
259
                    self.stream.unget(charStack[entityLength:])
255
260
            else:
256
261
                self.tokenQueue.append({"type": "ParseError", "data":
257
 
                  _("Named entity expected. Got none.")})
 
262
                  _(u"Named entity expected. Got none.")})
258
263
                self.stream.unget(charStack)
259
264
        return char
260
265
 
272
277
        the state to "data" because that's what's needed after a token has been
273
278
        emitted.
274
279
        """
275
 
 
 
280
        token = self.currentToken
276
281
        # Add token to the queue to be yielded
277
 
        self.tokenQueue.append(self.currentToken)
 
282
        if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
 
283
            if self.lowercaseElementName:
 
284
                token["name"] = token["name"].translate(asciiUpper2Lower)
 
285
            if token["type"] == "EndTag" and token["data"]:
 
286
               self.tokenQueue.append({"type":"ParseError",
 
287
                                       "data":_(u"End tag contains unexpected attributes.")})
 
288
        self.tokenQueue.append(token)
278
289
        self.state = self.states["data"]
279
290
 
280
291
 
286
297
 
287
298
    def dataState(self):
288
299
        data = self.stream.char()
 
300
 
 
301
        # Keep a charbuffer to handle the escapeFlag
289
302
        if self.contentModelFlag in\
290
303
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
291
304
            if len(self.lastFourChars) == 4:
292
305
                self.lastFourChars.pop(0)
293
306
            self.lastFourChars.append(data)
 
307
 
 
308
        # The rest of the logic
294
309
        if data == "&" and self.contentModelFlag in\
295
 
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
 
310
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
 
311
          self.escapeFlag:
296
312
            self.state = self.states["entityData"]
297
313
        elif data == "-" and self.contentModelFlag in\
298
 
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
299
 
          self.escapeFlag == False and\
300
 
          "".join(self.lastFourChars) == "<!--":
 
314
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
 
315
          self.escapeFlag and "".join(self.lastFourChars) == "<!--":
301
316
            self.escapeFlag = True
302
317
            self.tokenQueue.append({"type": "Characters", "data":data})
303
318
        elif data == "<" and (self.contentModelFlag ==\
307
322
            self.state = self.states["tagOpen"]
308
323
        elif data == ">" and self.contentModelFlag in\
309
324
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
310
 
          self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
 
325
          self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
311
326
            self.escapeFlag = False
312
327
            self.tokenQueue.append({"type": "Characters", "data":data})
313
328
        elif data == EOF:
317
332
            # Directly after emitting a token you switch back to the "data
318
333
            # state". At that point spaceCharacters are important so they are
319
334
            # emitted separately.
320
 
            # XXX need to check if we don't need a special "spaces" flag on
321
 
            # characters.
322
335
            self.tokenQueue.append({"type": "SpaceCharacters", "data":
323
336
              data + self.stream.charsUntil(spaceCharacters, True)})
324
337
        else:
350
363
                # XXX In theory it could be something besides a tag name. But
351
364
                # do we really care?
352
365
                self.tokenQueue.append({"type": "ParseError", "data":
353
 
                  _("Expected tag name. Got '>' instead.")})
 
366
                  _(u"Expected tag name. Got '>' instead.")})
354
367
                self.tokenQueue.append({"type": "Characters", "data": u"<>"})
355
368
                self.state = self.states["data"]
356
369
            elif data == u"?":
357
370
                # XXX In theory it could be something besides a tag name. But
358
371
                # do we really care?
359
372
                self.tokenQueue.append({"type": "ParseError", "data":
360
 
                  _("Expected tag name. Got '?' instead (HTML doesn't "
 
373
                  _(u"Expected tag name. Got '?' instead (HTML doesn't "
361
374
                  "support processing instructions).")})
362
375
                self.stream.unget(data)
363
376
                self.state = self.states["bogusComment"]
364
377
            else:
365
378
                # XXX
366
379
                self.tokenQueue.append({"type": "ParseError", "data":
367
 
                  _("Expected tag name. Got something else instead")})
 
380
                  _(u"Expected tag name. Got something else instead")})
368
381
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
369
382
                self.stream.unget(data)
370
383
                self.state = self.states["data"]
423
436
            self.state = self.states["tagName"]
424
437
        elif data == u">":
425
438
            self.tokenQueue.append({"type": "ParseError", "data":
426
 
              _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
 
439
              _(u"Expected closing tag. Got '>' instead. Ignoring '</>'.")})
427
440
            self.state = self.states["data"]
428
441
        elif data == EOF:
429
442
            self.tokenQueue.append({"type": "ParseError", "data":
430
 
              _("Expected closing tag. Unexpected end of file.")})
 
443
              _(u"Expected closing tag. Unexpected end of file.")})
431
444
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
432
445
            self.state = self.states["data"]
433
446
        else:
434
447
            # XXX data can be _'_...
435
448
            self.tokenQueue.append({"type": "ParseError", "data":
436
 
              _("Expected closing tag. Unexpected character '" + data + "' found.")})
 
449
              _(u"Expected closing tag. Unexpected character '%s' found.") % (data,)})
437
450
            self.stream.unget(data)
438
451
            self.state = self.states["bogusComment"]
439
452
        return True
449
462
            self.emitCurrentToken()
450
463
        elif data == EOF:
451
464
            self.tokenQueue.append({"type": "ParseError", "data":
452
 
              _("Unexpected end of file in the tag name.")})
 
465
              _(u"Unexpected end of file in the tag name.")})
453
466
            self.emitCurrentToken()
454
467
        elif data == u"/":
455
468
            self.processSolidusInTag()
471
484
            self.processSolidusInTag()
472
485
        elif data == EOF:
473
486
            self.tokenQueue.append({"type": "ParseError", "data":
474
 
              _("Unexpected end of file. Expected attribute name instead.")})
 
487
              _(u"Unexpected end of file. Expected attribute name instead.")})
475
488
            self.emitCurrentToken()
476
489
        else:
477
490
            self.currentToken["data"].append([data, ""])
481
494
    def attributeNameState(self):
482
495
        data = self.stream.char()
483
496
        leavingThisState = True
 
497
        emitToken = False
484
498
        if data == u"=":
485
499
            self.state = self.states["beforeAttributeValue"]
486
500
        elif data in asciiLetters:
491
505
            # XXX If we emit here the attributes are converted to a dict
492
506
            # without being checked and when the code below runs we error
493
507
            # because data is a dict not a list
494
 
            pass
 
508
            emitToken = True
495
509
        elif data in spaceCharacters:
496
510
            self.state = self.states["afterAttributeName"]
497
511
        elif data == u"/":
499
513
            self.state = self.states["beforeAttributeName"]
500
514
        elif data == EOF:
501
515
            self.tokenQueue.append({"type": "ParseError", "data":
502
 
              _("Unexpected end of file in attribute name.")})
503
 
            self.emitCurrentToken()
504
 
            leavingThisState = False
 
516
              _(u"Unexpected end of file in attribute name.")})
 
517
            self.state = self.states["data"]
 
518
            emitToken = True
505
519
        else:
506
520
            self.currentToken["data"][-1][0] += data
507
521
            leavingThisState = False
510
524
            # Attributes are not dropped at this stage. That happens when the
511
525
            # start tag token is emitted so values can still be safely appended
512
526
            # to attributes, but we do want to report the parse error in time.
 
527
            if self.lowercaseAttrName:
 
528
                self.currentToken["data"][-1][0] = (
 
529
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
513
530
            for name, value in self.currentToken["data"][:-1]:
514
531
                if self.currentToken["data"][-1][0] == name:
515
532
                    self.tokenQueue.append({"type": "ParseError", "data":
516
 
                      _("Dropped duplicate attribute on tag.")})
 
533
                      _(u"Dropped duplicate attribute on tag.")})
 
534
                    break
517
535
            # XXX Fix for above XXX
518
 
            if data == u">":
 
536
            if emitToken:
519
537
                self.emitCurrentToken()
520
538
        return True
521
539
 
535
553
            self.state = self.states["beforeAttributeName"]
536
554
        elif data == EOF:
537
555
            self.tokenQueue.append({"type": "ParseError", "data":
538
 
              _("Unexpected end of file. Expected = or end of tag.")})
 
556
              _(u"Unexpected end of file. Expected = or end of tag.")})
539
557
            self.emitCurrentToken()
540
558
        else:
541
559
            self.currentToken["data"].append([data, ""])
557
575
            self.emitCurrentToken()
558
576
        elif data == EOF:
559
577
            self.tokenQueue.append({"type": "ParseError", "data":
560
 
              _("Unexpected end of file. Expected attribute value.")})
 
578
              _(u"Unexpected end of file. Expected attribute value.")})
561
579
            self.emitCurrentToken()
562
580
        else:
563
581
            self.currentToken["data"][-1][1] += data
572
590
            self.processEntityInAttribute()
573
591
        elif data == EOF:
574
592
            self.tokenQueue.append({"type": "ParseError", "data":
575
 
              _("Unexpected end of file in attribute value (\").")})
 
593
              _(u"Unexpected end of file in attribute value (\").")})
576
594
            self.emitCurrentToken()
577
595
        else:
578
596
            self.currentToken["data"][-1][1] += data +\
587
605
            self.processEntityInAttribute()
588
606
        elif data == EOF:
589
607
            self.tokenQueue.append({"type": "ParseError", "data":
590
 
              _("Unexpected end of file in attribute value (').")})
 
608
              _(u"Unexpected end of file in attribute value (').")})
591
609
            self.emitCurrentToken()
592
610
        else:
593
611
            self.currentToken["data"][-1][1] += data +\
604
622
            self.emitCurrentToken()
605
623
        elif data == EOF:
606
624
            self.tokenQueue.append({"type": "ParseError", "data":
607
 
              _("Unexpected end of file in attribute value.")})
 
625
              _(u"Unexpected end of file in attribute value.")})
608
626
            self.emitCurrentToken()
609
627
        else:
610
628
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
627
645
    def markupDeclarationOpenState(self):
628
646
        charStack = [self.stream.char(), self.stream.char()]
629
647
        if charStack == [u"-", u"-"]:
630
 
            self.currentToken = {"type": "Comment", "data": ""}
 
648
            self.currentToken = {"type": "Comment", "data": u""}
631
649
            self.state = self.states["commentStart"]
632
650
        else:
633
651
            for x in xrange(5):
635
653
            # Put in explicit EOF check
636
654
            if (not EOF in charStack and
637
655
                "".join(charStack).upper() == u"DOCTYPE"):
638
 
                self.currentToken = {"type":"Doctype", "name":"",
 
656
                self.currentToken = {"type":"Doctype", "name":u"",
639
657
                  "publicId":None, "systemId":None, "correct":True}
640
658
                self.state = self.states["doctype"]
641
659
            else:
642
660
                self.tokenQueue.append({"type": "ParseError", "data":
643
 
                  _("Expected '--' or 'DOCTYPE'. Not found.")})
 
661
                  _(u"Expected '--' or 'DOCTYPE'. Not found.")})
644
662
                self.stream.unget(charStack)
645
663
                self.state = self.states["bogusComment"]
646
664
        return True
651
669
            self.state = self.states["commentStartDash"]
652
670
        elif data == ">":
653
671
            self.tokenQueue.append({"type": "ParseError", "data":
654
 
              _("Incorrect comment.")})
 
672
              _(u"Incorrect comment.")})
655
673
            self.tokenQueue.append(self.currentToken)
656
674
            self.state = self.states["data"]
657
675
        elif data == EOF:
658
676
            self.tokenQueue.append({"type": "ParseError", "data":
659
 
              _("Unexpected end of file in comment.")})
 
677
              _(u"Unexpected end of file in comment.")})
660
678
            self.tokenQueue.append(self.currentToken)
661
679
            self.state = self.states["data"]
662
680
        else:
670
688
            self.state = self.states["commentEnd"]
671
689
        elif data == ">":
672
690
            self.tokenQueue.append({"type": "ParseError", "data":
673
 
              _("Incorrect comment.")})
 
691
              _(u"Incorrect comment.")})
674
692
            self.tokenQueue.append(self.currentToken)
675
693
            self.state = self.states["data"]
676
694
        elif data == EOF:
677
695
            self.tokenQueue.append({"type": "ParseError", "data":
678
 
              _("Unexpected end of file in comment.")})
 
696
              _(u"Unexpected end of file in comment.")})
679
697
            self.tokenQueue.append(self.currentToken)
680
698
            self.state = self.states["data"]
681
699
        else:
682
 
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
 
700
            self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-")
683
701
            self.state = self.states["comment"]
684
702
        return True
685
703
 
690
708
            self.state = self.states["commentEndDash"]
691
709
        elif data == EOF:
692
710
            self.tokenQueue.append({"type": "ParseError", "data":
693
 
              _("Unexpected end of file in comment.")})
 
711
              _(u"Unexpected end of file in comment.")})
694
712
            self.tokenQueue.append(self.currentToken)
695
713
            self.state = self.states["data"]
696
714
        else:
703
721
            self.state = self.states["commentEnd"]
704
722
        elif data == EOF:
705
723
            self.tokenQueue.append({"type": "ParseError", "data":
706
 
              _("Unexpected end of file in comment (-)")})
 
724
              _(u"Unexpected end of file in comment (-)")})
707
725
            self.tokenQueue.append(self.currentToken)
708
726
            self.state = self.states["data"]
709
727
        else:
722
740
            self.state = self.states["data"]
723
741
        elif data == u"-":
724
742
            self.tokenQueue.append({"type": "ParseError", "data":
725
 
              _("Unexpected '-' after '--' found in comment.")})
 
743
              _(u"Unexpected '-' after '--' found in comment.")})
726
744
            self.currentToken["data"] += data
727
745
        elif data == EOF:
728
746
            self.tokenQueue.append({"type": "ParseError", "data":
729
 
              _("Unexpected end of file in comment (--).")})
 
747
              _(u"Unexpected end of file in comment (--).")})
730
748
            self.tokenQueue.append(self.currentToken)
731
749
            self.state = self.states["data"]
732
750
        else:
733
751
            # XXX
734
752
            self.tokenQueue.append({"type": "ParseError", "data":
735
 
              _("Unexpected character in comment found.")})
 
753
              _(u"Unexpected character in comment found.")})
736
754
            self.currentToken["data"] += u"--" + data
737
755
            self.state = self.states["comment"]
738
756
        return True
743
761
            self.state = self.states["beforeDoctypeName"]
744
762
        else:
745
763
            self.tokenQueue.append({"type": "ParseError", "data":
746
 
              _("No space after literal string 'DOCTYPE'.")})
 
764
              _(u"No space after literal string 'DOCTYPE'.")})
747
765
            self.stream.unget(data)
748
766
            self.state = self.states["beforeDoctypeName"]
749
767
        return True
754
772
            pass
755
773
        elif data == u">":
756
774
            self.tokenQueue.append({"type": "ParseError", "data":
757
 
              _("Unexpected > character. Expected DOCTYPE name.")})
 
775
              _(u"Unexpected > character. Expected DOCTYPE name.")})
758
776
            self.currentToken["correct"] = False
759
777
            self.tokenQueue.append(self.currentToken)
760
778
            self.state = self.states["data"]
761
779
        elif data == EOF:
762
780
            self.tokenQueue.append({"type": "ParseError", "data":
763
 
              _("Unexpected end of file. Expected DOCTYPE name.")})
 
781
              _(u"Unexpected end of file. Expected DOCTYPE name.")})
764
782
            self.currentToken["correct"] = False
765
783
            self.tokenQueue.append(self.currentToken)
766
784
            self.state = self.states["data"]
778
796
            self.state = self.states["data"]
779
797
        elif data == EOF:
780
798
            self.tokenQueue.append({"type": "ParseError", "data":
781
 
              _("Unexpected end of file in DOCTYPE name.")})
 
799
              _(u"Unexpected end of file in DOCTYPE name.")})
782
800
            self.currentToken["correct"] = False
783
801
            self.tokenQueue.append(self.currentToken)
784
802
            self.state = self.states["data"]
797
815
            self.currentToken["correct"] = False
798
816
            self.stream.unget(data)
799
817
            self.tokenQueue.append({"type": "ParseError", "data":
800
 
              _("Unexpected end of file in DOCTYPE.")})
 
818
              _(u"Unexpected end of file in DOCTYPE.")})
801
819
            self.tokenQueue.append(self.currentToken)
802
820
            self.state = self.states["data"]
803
821
        else:
813
831
            else:
814
832
                self.stream.unget(charStack)
815
833
                self.tokenQueue.append({"type": "ParseError", "data":
816
 
                  _("Expected space or '>'. Got '" + data + "'")})
 
834
                  _(u"Expected space or '>'. Got '%s'") % (data,)})
817
835
                self.state = self.states["bogusDoctype"]
818
836
        return True
819
837
    
822
840
        if data in spaceCharacters:
823
841
            pass
824
842
        elif data == "\"":
825
 
            self.currentToken["publicId"] = ""
 
843
            self.currentToken["publicId"] = u""
826
844
            self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
827
845
        elif data == "'":
828
 
            self.currentToken["publicId"] = ""
 
846
            self.currentToken["publicId"] = u""
829
847
            self.state = self.states["doctypePublicIdentifierSingleQuoted"]
830
848
        elif data == ">":
831
849
            self.tokenQueue.append({"type": "ParseError", "data":
832
 
              _("Unexpected end of DOCTYPE.")})
 
850
              _(u"Unexpected end of DOCTYPE.")})
833
851
            self.currentToken["correct"] = False
834
852
            self.tokenQueue.append(self.currentToken)
835
853
            self.state = self.states["data"]
836
854
        elif data == EOF:
837
855
            self.tokenQueue.append({"type": "ParseError", "data":
838
 
              _("Unexpected end of file in DOCTYPE.")})
 
856
              _(u"Unexpected end of file in DOCTYPE.")})
839
857
            self.currentToken["correct"] = False
840
858
            self.tokenQueue.append(self.currentToken)
841
859
            self.state = self.states["data"]
842
860
        else:
843
861
            self.tokenQueue.append({"type": "ParseError", "data":
844
 
              _("Unexpected character in DOCTYPE.")})
 
862
              _(u"Unexpected character in DOCTYPE.")})
845
863
            self.state = self.states["bogusDoctype"]
846
864
        return True
847
865
 
851
869
            self.state = self.states["afterDoctypePublicIdentifier"]
852
870
        elif data == EOF:
853
871
            self.tokenQueue.append({"type": "ParseError", "data":
854
 
              _("Unexpected end of file in DOCTYPE.")})
 
872
              _(u"Unexpected end of file in DOCTYPE.")})
855
873
            self.currentToken["correct"] = False
856
874
            self.tokenQueue.append(self.currentToken)
857
875
            self.state = self.states["data"]
865
883
            self.state = self.states["afterDoctypePublicIdentifier"]
866
884
        elif data == EOF:
867
885
            self.tokenQueue.append({"type": "ParseError", "data":
868
 
              _("Unexpected end of file in DOCTYPE.")})
 
886
              _(u"Unexpected end of file in DOCTYPE.")})
869
887
            self.currentToken["correct"] = False
870
888
            self.tokenQueue.append(self.currentToken)
871
889
            self.state = self.states["data"]
878
896
        if data in spaceCharacters:
879
897
            pass
880
898
        elif data == "\"":
881
 
            self.currentToken["systemId"] = ""
 
899
            self.currentToken["systemId"] = u""
882
900
            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
883
901
        elif data == "'":
884
 
            self.currentToken["systemId"] = ""
 
902
            self.currentToken["systemId"] = u""
885
903
            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
886
904
        elif data == ">":
887
905
            self.tokenQueue.append(self.currentToken)
888
906
            self.state = self.states["data"]
889
907
        elif data == EOF:
890
908
            self.tokenQueue.append({"type": "ParseError", "data":
891
 
              _("Unexpected end of file in DOCTYPE.")})
 
909
              _(u"Unexpected end of file in DOCTYPE.")})
892
910
            self.currentToken["correct"] = False
893
911
            self.tokenQueue.append(self.currentToken)
894
912
            self.state = self.states["data"]
895
913
        else:
896
914
            self.tokenQueue.append({"type": "ParseError", "data":
897
 
              _("Unexpected character in DOCTYPE.")})
 
915
              _(u"Unexpected character in DOCTYPE.")})
898
916
            self.state = self.states["bogusDoctype"]
899
917
        return True
900
918
    
903
921
        if data in spaceCharacters:
904
922
            pass
905
923
        elif data == "\"":
906
 
            self.currentToken["systemId"] = ""
 
924
            self.currentToken["systemId"] = u""
907
925
            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
908
926
        elif data == "'":
909
 
            self.currentToken["systemId"] = ""
 
927
            self.currentToken["systemId"] = u""
910
928
            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
911
929
        elif data == ">":
912
930
            self.tokenQueue.append({"type": "ParseError", "data":
913
 
              _("Unexpected character in DOCTYPE.")})
 
931
              _(u"Unexpected character in DOCTYPE.")})
914
932
            self.currentToken["correct"] = False
915
933
            self.tokenQueue.append(self.currentToken)
916
934
            self.state = self.states["data"]
917
935
        elif data == EOF:
918
936
            self.tokenQueue.append({"type": "ParseError", "data":
919
 
              _("Unexpected end of file in DOCTYPE.")})
 
937
              _(u"Unexpected end of file in DOCTYPE.")})
920
938
            self.currentToken["correct"] = False
921
939
            self.tokenQueue.append(self.currentToken)
922
940
            self.state = self.states["data"]
923
941
        else:
924
942
            self.tokenQueue.append({"type": "ParseError", "data":
925
 
              _("Unexpected character in DOCTYPE.")})
 
943
              _(u"Unexpected character in DOCTYPE.")})
926
944
            self.state = self.states["bogusDoctype"]
927
945
        return True
928
946
 
932
950
            self.state = self.states["afterDoctypeSystemIdentifier"]
933
951
        elif data == EOF:
934
952
            self.tokenQueue.append({"type": "ParseError", "data":
935
 
              _("Unexpected end of file in DOCTYPE.")})
 
953
              _(u"Unexpected end of file in DOCTYPE.")})
936
954
            self.currentToken["correct"] = False
937
955
            self.tokenQueue.append(self.currentToken)
938
956
            self.state = self.states["data"]
946
964
            self.state = self.states["afterDoctypeSystemIdentifier"]
947
965
        elif data == EOF:
948
966
            self.tokenQueue.append({"type": "ParseError", "data":
949
 
              _("Unexpected end of file in DOCTYPE.")})
 
967
              _(u"Unexpected end of file in DOCTYPE.")})
950
968
            self.currentToken["correct"] = False
951
969
            self.tokenQueue.append(self.currentToken)
952
970
            self.state = self.states["data"]
963
981
            self.state = self.states["data"]
964
982
        elif data == EOF:
965
983
            self.tokenQueue.append({"type": "ParseError", "data":
966
 
              _("Unexpected end of file in DOCTYPE.")})
 
984
              _(u"Unexpected end of file in DOCTYPE.")})
967
985
            self.currentToken["correct"] = False
968
986
            self.tokenQueue.append(self.currentToken)
969
987
            self.state = self.states["data"]
970
988
        else:
971
989
            self.tokenQueue.append({"type": "ParseError", "data":
972
 
              _("Unexpected character in DOCTYPE.")})
 
990
              _(u"Unexpected character in DOCTYPE.")})
973
991
            self.state = self.states["bogusDoctype"]
974
992
        return True
975
993
 
983
1001
            # XXX EMIT
984
1002
            self.stream.unget(data)
985
1003
            self.tokenQueue.append({"type": "ParseError", "data":
986
 
              _("Unexpected end of file in bogus doctype.")})
 
1004
              _(u"Unexpected end of file in bogus doctype.")})
987
1005
            self.tokenQueue.append(self.currentToken)
988
1006
            self.state = self.states["data"]
989
1007
        else: