2
* Copyright (C) 2008 Apple Inc. All Rights Reserved.
3
* Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4
* Copyright (C) 2010 Google, Inc. All Rights Reserved.
6
* Redistribution and use in source and binary forms, with or without
7
* modification, are permitted provided that the following conditions
9
* 1. Redistributions of source code must retain the above copyright
10
* notice, this list of conditions and the following disclaimer.
11
* 2. Redistributions in binary form must reproduce the above copyright
12
* notice, this list of conditions and the following disclaimer in the
13
* documentation and/or other materials provided with the distribution.
15
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
#include "XMLTokenizer.h"
31
#include "MarkupTokenizerInlines.h"
32
#include "NotImplemented.h"
33
#include "XMLCharacterReferenceParser.h"
35
#include <wtf/ASCIICType.h>
36
#include <wtf/CurrentTime.h>
37
#include <wtf/UnusedParam.h>
38
#include <wtf/text/AtomicString.h>
39
#include <wtf/text/CString.h>
40
#include <wtf/text/StringBuilder.h>
46
// This has to go in a .cpp file, as the linker doesn't like it being included more than once.
47
// We don't have an XMLToken.cpp though, so this is the next best place.
49
// Builds the QualifiedName for a tokenizer attribute. The first argument is nullAtom when
// the attribute's m_prefix is empty, otherwise an AtomicString made from m_prefix; the second
// is an AtomicString made from m_name; the third is always nullAtom.
// NOTE(review): the third argument is presumably the namespace URI -- confirm against QualifiedName.
QualifiedName AtomicMarkupTokenBase<XMLToken>::nameForAttribute(const XMLToken::Attribute& attribute) const
51
return QualifiedName(attribute.m_prefix.isEmpty() ? nullAtom : AtomicString(attribute.m_prefix.data(), attribute.m_prefix.size()), AtomicString(attribute.m_name.data(), attribute.m_name.size()), nullAtom);
55
// Returns true when this token's type is StartTag, EndTag, DOCTYPE or Entity.
bool AtomicMarkupTokenBase<XMLToken>::usesName() const
57
return m_type == XMLTokenTypes::StartTag || m_type == XMLTokenTypes::EndTag || m_type == XMLTokenTypes::DOCTYPE || m_type == XMLTokenTypes::Entity;
61
// Returns true when this token's type is StartTag or EndTag.
bool AtomicMarkupTokenBase<XMLToken>::usesAttributes() const
63
return m_type == XMLTokenTypes::StartTag || m_type == XMLTokenTypes::EndTag;
68
// Predicate on a single UChar. The tokenizer calls it to decide whether a character may
// begin a tag, attribute, entity, processing-instruction or DOCTYPE name.
inline bool isValidNameStart(UChar cc)
125
// FIXME: support non-BMP planes
130
// Predicate on a single UChar. The tokenizer calls it for the characters that follow the
// first character of a name. It tests cc with isValidNameStart() and against '-' and '.'.
inline bool isValidNameChar(UChar cc)
132
if (isValidNameStart(cc))
134
if (cc == '-' || cc == '.')
152
// Predicate on a single UChar. The tokenizer applies it to the characters of a quoted
// DOCTYPE system identifier. It tests cc against CR/LF, against '"' and '&', and against
// '<' and '>'.
inline bool isValidLiteralChar(UChar cc)
154
if (cc == 0xD || cc == 0xA)
158
if (cc == '"' || cc == '&')
162
if (cc == '<' || cc == '>')
178
// Shorthands that pass XMLTokenizerState as the first argument to the BEGIN_STATE,
// ADVANCE_TO and SWITCH_TO macros, so state names can be written unqualified.
#define XML_BEGIN_STATE(stateName) BEGIN_STATE(XMLTokenizerState, stateName)
179
#define XML_ADVANCE_TO(stateName) ADVANCE_TO(XMLTokenizerState, stateName)
180
#define XML_SWITCH_TO(stateName) SWITCH_TO(XMLTokenizerState, stateName)
182
// Expands to a tokenizer state named CurrentState that expects an '=' sign: whitespace keeps
// the tokenizer in CurrentState and '=' advances to NextState. The expansion also contains a
// path that returns emitEndOfFile(source).
#define EQ_STATE(CurrentState, NextState) \
183
XML_BEGIN_STATE(CurrentState) { \
184
if (isTokenizerWhitespace(cc)) \
185
XML_ADVANCE_TO(CurrentState); \
186
else if (cc == '=') \
187
XML_ADVANCE_TO(NextState); \
190
return emitEndOfFile(source); \
195
// Expands to two consecutive states: EqualsState (built with EQ_STATE, leading to
// BeforeValueState) and BeforeValueState itself. BeforeValueState stays put on whitespace;
// on '"' or '\'' it stores the quote character in m_additionalAllowedCharacter and advances
// to ValueState. The expansion also contains a path that returns emitEndOfFile(source).
#define EQ_BEFORE_VALUE_STATES(EqualsState, BeforeValueState, ValueState) \
196
EQ_STATE(EqualsState, BeforeValueState) \
197
XML_BEGIN_STATE(BeforeValueState) { \
198
if (isTokenizerWhitespace(cc)) \
199
XML_ADVANCE_TO(BeforeValueState); \
200
else if (cc == '"' || cc == '\'') { \
201
m_additionalAllowedCharacter = cc; \
202
XML_ADVANCE_TO(ValueState); \
205
return emitEndOfFile(source); \
210
// Default constructor for the XML tokenizer.
XMLTokenizer::XMLTokenizer()
216
// shouldSkipNullCharacters() for the XMLToken / XMLTokenizerState instantiation of
// MarkupTokenizerBase.
inline bool MarkupTokenizerBase<XMLToken, XMLTokenizerState>::shouldSkipNullCharacters() const
221
// Runs the tokenizer state machine over |source|, filling in |token|.
// |token| must either be the token already in progress or an Uninitialized token (asserted
// below). When |source| is empty, or the input stream preprocessor cannot peek a character,
// the function returns haveBufferedCharacterToken(). Otherwise the current character is read
// into |cc| and handed to the state handlers that follow.
bool XMLTokenizer::nextToken(SegmentedString& source, XMLToken& token)
223
// If we have a token in progress, then we're supposed to be called back
224
// with the same token so we can finish it.
225
ASSERT(!m_token || m_token == &token || token.type() == XMLTokenTypes::Uninitialized);
228
if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
229
return haveBufferedCharacterToken();
230
UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
233
// DataState: character data outside of markup. '<' advances to TagOpenState, with special
// handling when the current token is a Character token. The end-of-file marker returns
// emitEndOfFile(source). The state can also advance to CharacterReferenceStartState or loop
// on itself.
XML_BEGIN_STATE(DataState) {
235
XML_ADVANCE_TO(CharacterReferenceStartState);
236
else if (cc == '<') {
237
if (m_token->type() == XMLTokenTypes::Character) {
238
// We have a bunch of character tokens queued up that we
239
// are emitting lazily here.
242
XML_ADVANCE_TO(TagOpenState);
243
} else if (cc == InputStreamPreprocessor::endOfFileMarker)
244
return emitEndOfFile(source);
247
XML_ADVANCE_TO(DataState);
252
// CharacterReferenceStartState: start of a reference inside character data.
// - If consumeXMLCharacterReference() succeeds, every decoded code unit is buffered with
//   bufferCharacter() and the tokenizer switches back to DataState.
// - If it reports notEnoughCharacters, return haveBufferedCharacterToken().
// - On a name-start character an Entity token is begun and the tokenizer advances to
//   EntityReferenceState. If the current token is a Character token, the function instead
//   returns emitAndReconsumeIn() targeting this same state.
// The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(CharacterReferenceStartState) {
254
bool notEnoughCharacters = false;
255
StringBuilder decodedCharacter;
256
if (consumeXMLCharacterReference(source, decodedCharacter, notEnoughCharacters)) {
257
for (unsigned i = 0; i < decodedCharacter.length(); ++i)
258
bufferCharacter(decodedCharacter[i]);
259
XML_SWITCH_TO(DataState);
260
} else if (notEnoughCharacters)
261
return haveBufferedCharacterToken();
262
} else if (isValidNameStart(cc)) {
263
if (m_token->type() == XMLTokenTypes::Character)
264
return emitAndReconsumeIn(source, XMLTokenizerState::CharacterReferenceStartState);
265
m_token->beginEntity(cc);
266
XML_ADVANCE_TO(EntityReferenceState);
269
return emitEndOfFile(source);
273
// EntityReferenceState: accumulates the entity name. Name characters are appended to the
// token's name; ';' emits the token and resumes in DataState. The state also contains a path
// that returns emitEndOfFile(source).
XML_BEGIN_STATE(EntityReferenceState) {
274
if (isValidNameChar(cc)) {
275
m_token->appendToName(cc);
276
XML_ADVANCE_TO(EntityReferenceState);
277
} else if (cc == ';')
278
return emitAndResumeIn(source, XMLTokenizerState::DataState);
281
return emitEndOfFile(source);
286
// TagOpenState: entered from DataState on '<'. Dispatches on the next character: a name-start
// character begins a start tag (TagNameState), and '?' goes to
// ProcessingInstructionTargetStartState. The state can also advance to
// MarkupDeclarationOpenState or EndTagOpenState, and contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(TagOpenState) {
288
XML_ADVANCE_TO(MarkupDeclarationOpenState);
290
XML_ADVANCE_TO(EndTagOpenState);
291
else if (isValidNameStart(cc)) {
292
m_token->beginStartTag(cc);
293
XML_ADVANCE_TO(TagNameState);
294
} else if (cc == '?')
295
XML_ADVANCE_TO(ProcessingInstructionTargetStartState);
298
return emitEndOfFile(source);
303
// EndTagOpenState: a name-start character begins an end tag token and advances to
// EndTagNameState. The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(EndTagOpenState) {
304
if (isValidNameStart(cc)) {
305
m_token->beginEndTag(cc);
306
XML_ADVANCE_TO(EndTagNameState);
309
return emitEndOfFile(source);
314
// TagNameState: accumulates a start tag's name.
// - Whitespace advances to BeforeAttributeNameState.
// - Name characters are appended to the token's name.
// - The first ':' (while the token has no prefix yet) ends the prefix part of the name.
// The state can also advance to SelfClosingStartTagState, emit the token and resume in
// DataState, or return emitEndOfFile(source).
XML_BEGIN_STATE(TagNameState) {
315
if (isTokenizerWhitespace(cc))
316
XML_ADVANCE_TO(BeforeAttributeNameState);
318
XML_ADVANCE_TO(SelfClosingStartTagState);
320
return emitAndResumeIn(source, XMLTokenizerState::DataState);
321
else if (isValidNameChar(cc)) {
322
m_token->appendToName(cc);
323
XML_ADVANCE_TO(TagNameState);
324
} else if (cc == ':' && !m_token->hasPrefix()) {
325
m_token->endPrefix();
326
XML_ADVANCE_TO(TagNameState);
329
return emitEndOfFile(source);
334
// EndTagNameState: accumulates an end tag's name.
// - Whitespace advances to EndTagSpaceState.
// - Name characters are appended to the token's name.
// - The first ':' (while the token has no prefix yet) ends the prefix part of the name.
// The state can also emit the token and resume in DataState, or return emitEndOfFile(source).
XML_BEGIN_STATE(EndTagNameState) {
335
if (isTokenizerWhitespace(cc))
336
XML_ADVANCE_TO(EndTagSpaceState);
338
return emitAndResumeIn(source, XMLTokenizerState::DataState);
339
else if (isValidNameChar(cc)) {
340
m_token->appendToName(cc);
341
XML_ADVANCE_TO(EndTagNameState);
342
} else if (cc == ':' && !m_token->hasPrefix()) {
343
m_token->endPrefix();
344
XML_ADVANCE_TO(EndTagNameState);
347
return emitEndOfFile(source);
352
// EndTagSpaceState: skips whitespace after an end tag's name. The state can also emit the
// token and resume in DataState, or return emitEndOfFile(source).
XML_BEGIN_STATE(EndTagSpaceState) {
353
if (isTokenizerWhitespace(cc))
354
XML_ADVANCE_TO(EndTagSpaceState);
356
return emitAndResumeIn(source, XMLTokenizerState::DataState);
359
return emitEndOfFile(source);
364
// BeforeAttributeNameState: skips whitespace between attributes. A name-start character adds
// a new attribute to the token, records where its name begins (the number of characters
// consumed so far), appends the character and advances to AttributeNameState. The state can
// also advance to SelfClosingStartTagState, emit the token and resume in DataState, or return
// emitEndOfFile(source).
XML_BEGIN_STATE(BeforeAttributeNameState) {
365
if (isTokenizerWhitespace(cc))
366
XML_ADVANCE_TO(BeforeAttributeNameState);
368
XML_ADVANCE_TO(SelfClosingStartTagState);
370
return emitAndResumeIn(source, XMLTokenizerState::DataState);
371
else if (isValidNameStart(cc)) {
372
m_token->addNewAttribute();
373
m_token->beginAttributeName(source.numberOfCharactersConsumed());
374
m_token->appendToAttributeName(cc);
375
XML_ADVANCE_TO(AttributeNameState);
378
return emitEndOfFile(source);
383
// AttributeNameState: accumulates an attribute name.
// - Whitespace ends the name and advances to AfterAttributeNameState.
// - '=' ends the name and advances to BeforeAttributeValueState.
// - Name characters are appended to the attribute name.
// - The first ':' (while the attribute has no prefix yet) ends the attribute prefix.
// The remaining visible path ends the attribute name and returns emitEndOfFile(source).
XML_BEGIN_STATE(AttributeNameState) {
384
if (isTokenizerWhitespace(cc)) {
385
m_token->endAttributeName(source.numberOfCharactersConsumed());
386
XML_ADVANCE_TO(AfterAttributeNameState);
387
} else if (cc == '=') {
388
m_token->endAttributeName(source.numberOfCharactersConsumed());
389
XML_ADVANCE_TO(BeforeAttributeValueState);
390
} else if (isValidNameChar(cc)) {
391
m_token->appendToAttributeName(cc);
392
XML_ADVANCE_TO(AttributeNameState);
393
} else if (cc == ':' && !m_token->attributeHasPrefix()) {
394
m_token->endAttributePrefix(source.numberOfCharactersConsumed());
395
XML_ADVANCE_TO(AttributeNameState);
398
m_token->endAttributeName(source.numberOfCharactersConsumed());
399
return emitEndOfFile(source);
404
// AfterAttributeNameState is generated by EQ_STATE: it waits for '=' and then moves to
// BeforeAttributeValueState.
EQ_STATE(AfterAttributeNameState, BeforeAttributeValueState)
406
// BeforeAttributeValueState: skips whitespace before the attribute value. On '"' or '\'' the
// value start is recorded as the consumed-character count plus one, the quote character is
// stored in m_additionalAllowedCharacter, and the tokenizer advances to
// AttributeValueQuotedState. The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(BeforeAttributeValueState) {
407
if (isTokenizerWhitespace(cc))
408
XML_ADVANCE_TO(BeforeAttributeValueState);
409
else if (cc == '"' || cc == '\'') {
410
m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
411
m_additionalAllowedCharacter = cc;
412
XML_ADVANCE_TO(AttributeValueQuotedState);
415
return emitEndOfFile(source);
420
// AttributeValueQuotedState: inside a quoted attribute value.
// - The character stored in m_additionalAllowedCharacter (the opening quote) ends the value.
// - '&' advances to CharacterReferenceInAttributeValueState.
// - '<' or the end-of-file marker ends the value and returns emitEndOfFile(source).
// - Any other character is appended to the attribute value.
XML_BEGIN_STATE(AttributeValueQuotedState) {
421
if (cc == m_additionalAllowedCharacter) {
422
m_token->endAttributeValue(source.numberOfCharactersConsumed());
423
XML_ADVANCE_TO(AfterAttributeValueQuotedState);
424
} else if (cc == '&')
425
XML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
426
else if (cc == '<' || cc == InputStreamPreprocessor::endOfFileMarker) {
428
m_token->endAttributeValue(source.numberOfCharactersConsumed());
429
return emitEndOfFile(source);
431
m_token->appendToAttributeValue(cc);
432
XML_ADVANCE_TO(AttributeValueQuotedState);
437
// CharacterReferenceInAttributeValueState: entered on '&' inside an attribute value.
// - If consumeXMLCharacterReference() succeeds, the decoded code units are appended to the
//   attribute value and the tokenizer returns to AttributeValueQuotedState.
// - If it reports notEnoughCharacters, return haveBufferedCharacterToken().
// The state also contains a path that returns emitEndOfFile(source), and one that appends a
// literal '&' followed by cc to the value before returning to AttributeValueQuotedState.
XML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
439
bool notEnoughCharacters = false;
440
StringBuilder decodedCharacter;
442
if (consumeXMLCharacterReference(source, decodedCharacter, notEnoughCharacters)) {
443
for (unsigned i = 0; i < decodedCharacter.length(); ++i)
444
m_token->appendToAttributeValue(decodedCharacter[i]);
445
XML_ADVANCE_TO(AttributeValueQuotedState);
446
} else if (notEnoughCharacters)
447
return haveBufferedCharacterToken();
450
return emitEndOfFile(source);
453
m_token->appendToAttributeValue('&');
454
m_token->appendToAttributeValue(cc);
455
XML_ADVANCE_TO(AttributeValueQuotedState);
460
// AfterAttributeValueQuotedState: whitespace advances to BeforeAttributeNameState. The state
// can also advance to SelfClosingStartTagState, emit the token and resume in DataState, or
// return emitEndOfFile(source).
XML_BEGIN_STATE(AfterAttributeValueQuotedState) {
461
if (isTokenizerWhitespace(cc))
462
XML_ADVANCE_TO(BeforeAttributeNameState);
464
XML_ADVANCE_TO(SelfClosingStartTagState);
466
return emitAndResumeIn(source, XMLTokenizerState::DataState);
469
return emitEndOfFile(source);
474
// SelfClosingStartTagState: marks the token self-closing, emits it and resumes in DataState.
// The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(SelfClosingStartTagState) {
476
m_token->setSelfClosing();
477
return emitAndResumeIn(source, XMLTokenizerState::DataState);
480
return emitEndOfFile(source);
484
// ProcessingInstructionTargetStartState: first character of a processing-instruction target.
// - On 'x' or 'X', looks ahead (case-insensitively) for "xml". A match consumes it and
//   switches to XMLDeclAfterXMLState; if more input is needed, return
//   haveBufferedCharacterToken().
// - A name character is appended to the target when the token is already a
//   ProcessingInstruction; otherwise a name-start character begins a new
//   ProcessingInstruction token.
// The state then advances to ProcessingInstructionTargetState; it also contains a path that
// returns emitEndOfFile(source).
XML_BEGIN_STATE(ProcessingInstructionTargetStartState) {
485
DEFINE_STATIC_LOCAL(String, xmlString, (ASCIILiteral("xml")));
486
// FIXME: this probably shouldn't be case-insensitive, but I don't know if people try capitalizing it ever.
487
if (cc == 'x' || cc == 'X') {
488
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(xmlString);
489
if (result == SegmentedString::DidMatch) {
490
advanceStringAndASSERTIgnoringCase(source, "xml");
491
XML_SWITCH_TO(XMLDeclAfterXMLState);
492
} else if (result == SegmentedString::NotEnoughCharacters)
493
return haveBufferedCharacterToken();
495
if (m_token->type() == XMLTokenTypes::ProcessingInstruction && isValidNameChar(cc))
496
m_token->appendToProcessingInstructionTarget(cc);
497
else if (isValidNameStart(cc))
498
m_token->beginProcessingInstruction(cc);
501
return emitEndOfFile(source);
503
XML_ADVANCE_TO(ProcessingInstructionTargetState);
507
// XMLDeclAfterXMLState: entered after "xml" has been consumed.
// - Whitespace begins an XML declaration token and advances to XMLDeclBeforeVersionNameState.
// - A name character means the target merely starts with "xml": a ProcessingInstruction
//   token is begun with the target "xml" (always lower case here) plus cc, and the tokenizer
//   continues in ProcessingInstructionTargetState.
// The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclAfterXMLState) {
508
if (isTokenizerWhitespace(cc)) {
509
m_token->beginXMLDeclaration();
510
XML_ADVANCE_TO(XMLDeclBeforeVersionNameState);
511
} else if (isValidNameChar(cc)) {
512
m_token->beginProcessingInstruction('x');
513
m_token->appendToProcessingInstructionTarget('m');
514
m_token->appendToProcessingInstructionTarget('l');
515
m_token->appendToProcessingInstructionTarget(cc);
516
XML_ADVANCE_TO(ProcessingInstructionTargetState);
520
return emitEndOfFile(source);
525
// XMLDeclBeforeVersionNameState: skips whitespace, then looks ahead (case-sensitively) for
// "version". A match consumes it and switches to XMLDeclAfterVersionNameState; if more input
// is needed, return haveBufferedCharacterToken(). The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclBeforeVersionNameState) {
526
DEFINE_STATIC_LOCAL(String, versionString, (ASCIILiteral("version")));
527
if (isTokenizerWhitespace(cc))
528
XML_ADVANCE_TO(XMLDeclBeforeVersionNameState);
530
SegmentedString::LookAheadResult result = source.lookAhead(versionString);
531
if (result == SegmentedString::DidMatch) {
532
advanceStringAndASSERT(source, "version");
533
XML_SWITCH_TO(XMLDeclAfterVersionNameState);
534
} else if (result == SegmentedString::NotEnoughCharacters)
535
return haveBufferedCharacterToken();
538
return emitEndOfFile(source);
542
// Generated states for the '=' and opening quote that follow "version"; the opening quote is
// remembered in m_additionalAllowedCharacter.
EQ_BEFORE_VALUE_STATES(XMLDeclAfterVersionNameState, XMLDeclBeforeVersionValueState, XMLDeclBeforeVersionOnePointState)
544
// XMLDeclBeforeVersionOnePointState: looks ahead for "1.". A match consumes both characters,
// appends them to the token's XML version and switches to XMLDeclVersionValueQuotedState; if
// more input is needed, return haveBufferedCharacterToken(). The state also contains a path
// that returns emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclBeforeVersionOnePointState) {
545
DEFINE_STATIC_LOCAL(String, onePointString, (ASCIILiteral("1.")));
546
SegmentedString::LookAheadResult result = source.lookAhead(onePointString);
547
if (result == SegmentedString::DidMatch) {
548
source.advanceAndASSERT('1');
549
source.advanceAndASSERT('.');
550
m_token->appendToXMLVersion('1');
551
m_token->appendToXMLVersion('.');
552
XML_SWITCH_TO(XMLDeclVersionValueQuotedState);
553
} else if (result == SegmentedString::NotEnoughCharacters)
554
return haveBufferedCharacterToken();
556
return emitEndOfFile(source);
560
// XMLDeclVersionValueQuotedState: remainder of the version value after "1.". The character in
// m_additionalAllowedCharacter (the opening quote) advances to XMLDeclAfterVersionState;
// ASCII digits are appended to the version. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclVersionValueQuotedState) {
561
if (cc == m_additionalAllowedCharacter) {
562
XML_ADVANCE_TO(XMLDeclAfterVersionState);
563
} else if (isASCIIDigit(cc)) {
564
m_token->appendToXMLVersion(cc);
565
XML_ADVANCE_TO(XMLDeclVersionValueQuotedState);
568
return emitEndOfFile(source);
573
// XMLDeclAfterVersionState: whitespace advances to XMLDeclBeforeEncodingNameState. The state
// can also advance to XMLDeclCloseState or return emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclAfterVersionState) {
574
if (isTokenizerWhitespace(cc))
575
XML_ADVANCE_TO(XMLDeclBeforeEncodingNameState);
577
XML_ADVANCE_TO(XMLDeclCloseState);
580
return emitEndOfFile(source);
585
// XMLDeclBeforeEncodingNameState: after the version, decides what comes next.
// - Whitespace is skipped.
// - 'e': look ahead for "encoding"; a match consumes it and switches to
//   XMLDeclAfterEncodingNameState.
// - 's': look ahead for "standalone"; a match consumes it and switches to
//   XMLDeclAfterStandaloneNameState.
// - '?': advance to XMLDeclCloseState.
// If either look-ahead needs more input, return haveBufferedCharacterToken(). The state also
// contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclBeforeEncodingNameState) {
586
DEFINE_STATIC_LOCAL(String, encodingString, (ASCIILiteral("encoding")));
587
DEFINE_STATIC_LOCAL(String, standaloneString, (ASCIILiteral("standalone")));
588
if (isTokenizerWhitespace(cc))
589
XML_ADVANCE_TO(XMLDeclBeforeEncodingNameState);
590
else if (cc == 'e') {
591
SegmentedString::LookAheadResult result = source.lookAhead(encodingString);
592
if (result == SegmentedString::DidMatch) {
593
advanceStringAndASSERT(source, "encoding");
594
XML_SWITCH_TO(XMLDeclAfterEncodingNameState);
595
} else if (result == SegmentedString::NotEnoughCharacters)
596
return haveBufferedCharacterToken();
597
} else if (cc == 's') {
598
SegmentedString::LookAheadResult result = source.lookAhead(standaloneString);
599
if (result == SegmentedString::DidMatch) {
600
advanceStringAndASSERT(source, "standalone");
601
XML_SWITCH_TO(XMLDeclAfterStandaloneNameState);
602
} else if (result == SegmentedString::NotEnoughCharacters)
603
return haveBufferedCharacterToken();
604
} else if (cc == '?')
605
XML_ADVANCE_TO(XMLDeclCloseState);
607
return emitEndOfFile(source);
611
// Generated states for the '=' and opening quote that follow "encoding"; the opening quote is
// remembered in m_additionalAllowedCharacter.
EQ_BEFORE_VALUE_STATES(XMLDeclAfterEncodingNameState, XMLDeclBeforeEncodingValueState, XMLDeclEncodingValueStartQuotedState)
613
// XMLDeclEncodingValueStartQuotedState: an ASCII letter begins the encoding name and advances
// to XMLDeclEncodingValueQuotedState. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclEncodingValueStartQuotedState) {
614
if (isASCIIAlpha(cc)) {
615
m_token->beginXMLEncoding(cc);
616
XML_ADVANCE_TO(XMLDeclEncodingValueQuotedState);
619
return emitEndOfFile(source);
623
// XMLDeclEncodingValueQuotedState: rest of the encoding name. The character in
// m_additionalAllowedCharacter (the opening quote) advances to XMLDeclAfterEncodingState;
// ASCII alphanumerics and '-' are appended to the encoding. The state also contains a path
// that returns emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclEncodingValueQuotedState) {
624
if (cc == m_additionalAllowedCharacter) {
625
XML_ADVANCE_TO(XMLDeclAfterEncodingState);
626
} else if (isASCIIAlphanumeric(cc) || cc == '-') {
627
m_token->appendToXMLEncoding(cc);
628
XML_ADVANCE_TO(XMLDeclEncodingValueQuotedState);
631
return emitEndOfFile(source);
636
// XMLDeclAfterEncodingState: whitespace advances to XMLDeclBeforeStandaloneNameState. The
// state can also advance to XMLDeclCloseState or return emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclAfterEncodingState) {
637
if (isTokenizerWhitespace(cc))
638
XML_ADVANCE_TO(XMLDeclBeforeStandaloneNameState);
640
XML_ADVANCE_TO(XMLDeclCloseState);
643
return emitEndOfFile(source);
648
// XMLDeclBeforeStandaloneNameState: after the encoding, decides what comes next.
// - Whitespace is skipped.
// - 's': look ahead for "standalone"; a match consumes it and switches to
//   XMLDeclAfterStandaloneNameState. If more input is needed, return
//   haveBufferedCharacterToken().
// - '?': advance to XMLDeclCloseState.
// The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclBeforeStandaloneNameState) {
649
DEFINE_STATIC_LOCAL(String, standaloneString, (ASCIILiteral("standalone")));
650
if (isTokenizerWhitespace(cc))
651
XML_ADVANCE_TO(XMLDeclBeforeStandaloneNameState);
652
else if (cc == 's') {
653
SegmentedString::LookAheadResult result = source.lookAhead(standaloneString);
654
if (result == SegmentedString::DidMatch) {
655
advanceStringAndASSERT(source, "standalone");
656
XML_SWITCH_TO(XMLDeclAfterStandaloneNameState);
657
} else if (result == SegmentedString::NotEnoughCharacters)
658
return haveBufferedCharacterToken();
659
} else if (cc == '?')
660
XML_ADVANCE_TO(XMLDeclCloseState);
662
return emitEndOfFile(source);
666
// Generated states for the '=' and opening quote that follow "standalone"; the opening quote
// (either '"' or '\'') is remembered in m_additionalAllowedCharacter.
EQ_BEFORE_VALUE_STATES(XMLDeclAfterStandaloneNameState, XMLDeclBeforeStandaloneValueState, XMLDeclStandaloneValueQuotedState)
668
// XMLDeclStandaloneValueQuotedState: looks ahead for the value together with its closing
// quote. A match on yesString sets standalone to true and a match on noString sets it to
// false; either way the tokenizer switches to XMLDeclAfterStandaloneState. If a look-ahead
// needs more input, return haveBufferedCharacterToken(). The state also contains a path that
// returns emitEndOfFile(source).
// NOTE(review): both look-ahead strings end in a literal '"' and m_additionalAllowedCharacter
// is not consulted in the visible lines, so a value opened with '\'' does not appear to match
// here -- confirm whether that is intended.
XML_BEGIN_STATE(XMLDeclStandaloneValueQuotedState) {
669
DEFINE_STATIC_LOCAL(String, yesString, (ASCIILiteral("yes\"")));
670
DEFINE_STATIC_LOCAL(String, noString, (ASCIILiteral("no\"")));
672
SegmentedString::LookAheadResult result = source.lookAhead(yesString);
673
if (result == SegmentedString::DidMatch) {
674
advanceStringAndASSERT(source, "yes\"");
675
m_token->setXMLStandalone(true);
676
XML_SWITCH_TO(XMLDeclAfterStandaloneState);
677
} else if (result == SegmentedString::NotEnoughCharacters)
678
return haveBufferedCharacterToken();
679
} else if (cc == 'n') {
680
SegmentedString::LookAheadResult result = source.lookAhead(noString);
681
if (result == SegmentedString::DidMatch) {
682
advanceStringAndASSERT(source, "no\"");
683
m_token->setXMLStandalone(false);
684
XML_SWITCH_TO(XMLDeclAfterStandaloneState);
685
} else if (result == SegmentedString::NotEnoughCharacters)
686
return haveBufferedCharacterToken();
689
return emitEndOfFile(source);
693
// XMLDeclAfterStandaloneState: skips whitespace after the standalone value. The state can
// also advance to XMLDeclCloseState or return emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclAfterStandaloneState) {
694
if (isTokenizerWhitespace(cc))
695
XML_ADVANCE_TO(XMLDeclAfterStandaloneState);
697
XML_ADVANCE_TO(XMLDeclCloseState);
700
return emitEndOfFile(source);
705
// XMLDeclCloseState: end of the XML declaration. Either emits the token and resumes in
// DataState, or returns emitEndOfFile(source).
XML_BEGIN_STATE(XMLDeclCloseState) {
707
return emitAndResumeIn(source, XMLTokenizerState::DataState);
709
return emitEndOfFile(source);
713
// ProcessingInstructionTargetState: accumulates the processing-instruction target. Whitespace
// advances to ProcessingInstructionAfterTargetState; name characters are appended to the
// target. The state also contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(ProcessingInstructionTargetState) {
714
if (isTokenizerWhitespace(cc)) {
715
XML_ADVANCE_TO(ProcessingInstructionAfterTargetState);
716
} else if (isValidNameChar(cc)) {
717
m_token->appendToProcessingInstructionTarget(cc);
718
XML_ADVANCE_TO(ProcessingInstructionTargetState);
721
return emitEndOfFile(source);
726
// ProcessingInstructionAfterTargetState: skips whitespace between the target and the data.
// The end-of-file marker returns emitEndOfFile(source). The state can also advance to
// ProcessingInstructionCloseState, or append cc as the first data character and advance to
// ProcessingInstructionDataState.
XML_BEGIN_STATE(ProcessingInstructionAfterTargetState) {
727
if (isTokenizerWhitespace(cc))
728
XML_ADVANCE_TO(ProcessingInstructionAfterTargetState);
730
XML_ADVANCE_TO(ProcessingInstructionCloseState);
731
else if (cc == InputStreamPreprocessor::endOfFileMarker) {
733
return emitEndOfFile(source);
735
m_token->appendToProcessingInstructionData(cc);
736
XML_ADVANCE_TO(ProcessingInstructionDataState);
741
// ProcessingInstructionDataState: accumulates processing-instruction data. The end-of-file
// marker returns emitEndOfFile(source). The state can also advance to
// ProcessingInstructionCloseState, or append cc to the data and stay in this state.
XML_BEGIN_STATE(ProcessingInstructionDataState) {
743
XML_ADVANCE_TO(ProcessingInstructionCloseState);
744
else if (cc == InputStreamPreprocessor::endOfFileMarker) {
746
return emitEndOfFile(source);
748
m_token->appendToProcessingInstructionData(cc);
749
XML_ADVANCE_TO(ProcessingInstructionDataState);
754
// ProcessingInstructionCloseState: decides whether the processing instruction really ends.
// One path emits the token and resumes in DataState; the end-of-file marker returns
// emitEndOfFile(source). Another path appends a literal '?' followed by cc to the data and
// goes back to ProcessingInstructionDataState.
XML_BEGIN_STATE(ProcessingInstructionCloseState) {
756
return emitAndResumeIn(source, XMLTokenizerState::DataState);
757
if (cc == InputStreamPreprocessor::endOfFileMarker) {
759
return emitEndOfFile(source);
761
m_token->appendToProcessingInstructionData('?');
762
m_token->appendToProcessingInstructionData(cc);
763
XML_ADVANCE_TO(ProcessingInstructionDataState);
767
// MarkupDeclarationOpenState: uses look-ahead to recognise the kind of markup declaration.
// - "--": consume it, begin a comment token and switch to CommentState.
// - 'D' or 'd': look ahead case-insensitively for "doctype"; a match consumes it and switches
//   to BeforeDOCTYPENameState.
// - '[': look ahead for "[CDATA["; a match consumes it, begins a CDATA token and switches to
//   CDATASectionState.
// If any look-ahead needs more input, return haveBufferedCharacterToken(). The state also
// contains a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(MarkupDeclarationOpenState) {
768
DEFINE_STATIC_LOCAL(String, dashDashString, (ASCIILiteral("--")));
769
DEFINE_STATIC_LOCAL(String, doctypeString, (ASCIILiteral("doctype")));
770
DEFINE_STATIC_LOCAL(String, cdataString, (ASCIILiteral("[CDATA[")));
772
SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
773
if (result == SegmentedString::DidMatch) {
774
source.advanceAndASSERT('-');
775
source.advanceAndASSERT('-');
776
m_token->beginComment();
777
XML_SWITCH_TO(CommentState);
778
} else if (result == SegmentedString::NotEnoughCharacters)
779
return haveBufferedCharacterToken();
780
} else if (cc == 'D' || cc == 'd') {
781
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
782
if (result == SegmentedString::DidMatch) {
783
advanceStringAndASSERTIgnoringCase(source, "doctype");
784
XML_SWITCH_TO(BeforeDOCTYPENameState);
785
} else if (result == SegmentedString::NotEnoughCharacters)
786
return haveBufferedCharacterToken();
787
} else if (cc == '[') {
788
SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
789
if (result == SegmentedString::DidMatch) {
790
advanceStringAndASSERT(source, "[CDATA[");
791
m_token->beginCDATA();
792
XML_SWITCH_TO(CDATASectionState);
793
} else if (result == SegmentedString::NotEnoughCharacters)
794
return haveBufferedCharacterToken();
797
return emitEndOfFile(source);
801
// CommentState: accumulates comment text. The end-of-file marker returns
// emitEndOfFile(source). The state can also advance to CommentDashState, or append cc to the
// comment and stay in this state.
XML_BEGIN_STATE(CommentState) {
803
XML_ADVANCE_TO(CommentDashState);
804
else if (cc == InputStreamPreprocessor::endOfFileMarker) {
806
return emitEndOfFile(source);
808
m_token->appendToComment(cc);
809
XML_ADVANCE_TO(CommentState);
814
// CommentDashState: a single '-' has been seen inside a comment. The end-of-file marker
// returns emitEndOfFile(source). The state can also advance to CommentEndState, or append the
// pending '-' followed by cc to the comment and go back to CommentState.
XML_BEGIN_STATE(CommentDashState) {
816
XML_ADVANCE_TO(CommentEndState);
817
else if (cc == InputStreamPreprocessor::endOfFileMarker) {
819
return emitEndOfFile(source);
821
m_token->appendToComment('-');
822
m_token->appendToComment(cc);
823
XML_ADVANCE_TO(CommentState);
828
// CommentEndState: end of a comment. The visible exits are: emit the token and resume in
// DataState, return emitEndOfFile(source), or emit the token and reconsume the current
// character in DataState.
XML_BEGIN_STATE(CommentEndState) {
830
return emitAndResumeIn(source, XMLTokenizerState::DataState);
833
return emitEndOfFile(source);
836
return emitAndReconsumeIn(source, XMLTokenizerState::DataState);
840
// BeforeDOCTYPENameState: skips whitespace after the "doctype" keyword. A name-start
// character begins a DOCTYPE token and advances to DOCTYPENameState. The state also contains
// a path that returns emitEndOfFile(source).
XML_BEGIN_STATE(BeforeDOCTYPENameState) {
841
if (isTokenizerWhitespace(cc))
842
XML_ADVANCE_TO(BeforeDOCTYPENameState);
843
else if (isValidNameStart(cc)) {
844
m_token->beginDOCTYPE(cc);
845
XML_ADVANCE_TO(DOCTYPENameState);
848
return emitEndOfFile(source);
853
// DOCTYPENameState: accumulates the DOCTYPE name. Whitespace advances to
// AfterDOCTYPENameState; name characters are appended to the token's name. The state can also
// emit the token and resume in DataState, or return emitEndOfFile(source).
XML_BEGIN_STATE(DOCTYPENameState) {
854
if (isTokenizerWhitespace(cc))
855
XML_ADVANCE_TO(AfterDOCTYPENameState);
857
return emitAndResumeIn(source, XMLTokenizerState::DataState);
858
else if (isValidNameChar(cc)) {
859
m_token->appendToName(cc);
860
XML_ADVANCE_TO(DOCTYPENameState);
863
return emitEndOfFile(source);
868
// AfterDOCTYPENameState: after the DOCTYPE name, decides what follows.
// - Whitespace is skipped.
// - The end-of-file marker returns emitEndOfFile(source).
// - 'P' or 'p': look ahead case-insensitively for "public"; a match consumes it and switches
//   to AfterDOCTYPEPublicKeywordState.
// - 'S' or 's': look ahead case-insensitively for "system"; a match consumes it and switches
//   to AfterDOCTYPESystemKeywordState.
// - '[': advance to BeforeDOCTYPEInternalSubsetState.
// If a look-ahead needs more input, return haveBufferedCharacterToken(). The state can also
// emit the token and resume in DataState.
XML_BEGIN_STATE(AfterDOCTYPENameState) {
869
if (isTokenizerWhitespace(cc))
870
XML_ADVANCE_TO(AfterDOCTYPENameState);
872
return emitAndResumeIn(source, XMLTokenizerState::DataState);
873
if (cc == InputStreamPreprocessor::endOfFileMarker) {
875
return emitEndOfFile(source);
877
DEFINE_STATIC_LOCAL(String, publicString, (ASCIILiteral("public")));
878
DEFINE_STATIC_LOCAL(String, systemString, (ASCIILiteral("system")));
879
if (cc == 'P' || cc == 'p') {
880
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
881
if (result == SegmentedString::DidMatch) {
882
advanceStringAndASSERTIgnoringCase(source, "public");
883
XML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
884
} else if (result == SegmentedString::NotEnoughCharacters)
885
return haveBufferedCharacterToken();
886
} else if (cc == 'S' || cc == 's') {
887
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
888
if (result == SegmentedString::DidMatch) {
889
advanceStringAndASSERTIgnoringCase(source, "system");
890
XML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
891
} else if (result == SegmentedString::NotEnoughCharacters)
892
return haveBufferedCharacterToken();
893
} else if (cc == '[')
894
XML_ADVANCE_TO(BeforeDOCTYPEInternalSubsetState);
897
return emitEndOfFile(source);
902
// AfterDOCTYPEPublicKeywordState: whitespace after the "public" keyword advances to
// BeforeDOCTYPEPublicIdentifierState. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
903
if (isTokenizerWhitespace(cc))
904
XML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
907
return emitEndOfFile(source);
912
// BeforeDOCTYPEPublicIdentifierState: skips whitespace before the public identifier. On '"'
// or '\'' the token's public identifier is set to the empty string, the quote character is
// stored in m_additionalAllowedCharacter, and the tokenizer advances to
// DOCTYPEPublicIdentifierQuotedState. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
913
if (isTokenizerWhitespace(cc))
914
XML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
915
else if (cc == '"' || cc == '\'') {
916
m_token->setPublicIdentifierToEmptyString();
917
m_additionalAllowedCharacter = cc;
918
XML_ADVANCE_TO(DOCTYPEPublicIdentifierQuotedState);
921
return emitEndOfFile(source);
926
// DOCTYPEPublicIdentifierQuotedState: inside the quoted public identifier. The character in
// m_additionalAllowedCharacter (the opening quote) advances to
// AfterDOCTYPEPublicIdentifierState, and the end-of-file marker returns
// emitEndOfFile(source). The state can also append cc to the public identifier and stay in
// this state.
XML_BEGIN_STATE(DOCTYPEPublicIdentifierQuotedState) {
927
if (cc == m_additionalAllowedCharacter)
928
XML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
929
else if (cc == InputStreamPreprocessor::endOfFileMarker) {
931
return emitEndOfFile(source);
933
m_token->appendToPublicIdentifier(cc);
934
XML_ADVANCE_TO(DOCTYPEPublicIdentifierQuotedState);
939
// AfterDOCTYPEPublicIdentifierState: whitespace after the public identifier advances to
// BeforeDOCTYPESystemIdentifierState. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
940
if (isTokenizerWhitespace(cc))
941
XML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
944
return emitEndOfFile(source);
949
// AfterDOCTYPESystemKeywordState: whitespace after the "system" keyword advances to
// BeforeDOCTYPESystemIdentifierState. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
950
if (isTokenizerWhitespace(cc))
951
XML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
954
return emitEndOfFile(source);
959
// BeforeDOCTYPESystemIdentifierState: skips whitespace before the system identifier. On '"'
// or '\'' the token's system identifier is set to the empty string, the quote character is
// stored in m_additionalAllowedCharacter, and the tokenizer advances to
// DOCTYPESystemIdentifierQuotedState. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
960
if (isTokenizerWhitespace(cc))
961
XML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
962
else if (cc == '"' || cc == '\'') {
963
m_token->setSystemIdentifierToEmptyString();
964
m_additionalAllowedCharacter = cc;
965
XML_ADVANCE_TO(DOCTYPESystemIdentifierQuotedState);
968
return emitEndOfFile(source);
973
// DOCTYPESystemIdentifierQuotedState: inside the quoted system identifier. The character in
// m_additionalAllowedCharacter (the opening quote) advances to
// AfterDOCTYPESystemIdentifierState; characters accepted by isValidLiteralChar() are appended
// to the system identifier. The state also contains a path that returns
// emitEndOfFile(source).
XML_BEGIN_STATE(DOCTYPESystemIdentifierQuotedState) {
974
if (cc == m_additionalAllowedCharacter)
975
XML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
976
else if (isValidLiteralChar(cc)) {
977
m_token->appendToSystemIdentifier(cc);
978
XML_ADVANCE_TO(DOCTYPESystemIdentifierQuotedState);
981
return emitEndOfFile(source);
986
// AfterDOCTYPESystemIdentifierState: skips whitespace after the system identifier. The state
// can also emit the token and resume in DataState, advance to
// BeforeDOCTYPEInternalSubsetState, or return emitEndOfFile(source).
XML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
987
if (isTokenizerWhitespace(cc))
988
XML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
990
return emitAndResumeIn(source, XMLTokenizerState::DataState);
992
XML_ADVANCE_TO(BeforeDOCTYPEInternalSubsetState);
995
return emitEndOfFile(source);
1000
// BeforeDOCTYPEInternalSubsetState: entered on '[' in a DOCTYPE. The internal subset itself
// is not parsed yet (see the FIXME). The visible exits are advancing to
// AfterDOCTYPEInternalSubsetState and returning emitEndOfFile(source).
XML_BEGIN_STATE(BeforeDOCTYPEInternalSubsetState) {
1002
XML_ADVANCE_TO(AfterDOCTYPEInternalSubsetState);
1004
// FIXME: implement internal subset.
1006
return emitEndOfFile(source);
1011
// AfterDOCTYPEInternalSubsetState: skips whitespace after the internal subset. The state can
// also emit the token and resume in DataState, or return emitEndOfFile(source).
XML_BEGIN_STATE(AfterDOCTYPEInternalSubsetState) {
1012
if (isTokenizerWhitespace(cc))
1013
XML_ADVANCE_TO(AfterDOCTYPEInternalSubsetState);
1015
return emitAndResumeIn(source, XMLTokenizerState::DataState);
1018
return emitEndOfFile(source);
1023
// CDATASectionState: inside a CDATA section. Looks ahead for the terminator "]]>": a match
// consumes it and emits the token via emitAndReconsumeIn() targeting DataState. If the
// look-ahead needs more input, return haveBufferedCharacterToken(). The end-of-file marker
// returns emitEndOfFile(source). Otherwise cc is appended to the CDATA token and the
// tokenizer stays in this state.
XML_BEGIN_STATE(CDATASectionState) {
1024
DEFINE_STATIC_LOCAL(String, closeString, (ASCIILiteral("]]>")));
1026
SegmentedString::LookAheadResult result = source.lookAhead(closeString);
1027
if (result == SegmentedString::DidMatch) {
1028
advanceStringAndASSERT(source, "]]>");
1029
return emitAndReconsumeIn(source, XMLTokenizerState::DataState);
1031
if (result == SegmentedString::NotEnoughCharacters)
1032
return haveBufferedCharacterToken();
1034
if (cc == InputStreamPreprocessor::endOfFileMarker) {
1036
return emitEndOfFile(source);
1038
m_token->appendToCDATA(cc);
1039
XML_ADVANCE_TO(CDATASectionState);
1045
// Every state handler above is expected to return or jump to another state, so control
// should never fall through to this point.
ASSERT_NOT_REACHED();
1049
// Appends |character| to the current token as character data, first making sure the token is
// a character token. The end-of-file marker must never be buffered (asserted).
inline void XMLTokenizer::bufferCharacter(UChar character)
1051
ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
1052
m_token->ensureIsCharacterToken();
1053
m_token->appendToCharacter(character);
1056
// Records that an error occurred during parsing by setting m_errorDuringParsing.
inline void XMLTokenizer::parseError()
1058
m_errorDuringParsing = true;