2
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
4
* The contents of this file are subject to the Netscape Public
5
* License Version 1.1 (the "License"); you may not use this file
6
* except in compliance with the License. You may obtain a copy of
7
* the License at http://www.mozilla.org/NPL/
9
* Software distributed under the License is distributed on an "AS
10
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11
* implied. See the License for the specific language governing
12
* rights and limitations under the License.
14
* The Original Code is mozilla.org code.
16
* The Initial Developer of the Original Code is Netscape
17
* Communications Corporation. Portions created by Netscape are
18
* Copyright (C) 1998 Netscape Communications Corporation. All
28
* This file contains the declarations for all the HTML specific token types that
29
* our DTDs understand. In fact, the same set of token types is used for XML.
30
* Currently we have tokens for text, comments, start and end tags, entities,
31
* attributes, style, script and skipped content. Whitespace and newlines also
32
* have their own token types, but don't count on them to stay forever.
34
* If you're looking for the html tags, they're in a file called nsHTMLTag.h/cpp.
36
* Most of the token types have a similar API. They have methods to get the type
37
* of token (GetTokenType); those that represent HTML tags also have a method to
38
* get the tag type (GetTypeID). In addition, most have a method that causes the
39
* token to help in the parsing process (called Consume). We've also thrown in a
40
* few standard debugging methods as well.
47
#include "nsHTMLTags.h"
48
#include "nsParserError.h"
50
#include "nsScannerString.h"
54
/*******************************************************************
55
* This enum defines the set of token types that we currently support.
56
*******************************************************************/
58
enum eHTMLTokenTypes {
60
eToken_start=1, eToken_end, eToken_comment, eToken_entity,
61
eToken_whitespace, eToken_newline, eToken_text, eToken_attribute,
62
eToken_script, eToken_style, eToken_skippedcontent, eToken_instruction,
63
eToken_cdatasection, eToken_error, eToken_doctypeDecl, eToken_markupDecl,
64
eToken_last //make sure this stays the last token...
68
eHTMLCategory_unknown=0,
71
eHTMLCategory_blockAndInline,
74
eHTMLCategory_tablepart,
75
eHTMLCategory_tablerow,
76
eHTMLCategory_tabledata,
81
eHTMLCategory_options,
82
eHTMLCategory_frameset,
87
// Free-standing helper declarations; their definitions are not visible in this header.
// NOTE(review): the descriptions below are inferred from names and signatures only --
// confirm against the definitions before relying on them.
//
// Presumably consumes a quoted string from aScanner into aString and
// reports success or failure through the nsresult return -- TODO confirm.
nsresult ConsumeQuotedString(PRUnichar aChar,nsString& aString,nsScanner& aScanner);
88
// Presumably consumes attribute text from aScanner into aString and
// reports success or failure through the nsresult return -- TODO confirm.
nsresult ConsumeAttributeText(PRUnichar aChar,nsString& aString,nsScanner& aScanner);
89
// Returns a pointer to PRUnichar data for the tag identified by aTag; presumably
// aTag is an eHTMLTags value and the caller does not own the result -- TODO confirm.
const PRUnichar* GetTagName(PRInt32 aTag);
90
// Disabled declaration, kept for reference:
//PRInt32 FindEntityIndex(nsString& aString,PRInt32 aCount=-1);
95
* This declares the basic token type used in the HTML DTDs.
96
* @update gess 3/25/98
98
class CHTMLToken : public CToken {
100
virtual ~CHTMLToken();
102
CHTMLToken(eHTMLTags aTag);
104
virtual eContainerInfo GetContainerInfo(void) const {return eFormUnknown;}
105
virtual void SetContainerInfo(eContainerInfo aInfo) { }
111
* This declares start tokens, which always take the form <xxxx>.
112
* This class also knows how to consume related attributes.
114
* @update gess 3/25/98
116
class CStartToken: public CHTMLToken {
120
CStartToken(eHTMLTags aTag=eHTMLTag_unknown);
121
CStartToken(const nsAString& aString);
122
CStartToken(const nsAString& aName,eHTMLTags aTag);
124
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
125
virtual PRInt32 GetTypeID(void);
126
virtual const char* GetClassName(void);
127
virtual PRInt32 GetTokenType(void);
129
virtual PRBool IsEmpty(void);
130
virtual void SetEmpty(PRBool aValue);
132
virtual const nsAString& GetStringValue();
133
virtual void GetSource(nsString& anOutputString);
134
virtual void AppendSourceTo(nsAString& anOutputString);
136
//the following info is used to set well-formedness state on start tags...
137
virtual eContainerInfo GetContainerInfo(void) const {return mContainerInfo;}
138
virtual void SetContainerInfo(eContainerInfo aContainerInfo) {mContainerInfo=aContainerInfo;}
139
virtual PRBool IsWellFormed(void) const {return PRBool(eWellFormed==mContainerInfo);}
143
* Get and set the ID attribute atom for this element.
144
* See http://www.w3.org/TR/1998/REC-xml-19980210#sec-attribute-types
145
* for the definition of an ID attribute.
148
virtual nsresult GetIDAttributeAtom(nsIAtom** aResult);
149
virtual nsresult SetIDAttributeAtom(nsIAtom* aID);
152
nsString mTrailingContent;
154
eContainerInfo mContainerInfo;
155
nsCOMPtr<nsIAtom> mIDAttributeAtom;
158
PRPackedBool mAttributed;
164
* This declares end tokens, which always take the
165
* form </xxxx>. This class also knows how to consume
166
* related attributes.
168
* @update gess 3/25/98
170
class CEndToken: public CHTMLToken {
174
CEndToken(eHTMLTags aTag);
175
CEndToken(const nsAString& aString);
176
CEndToken(const nsAString& aName,eHTMLTags aTag);
177
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
178
virtual PRInt32 GetTypeID(void);
179
virtual const char* GetClassName(void);
180
virtual PRInt32 GetTokenType(void);
182
virtual const nsAString& GetStringValue();
183
virtual void GetSource(nsString& anOutputString);
184
virtual void AppendSourceTo(nsAString& anOutputString);
192
* This declares comment tokens. Comments are usually
193
* not thought of as tokens, but we treat them that way
194
* here so that the parser can have a consistent view
197
* @update gess 3/25/98
199
class CCommentToken: public CHTMLToken {
204
CCommentToken(const nsAString& aString);
205
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
206
virtual const char* GetClassName(void);
207
virtual PRInt32 GetTokenType(void);
208
virtual const nsAString& GetStringValue(void);
209
virtual void AppendSourceTo(nsAString& anOutputString);
211
nsresult ConsumeStrictComment(nsScanner& aScanner);
212
nsresult ConsumeQuirksComment(nsScanner& aScanner);
215
nsScannerSubstring mComment; // does not include MDO & MDC
216
nsScannerSubstring mCommentDecl; // includes MDO & MDC
221
* This class declares entity tokens, which always take
222
* the form &xxxx;. This class also offers a few utility
223
* methods that allow you to easily reduce entities.
225
* @update gess 3/25/98
227
class CEntityToken : public CHTMLToken {
232
CEntityToken(const nsAString& aString);
233
virtual const char* GetClassName(void);
234
virtual PRInt32 GetTokenType(void);
235
PRInt32 TranslateToUnicodeStr(nsString& aString);
236
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
237
static nsresult ConsumeEntity(PRUnichar aChar,nsString& aString,nsScanner& aScanner);
238
static PRInt32 TranslateToUnicodeStr(PRInt32 aValue,nsString& aString);
240
virtual const nsAString& GetStringValue(void);
241
virtual void GetSource(nsString& anOutputString);
242
virtual void AppendSourceTo(nsAString& anOutputString);
250
* Whitespace tokens are used where whitespace can be
251
* detected as distinct from text. This allows us to
252
* easily skip leading/trailing whitespace when desired.
254
* @update gess 3/25/98
256
class CWhitespaceToken: public CHTMLToken {
261
CWhitespaceToken(const nsAString& aString);
262
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
263
virtual const char* GetClassName(void);
264
virtual PRInt32 GetTokenType(void);
265
virtual const nsAString& GetStringValue(void);
272
* Text tokens contain the normalized form of html text.
273
* These tokens are guaranteed not to contain entities,
274
* start or end tags, or newlines.
276
* @update gess 3/25/98
278
class CTextToken: public CHTMLToken {
283
CTextToken(const nsAString& aString);
284
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
285
nsresult ConsumeUntil(PRUnichar aChar,PRBool aIgnoreComments,nsScanner& aScanner,
286
nsString& aEndTagName,PRInt32 aFlag,PRBool& aFlushTokens);
287
virtual const char* GetClassName(void);
288
virtual PRInt32 GetTokenType(void);
289
virtual PRInt32 GetTextLength(void);
290
virtual void CopyTo(nsAString& aStr);
291
virtual const nsAString& GetStringValue(void);
292
virtual void Bind(nsScanner* aScanner, nsScannerIterator& aStart, nsScannerIterator& aEnd);
293
virtual void Bind(const nsAString& aStr);
296
nsScannerSubstring mTextValue;
301
* CDATASection tokens contain raw unescaped text content delimited by
303
* XXX Not really a HTML construct - maybe we need a separation
305
* @update vidur 11/12/98
307
class CCDATASectionToken : public CHTMLToken {
311
CCDATASectionToken(eHTMLTags aTag = eHTMLTag_unknown);
312
CCDATASectionToken(const nsAString& aString);
313
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
314
virtual const char* GetClassName(void);
315
virtual PRInt32 GetTokenType(void);
316
virtual const nsAString& GetStringValue(void);
324
* Declaration tokens contain raw unescaped text content (not really, but
325
* right now we use this only for view source).
326
* XXX Not really a HTML construct - maybe we need a separation
329
class CMarkupDeclToken : public CHTMLToken {
334
CMarkupDeclToken(const nsAString& aString);
335
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
336
virtual const char* GetClassName(void);
337
virtual PRInt32 GetTokenType(void);
338
virtual const nsAString& GetStringValue(void);
341
nsScannerSubstring mTextValue;
346
* Attribute tokens are used to contain attribute key/value
347
* pairs wherever they may occur. Typically, they should
348
* occur only in start tokens. However, we may expand that
349
* ability when XML tokens become commonplace.
351
* @update gess 3/25/98
353
class CAttributeToken: public CHTMLToken {
358
CAttributeToken(const nsAString& aString);
359
CAttributeToken(const nsAString& aKey, const nsAString& aString);
360
~CAttributeToken() {}
361
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
362
virtual const char* GetClassName(void);
363
virtual PRInt32 GetTokenType(void);
364
virtual const nsAString& GetKey(void); // XXX {return mTextKey;}
365
virtual void SetKey(const nsAString& aKey);
366
virtual void BindKey(nsScanner* aScanner, nsScannerIterator& aStart, nsScannerIterator& aEnd);
367
virtual const nsAString& GetValue(void) {return mTextValue;}
368
virtual void SanitizeKey();
369
virtual const nsAString& GetStringValue(void);
370
virtual void GetSource(nsString& anOutputString);
371
virtual void AppendSourceTo(nsAString& anOutputString);
373
PRPackedBool mHasEqualWithoutValue;
376
PRPackedBool mLastAttribute;
378
nsAutoString mTextValue;
379
nsScannerSubstring mTextKey;
384
* Newline tokens contain, you guessed it, newlines.
385
* They consume newline (CR/LF) either alone or in pairs.
387
* @update gess 3/25/98
389
class CNewlineToken: public CHTMLToken {
394
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
395
virtual const char* GetClassName(void);
396
virtual PRInt32 GetTokenType(void);
397
virtual const nsAString& GetStringValue(void);
399
static void AllocNewline();
400
static void FreeNewline();
405
* Script tokens contain sequences of javascript (or, gulp,
406
* any other script you care to send). We don't tokenize
407
* it here, nor validate it. We just wrap it up, and pass
408
* it along to the html parser, who sends it (later on)
409
* to the scripting engine.
411
* @update gess 3/25/98
413
class CScriptToken: public CHTMLToken {
418
CScriptToken(const nsAString& aString);
419
virtual const char* GetClassName(void);
420
virtual PRInt32 GetTokenType(void);
421
virtual const nsAString& GetStringValue(void);
429
* Style tokens contain sequences of css style. We don't
430
* tokenize it here, nor validate it. We just wrap it up,
431
* and pass it along to the html parser, who sends it
432
* (later on) to the style engine.
434
* @update gess 3/25/98
436
class CStyleToken: public CHTMLToken {
441
CStyleToken(const nsAString& aString);
442
virtual const char* GetClassName(void);
443
virtual PRInt32 GetTokenType(void);
444
virtual const nsAString& GetStringValue(void);
452
* Instruction tokens (eToken_instruction) hold an instruction
453
* found in the source text, presumably a processing instruction
454
* (TODO confirm); they are not whitespace tokens.
456
* @update gess 3/25/98
458
class CInstructionToken: public CHTMLToken {
463
CInstructionToken(const nsAString& aString);
464
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
465
virtual const char* GetClassName(void);
466
virtual PRInt32 GetTokenType(void);
467
virtual const nsAString& GetStringValue(void);
473
class CErrorToken : public CHTMLToken {
477
CErrorToken(nsParserError* aError=0);
479
virtual const char* GetClassName(void);
480
virtual PRInt32 GetTokenType(void);
482
void SetError(nsParserError* aError); // CErrorToken takes ownership of aError
484
// The nsParserError object returned by GetError is still owned by CErrorToken.
485
// DO NOT use the delete operator on it. Should we change this so that a copy
486
// of nsParserError is returned which needs to be destroyed by the consumer?
487
const nsParserError* GetError(void);
489
virtual const nsAString& GetStringValue(void);
492
nsParserError* mError;
496
* This token is generated by the HTML and Expat tokenizers
497
* when they see the doctype declaration ("<!DOCTYPE ... >")
501
class CDoctypeDeclToken: public CHTMLToken {
505
CDoctypeDeclToken(eHTMLTags aTag=eHTMLTag_unknown);
506
CDoctypeDeclToken(const nsAString& aString,eHTMLTags aTag=eHTMLTag_unknown);
507
virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
508
virtual const char* GetClassName(void);
509
virtual PRInt32 GetTokenType(void);
510
virtual const nsAString& GetStringValue(void);
511
virtual void SetStringValue(const nsAString& aStr);