2
$Id: sax_html.pp,v 1.5 2003/03/16 22:38:09 sg Exp $
3
This file is part of the Free Component Library
5
HTML parser with SAX-like interface
6
Copyright (c) 2000-2002 by
7
Areca Systems GmbH / Sebastian Guenther, sg@freepascal.org
9
See the file COPYING.FPC, included in this distribution,
10
for details about the copyright.
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16
**********************************************************************}
21
* The whitespace handling does only work for processing the DOM tree.
22
Storing the DOM tree to a XML file will result in a quite ugly file.
23
(This probably has got much better with recent versions, which do
24
decent whitespace converting, but it's not tested really good.)
25
* Entity references in attribute values don't get parsed.
33
uses SysUtils, Classes, SAX, DOM, DOM_HTML;
37
{ THTMLReader: The HTML reader class }
39
THTMLScannerContext = (
41
scWhitespace, // within whitespace
42
scText, // within text
43
scEntityReference, // within entity reference ("&...;")
44
scTag); // within a start tag or end tag
46
THTMLReader = class(TSAXReader)
49
FEndOfStream: Boolean;
50
FScannerContext: THTMLScannerContext;
51
FTokenText: SAXString;
52
FCurStringValueDelimiter: Char;
53
FAttrNameRead: Boolean;
55
procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
58
destructor Destroy; override;
60
procedure Parse(AInput: TSAXInputSource); override; overload;
62
property EndOfStream: Boolean read FEndOfStream;
63
property ScannerContext: THTMLScannerContext read FScannerContext;
64
property TokenText: SAXString read FTokenText;
68
{ THTMLToDOMConverter }
70
THTMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
73
NodeType: THTMLNodeType;
77
THTMLToDOMConverter = class
80
FDocument: TDOMDocument;
83
IsFragmentMode, FragmentRootSet: Boolean;
84
FragmentRoot: TDOMNode;
86
procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
87
Start, Count: Integer);
88
procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
89
Start, Count: Integer);
90
procedure ReaderSkippedEntity(Sender: TObject; const Name: SAXString);
91
procedure ReaderStartElement(Sender: TObject;
92
const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
93
procedure ReaderEndElement(Sender: TObject;
94
const NamespaceURI, LocalName, RawName: SAXString);
97
constructor Create(AReader: THTMLReader; ADocument: TDOMDocument);
98
constructor CreateFragment(AReader: THTMLReader; AFragmentRoot: TDOMNode);
99
destructor Destroy; override;
103
// Helper functions; these ones are HTML equivalents of ReadXML[File|Fragment]
105
procedure ReadHTMLFile(var ADoc: THTMLDocument; const AFilename: String);
106
procedure ReadHTMLFile(var ADoc: THTMLDocument; var f: TStream);
108
procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
109
procedure ReadHTMLFragment(AParentNode: TDOMNode; var f: TStream);
118
WhitespaceChars = [#9, #10, #13, ' '];
121
constructor THTMLReader.Create;
124
FScannerContext := scUnknown;
127
destructor THTMLReader.Destroy;
134
procedure THTMLReader.Parse(AInput: TSAXInputSource);
136
MaxBufferSize = 1024;
138
Buffer: array[0..MaxBufferSize - 1] of Char;
139
BufferSize, BufferPos: Integer;
147
FEndOfStream := False;
150
// Read data into the input buffer
151
BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
152
if BufferSize = 0 then
154
FEndOfStream := True;
159
while BufferPos < BufferSize do
160
case ScannerContext of
162
case Buffer[BufferPos] of
164
EnterNewScannerContext(scWhitespace);
168
EnterNewScannerContext(scEntityReference);
173
EnterNewScannerContext(scTag);
176
EnterNewScannerContext(scText);
179
case Buffer[BufferPos] of
182
FTokenText := FTokenText + Buffer[BufferPos];
188
EnterNewScannerContext(scEntityReference);
193
EnterNewScannerContext(scTag);
196
EnterNewScannerContext(scText);
199
case Buffer[BufferPos] of
201
EnterNewScannerContext(scWhitespace);
205
EnterNewScannerContext(scEntityReference);
210
EnterNewScannerContext(scTag);
214
FTokenText := FTokenText + Buffer[BufferPos];
219
if Buffer[BufferPos] = ';' then
222
EnterNewScannerContext(scUnknown);
223
end else if not (Buffer[BufferPos] in
224
['a'..'z', 'A'..'Z', '0'..'9', '#']) then
225
EnterNewScannerContext(scUnknown)
228
FTokenText := FTokenText + Buffer[BufferPos];
232
case Buffer[BufferPos] of
235
if FAttrNameRead then
237
if FCurStringValueDelimiter = #0 then
238
FCurStringValueDelimiter := Buffer[BufferPos]
239
else if FCurStringValueDelimiter = Buffer[BufferPos] then
241
FCurStringValueDelimiter := #0;
242
FAttrNameRead := False;
245
FTokenText := FTokenText + Buffer[BufferPos];
250
FAttrNameRead := True;
251
FTokenText := FTokenText + Buffer[BufferPos];
257
if FCurStringValueDelimiter = #0 then
258
EnterNewScannerContext(scUnknown);
262
FTokenText := FTokenText + Buffer[BufferPos];
270
procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
272
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
276
ValueDelimiter: Char;
282
Result := LowerCase(s)
285
Result := LowerCase(Copy(s, 1, i - 1));
286
Attr := TSAXAttributes.Create;
290
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
293
SetLength(AttrName, 0);
296
while j <= Length(s) do
299
AttrName := LowerCase(Copy(s, i, j - i));
301
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
303
ValueDelimiter := s[j];
306
ValueDelimiter := #0;
309
while j <= Length(s) do
310
if ValueDelimiter = #0 then
311
if s[j] in WhitespaceChars then
315
else if s[j] = ValueDelimiter then
322
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
327
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
331
else if s[j] in WhitespaceChars then
333
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
335
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
344
Attr: TSAXAttributes;
345
EntString, TagName: String;
350
case ScannerContext of
352
DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText));
354
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
357
if ResolveHTMLEntityReference(TokenText, Ent) then
360
DoCharacters(PSAXChar(EntString), 0, 1);
363
{ Is this a predefined Unicode character entity? We must check this,
364
as undefined entities must be handled as text, for compatiblity
365
to popular browsers... }
367
for i := Low(UnicodeHTMLEntities) to High(UnicodeHTMLEntities) do
368
if UnicodeHTMLEntities[i] = TokenText then
374
DoSkippedEntity(TokenText)
376
DoCharacters(PSAXChar('&' + TokenText), 0, Length(TokenText) + 1);
380
if Length(TokenText) > 0 then
383
if TokenText[1] = '/' then
386
SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
387
end else if TokenText[1] <> '!' then
389
// Do NOT combine to a single line, as Attr is an output value!
390
TagName := SplitTagString(TokenText, Attr);
391
DoStartElement('', TagName, '', Attr);
393
if Assigned(Attr) then
397
FScannerContext := NewContext;
398
SetLength(FTokenText, 0);
399
FCurStringValueDelimiter := #0;
400
FAttrNameRead := False;
404
{ THTMLToDOMConverter }
406
constructor THTMLToDOMConverter.Create(AReader: THTMLReader;
407
ADocument: TDOMDocument);
411
FReader.OnCharacters := @ReaderCharacters;
412
FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
413
FReader.OnSkippedEntity := @ReaderSkippedEntity;
414
FReader.OnStartElement := @ReaderStartElement;
415
FReader.OnEndElement := @ReaderEndElement;
416
FDocument := ADocument;
417
FElementStack := TList.Create;
418
FNodeBuffer := TList.Create;
421
constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
422
AFragmentRoot: TDOMNode);
426
FReader.OnCharacters := @ReaderCharacters;
427
FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
428
FReader.OnSkippedEntity := @ReaderSkippedEntity;
429
FReader.OnStartElement := @ReaderStartElement;
430
FReader.OnEndElement := @ReaderEndElement;
431
FDocument := AFragmentRoot.OwnerDocument;
432
FElementStack := TList.Create;
433
FNodeBuffer := TList.Create;
434
FragmentRoot := AFragmentRoot;
435
IsFragmentMode := True;
438
destructor THTMLToDOMConverter.Destroy;
442
// Theoretically, always exactly one item will remain - the root element:
443
for i := 0 to FNodeBuffer.Count - 1 do
444
THTMLNodeInfo(FNodeBuffer[i]).Free;
451
procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
452
const ch: PSAXChar; Start, Count: Integer);
455
NodeInfo: THTMLNodeInfo;
458
Move(ch^, s[1], Count * SizeOf(SAXChar));
460
NodeInfo := THTMLNodeInfo.Create;
461
NodeInfo.NodeType := ntText;
462
NodeInfo.DOMNode := FDocument.CreateTextNode(s);
463
FNodeBuffer.Add(NodeInfo);
466
procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
467
const ch: PSAXChar; Start, Count: Integer);
470
NodeInfo: THTMLNodeInfo;
473
Move(ch^, s[1], Count * SizeOf(SAXChar));
475
NodeInfo := THTMLNodeInfo.Create;
476
NodeInfo.NodeType := ntWhitespace;
477
NodeInfo.DOMNode := FDocument.CreateTextNode(s);
478
FNodeBuffer.Add(NodeInfo);
481
procedure THTMLToDOMConverter.ReaderSkippedEntity(Sender: TObject;
482
const Name: SAXString);
484
NodeInfo: THTMLNodeInfo;
486
NodeInfo := THTMLNodeInfo.Create;
487
NodeInfo.NodeType := ntEntityReference;
488
NodeInfo.DOMNode := FDocument.CreateEntityReference(Name);
489
FNodeBuffer.Add(NodeInfo);
492
procedure THTMLToDOMConverter.ReaderStartElement(Sender: TObject;
493
const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
495
NodeInfo: THTMLNodeInfo;
496
Element: TDOMElement;
499
// WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
500
Element := FDocument.CreateElement(LocalName);
501
if Assigned(Attr) then
503
// WriteLn('Attribute: ', Attr.GetLength);
504
for i := 0 to Attr.GetLength - 1 do
506
// WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
507
Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
511
NodeInfo := THTMLNodeInfo.Create;
512
NodeInfo.NodeType := ntTag;
513
NodeInfo.DOMNode := Element;
514
if IsFragmentMode then
516
if not FragmentRootSet then
518
FragmentRoot.AppendChild(Element);
519
FragmentRootSet := True;
522
if not Assigned(FDocument.DocumentElement) then
523
FDocument.AppendChild(Element);
524
FNodeBuffer.Add(NodeInfo);
525
// WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
528
procedure THTMLToDOMConverter.ReaderEndElement(Sender: TObject;
529
const NamespaceURI, LocalName, RawName: SAXString);
531
NodeInfo, NodeInfo2: THTMLNodeInfo;
533
TagInfo: PHTMLElementProps;
535
// WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
536
// Find the matching start tag
537
i := FNodeBuffer.Count - 1;
540
NodeInfo := THTMLNodeInfo(FNodeBuffer.Items[i]);
541
if (NodeInfo.NodeType = ntTag) and
542
(CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
544
// We found the matching start tag
547
for j := Low(HTMLElProps) to High(HTMLElProps) do
548
if CompareText(HTMLElProps[j].Name, LocalName) = 0 then
550
TagInfo := @HTMLElProps[j];
555
while i < FNodeBuffer.Count do
557
NodeInfo2 := THTMLNodeInfo(FNodeBuffer.Items[i]);
559
if (NodeInfo2.NodeType = ntWhitespace) and Assigned(TagInfo) and
560
(not (efPreserveWhitespace in TagInfo^.Flags)) then
561
// Handle whitespace, which doesn't need to get preserved...
562
if not (efPCDATAContent in TagInfo^.Flags) then
563
// No character data allowed within the current element
564
NodeInfo2.DOMNode.Free
567
// Character data allowed, so normalize it
568
NodeInfo2.DOMNode.NodeValue := ' ';
569
NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode)
572
NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
575
FNodeBuffer.Delete(i);
584
procedure ReadHTMLFile(var ADoc: THTMLDocument; const AFilename: String);
589
f := TFileStream.Create(AFilename, fmOpenRead);
591
ReadHTMLFile(ADoc, f);
597
procedure ReadHTMLFile(var ADoc: THTMLDocument; var f: TStream);
600
Converter: THTMLToDOMConverter;
602
ADoc := THTMLDocument.Create;
603
Reader := THTMLReader.Create;
605
Converter := THTMLToDOMConverter.Create(Reader, ADoc);
607
Reader.ParseStream(f);
616
procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
620
f := TFileStream.Create(AFilename, fmOpenRead);
622
ReadHTMLFragment(AParentNode, f);
628
procedure ReadHTMLFragment(AParentNode: TDOMNode; var f: TStream);
631
Converter: THTMLToDOMConverter;
633
Reader := THTMLReader.Create;
635
Converter := THTMLToDOMConverter.CreateFragment(Reader, AParentNode);
637
Reader.ParseStream(f);
651
$Log: sax_html.pp,v $
652
Revision 1.5 2003/03/16 22:38:09 sg
653
* Added fragment parsing functions
655
Revision 1.4 2002/12/14 19:18:21 sg
656
* Improved whitespace handling (although it's still not perfect in all
659
Revision 1.3 2002/12/12 20:17:32 sg
660
* More WideString fixes
662
Revision 1.2 2002/12/12 13:43:38 michael
663
+ Patches from peter to fix 1.1 compile
665
Revision 1.1 2002/12/11 21:06:07 sg
667
* Replaced htmldoc unit with dom_html unit
668
* Added SAX parser framework and SAX HTML parser