1
{ Simple Wiki parser for the FreePascal/Lazarus Wiki export pages
3
Copyright (C) 2012 Mattias Gaertner mattias@freepascal.org
5
This source is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free
7
Software Foundation; either version 2 of the License, or (at your option)
10
This code is distributed in the hope that it will be useful, but WITHOUT ANY
11
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
15
A copy of the GNU General Public License is available on the World Wide Web
16
at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
17
to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
links without brackets: http://... see bidimode
22
div, div class="key", font: this is pure HTML and maybe should be better fixed in the wiki
23
attributes in pre <pre>'''Text'''</pre> see page BidiMode
24
code in list items, see Compile_and_Develop_on_Maemo_device
30
{ $DEFINE VerboseWikiStack}
31
{ $DEFINE VerboseWikiOnToken}
32
{ $DEFINE VerboseUnknownOpenTags}
37
Classes, SysUtils, laz2_XMLRead, laz2_DOM, LazUTF8, LazLogger,
38
BasicCodeTools, KeywordFuncLists;
44
wptText, // TWPTextToken
45
wptAttribute, // e.g. class="code" TWPNameValueToken
46
wptLineBreak, // <br> </br> <br/>
51
wptStrikeTagShort, // <s>
52
wptStrikeTagLong, // <strike>
53
wptUnderlineTag, // <u>
59
wptString, // <string>
64
wptCode, // TWPNameValueToken
65
wptSpecial, // {{text}}
66
wptPre, // space at line start
71
wptCenter, // <center>
72
wptInternLink, // [[]]
74
wptHorizontalRow, // ----
76
wptOrderedListTag, // <ol>
78
wptUnorderedListTag, // <ul>
79
wptDefinitionList, // : or ;
81
wptListItemTag, // <li>
82
wptTable, // wiki tag for table
83
wptTableTag, // <table>
84
wptTableRow, // wiki tag for table row
85
wptTableRowTag, // <tr>
86
wptTableHeadCell, // wiki tag for table head cell
87
wptTableHeadCellTag, // <th>
88
wptTableCell, // wiki tag for table cell
89
wptTableCellTag, // <td>
90
wptSection, // started/ended by =
91
wptSubSection, // started automatically, ended on empty line
97
TWPTokenTypes = set of TWPTokenType;
104
TWPTokenInfoFlags = set of TWPTokenInfoFlag;
114
TWPTokenGroups = set of TWPTokenGroup;
122
TWPTokenInfo = record
124
Flags: TWPTokenInfoFlags;
125
Group: TWPTokenGroup;
126
BaseToken: TWPTokenType;
130
WPTWikiLists = [wptNumberedList,wptBulletList,wptDefinitionList,wptListItem];
132
WPTokenInfos: array[TWPTokenType] of TWPTokenInfo = (
133
(Caption: 'Text'; Flags: []; Group: wpgFont; BaseToken: wptText), // wptText,
134
(Caption: 'Attribute'; Flags: []; Group: wpgFont; BaseToken: wptAttribute), // wptAttribute,
135
(Caption: 'LineBreak'; Flags: []; Group: wpgFont; BaseToken: wptLineBreak), // wptLineBreak,
136
(Caption: 'Bold'; Flags: []; Group: wpgFont; BaseToken: wptBold), // wptBold,
137
(Caption: 'BoldTag'; Flags: []; Group: wpgFont; BaseToken: wptBold), // wptBoldTag,
138
(Caption: 'Italic'; Flags: []; Group: wpgFont; BaseToken: wptItalic), // wptItalic,
139
(Caption: 'ItalicTag'; Flags: []; Group: wpgFont; BaseToken: wptItalic), // wptItalicTag,
140
(Caption: 'StrikeTagShort'; Flags: []; Group: wpgFont; BaseToken: wptStrikeTagShort), // wptStrikeTagShort,
141
(Caption: 'StrikeTagLong'; Flags: []; Group: wpgFont; BaseToken: wptStrikeTagShort), // wptStrikeTagLong,
142
(Caption: 'UnderlineTag'; Flags: []; Group: wpgFont; BaseToken: wptUnderlineTag), // wptUnderlineTag,
143
(Caption: 'TT'; Flags: []; Group: wpgFont; BaseToken: wptTT), // wptTT,
144
(Caption: 'Sup'; Flags: []; Group: wpgFont; BaseToken: wptSup), // wptSup,
145
(Caption: 'Sub'; Flags: []; Group: wpgFont; BaseToken: wptSub), // wptSub,
146
(Caption: 'Small'; Flags: []; Group: wpgFont; BaseToken: wptSmall), // wptSmall,
147
(Caption: 'Em'; Flags: []; Group: wpgFont; BaseToken: wptEm), // wptEm,
148
(Caption: 'String'; Flags: []; Group: wpgFont; BaseToken: wptString), // wptString,
149
(Caption: 'Var'; Flags: []; Group: wpgFont; BaseToken: wptVar), // wptVar,
150
(Caption: 'Key'; Flags: []; Group: wpgFont; BaseToken: wptKey), // wptKey,
151
(Caption: 'Cmt'; Flags: []; Group: wpgFont; BaseToken: wptCmt), // wptCmt,
152
(Caption: 'Span'; Flags: []; Group: wpgFont; BaseToken: wptSpan), // wptSpan,
153
(Caption: 'Code'; Flags: []; Group: wpgFont; BaseToken: wptCode), // wptCode,
154
(Caption: 'Special'; Flags: []; Group: wpgFont; BaseToken: wptSpecial), // wptSpecial,
155
(Caption: 'Pre'; Flags: []; Group: wpgParagraph; BaseToken: wptPre), // wptPre,
156
(Caption: 'PreTag'; Flags: []; Group: wpgParagraph; BaseToken: wptPre), // wptPreTag,
157
(Caption: 'P'; Flags: []; Group: wpgParagraph; BaseToken: wptP), // wptP,
158
(Caption: 'PTag'; Flags: []; Group: wpgParagraph; BaseToken: wptP), // wptPTag,
159
(Caption: 'DivTag'; Flags: []; Group: wpgParagraph; BaseToken: wptP), // wptDivTag,
160
(Caption: 'Center'; Flags: []; Group: wpgParagraph; BaseToken: wptCenter), // wptCenter
161
(Caption: 'InternLink'; Flags: []; Group: wpgParagraph; BaseToken: wptInternLink), // wptInternLink,
162
(Caption: 'ExternLink'; Flags: []; Group: wpgParagraph; BaseToken: wptExternLink), // wptExternLink,
163
(Caption: 'HorizontalRow'; Flags: []; Group: wpgParagraph; BaseToken: wptHorizontalRow), // wptHorizontalRow,
164
(Caption: 'NumberedList'; Flags: []; Group: wpgList; BaseToken: wptNumberedList), // wptNumberedList,
165
(Caption: 'OrderedListTag'; Flags: []; Group: wpgList; BaseToken: wptNumberedList), // wptOrderedListTag,
166
(Caption: 'BulletList'; Flags: []; Group: wpgList; BaseToken: wptBulletList), // wptBulletList,
167
(Caption: 'UnorderedListTag'; Flags: []; Group: wpgList; BaseToken: wptBulletList), // wptUnorderedListTag,
168
(Caption: 'DefinitionList'; Flags: []; Group: wpgList; BaseToken: wptDefinitionList), // wptDefinitionList,
169
(Caption: 'ListItem'; Flags: []; Group: wpgList; BaseToken: wptListItem), // wptListItem,
170
(Caption: 'ListItemTag'; Flags: []; Group: wpgList; BaseToken: wptListItem), // wptListItemTag,
171
(Caption: 'Table'; Flags: []; Group: wpgTable; BaseToken: wptTable), // wptTable,
172
(Caption: 'TableTag'; Flags: []; Group: wpgTable; BaseToken: wptTable), // wptTableTag,
173
(Caption: 'TableRow'; Flags: []; Group: wpgTable; BaseToken: wptTableRow), // wptTableRow,
174
(Caption: 'TableRowTag'; Flags: []; Group: wpgTable; BaseToken: wptTableRow), // wptTableRowTag,
175
(Caption: 'TableHeadCell'; Flags: []; Group: wpgTable; BaseToken: wptTableHeadCell), // wptTableHeadCell,
176
(Caption: 'TableHeadCellTag'; Flags: []; Group: wpgTable; BaseToken: wptTableHeadCell), // wptTableHeadCellTag,
177
(Caption: 'TableCell'; Flags: []; Group: wpgTable; BaseToken: wptTableCell), // wptTableCell,
178
(Caption: 'TableCellTag'; Flags: []; Group: wpgTable; BaseToken: wptTableCell), // wptTableCellTag,
179
(Caption: 'Section'; Flags: []; Group: wpgSection; BaseToken: wptSection), // wptSection,
180
(Caption: 'SubSection'; Flags: []; Group: wpgSubSection; BaseToken: wptP), // wptSubSection,
181
(Caption: 'Header'; Flags: []; Group: wpgSection; BaseToken: wptHeader), // wptHeader,
182
(Caption: 'Header1'; Flags: []; Group: wpgSection; BaseToken: wptHeader), // wptHeader1,
183
(Caption: 'Header2'; Flags: []; Group: wpgSection; BaseToken: wptHeader), // wptHeader2,
184
(Caption: 'Header3'; Flags: []; Group: wpgSection; BaseToken: wptHeader) // wptHeader3,
186
WPTokenRangeNames: array[TWPTokenRange] of string = (
200
SubToken: TWPTokenType;
201
Range: TWPTokenRange;
204
constructor Create(ThePage: TWikiPage; TheUserDate: Pointer);
207
TWPTextToken = class(TWPToken)
209
StartPos, EndPos: integer;
212
TWPLinkToken = class(TWPToken)
214
LinkStartPos, LinkEndPos: integer;
215
Link: string; // trimmed and cleaned up
216
CaptionStartPos, CaptionEndPos: integer;
219
TWPNameValueToken = class(TWPToken)
221
NameStartPos, NameEndPos: integer;
222
ValueStartPos, ValueEndPos: integer;
225
TWikiTokenEvent = procedure(Token: TWPToken) of object;
227
TWikiPageVerbosity = (
235
TWikiOnLog = procedure(Msg: string) of object;
242
TWPStackItem = record
246
PStackItem = ^TWPStackItem;
250
FAutoFixUTF8: boolean;
251
FLanguageTags: TKeyWordFunctionList;
255
FStackCapacity: integer;
263
FRangeToken: TWPToken;
265
FTextToken: TWPTextToken;
266
FLinkToken: TWPLinkToken;
267
FNameValueToken: TWPNameValueToken;
268
FOnToken: TWikiTokenEvent;
269
FVerbosity: TWikiPageVerbosity;
270
FInPre: integer; // >0 means in a pre range
271
procedure HandleAngleBracket; // tags
272
procedure HandleCode; // <code>
273
procedure HandleApostroph; // bold, italic
274
procedure HandleCurlyBracketOpen; // special, start of table
275
procedure HandlePipe; // new row, end of table
276
procedure HandleExclamationMark; // head cell
277
procedure HandleEdgedBracketOpen; // links
278
procedure HandleUnderScore; // __TOC__
279
procedure HandleEqual; // headers
280
procedure HandleListChar; // lists '*', '#', ':', ';'
281
procedure HandleSpace; // preserve space
282
procedure EmitFlag(Typ: TWPTokenType; Range: TWPTokenRange; TagLen: integer);
283
procedure EmitToggle(Typ: TWPTokenType; TagLen: integer);
284
procedure EmitTag(Typ: TWPTokenType; Range: TWPTokenRange);
285
procedure EmitLineBreak;
286
procedure EmitTextToken;
288
procedure ParseAttributes(StartPos, EndPos: PChar);
289
procedure ParseNoWiki;
290
procedure CloseTableCell;
291
procedure CloseRangeToken(Typ: TWPTokenType);
292
procedure OpenRangeToken(Typ: TWPTokenType);
293
function FindTagEnd(TagStart: PChar): PChar;
294
procedure SetAutoFixUTF8(AValue: boolean);
295
procedure SetSrc(AValue: string);
296
function TokenIs(Tag: PChar): boolean;
297
procedure ClearStack;
298
procedure Push(Token: TWPTokenType; StartPos: PChar);
299
function Pop(Token: TWPTokenType): boolean;
300
procedure Pop(Index: integer);
301
function TopToken: TWPTokenType;
302
function FindGroupStackPos(Group: TWPTokenGroup; OrHigher: boolean): integer;
303
function FindStackItem(Typ: TWPTokenType): integer;
304
procedure DoToken(Token: TWPToken);
307
destructor Destroy; override;
308
procedure LoadFromFile(Filename: string);
309
procedure LoadFromDoc(doc: TDOMNode);
310
procedure Parse(const OnToken: TWikiTokenEvent; Data: Pointer = nil);
311
property ID: String read FID write FID; // mediawiki/siteinfo/page/id
312
property Title: String read FTitle write FTitle; // mediawiki/siteinfo/page/title
313
property Revision: String read FRevision write FRevision; // mediawiki/siteinfo/page/revision/id
314
property TimeStamp: String read FTimeStamp write FTimeStamp; // mediawiki/siteinfo/page/timestamp
315
property Filename: string read FFilename write FFilename; // mediawiki/siteinfo/page/id
316
property BaseURL: string read FBaseURL write FBaseURL; // ExtractFilePath(mediawiki/siteinfo/base)
317
property Verbosity: TWikiPageVerbosity read FVerbosity write FVerbosity;
318
property AutoFixUTF8: boolean read FAutoFixUTF8 write SetAutoFixUTF8;
320
property Src: string read FSrc write SetSrc;
321
function StrPos(p: PChar): integer;
322
function PosToStr(p: PChar; WithFilename: boolean = false): string;
323
function PosToStr(p: integer; WithFilename: boolean = false): string;
324
function AtLineStart(p: PChar): boolean;
325
function TrimLink(const Link: string): string;
326
function CurrentPos: integer;
327
property LanguageTags: TKeyWordFunctionList read FLanguageTags write FLanguageTags;
328
procedure Log(Msg: string);
329
property OnLog: TWikiOnLog read FOnLog write FOnLog;
334
IsWikiTagChar: array[char] of boolean;
336
// normalize link to get the page, e.g. convert spaces to underscores
337
function WikiInternalLinkToPage(Link: string): string;
338
function WikiIsExternalLink(Link: string): boolean;
340
function GetWikiPageID(doc: TDOMNode): string;
341
function GetWikiPageID(s: TStream): string;
342
function WikiPageToCaseID(Page: string): string; // create a bit vector for each letter
344
function dbgs(t: TWPTokenType): string; overload;
345
function dbgs(r: TWPTokenRange): string; overload;
351
constructor TWPToken.Create(ThePage: TWikiPage; TheUserDate: Pointer);
354
UserData:=TheUserDate;
359
function TWikiPage.StrPos(p: PChar): integer;
361
Result:=p-PChar(FSrc)+1;
364
function TWikiPage.AtLineStart(p: PChar): boolean;
366
Result:=(p=PChar(FSrc)) or (p[-1] in [#10,#13]);
369
function TWikiPage.PosToStr(p: PChar; WithFilename: boolean): string;
373
else if (Src='') then
375
else if p<PChar(FSrc) then
376
Result:='(invalid pos <0)'
378
Result:=PosToStr(StrPos(p),WithFilename);
382
function TWikiPage.PosToStr(p: integer; WithFilename: boolean): string;
388
if SrcPosToLineCol(FSrc,p,y,x) then
389
Result:='('+IntToStr(y)+','+IntToStr(x)+')'
392
if WithFilename then begin
395
s:=ExtractFilename(Filename)
396
else if Title<>'' then begin
399
s:=LeftStr(s,19)+'...'+RightStr(s,19);
401
Result:=Result+' in "'+s+'"'
405
procedure TWikiPage.SetAutoFixUTF8(AValue: boolean);
407
if FAutoFixUTF8=AValue then Exit;
408
FAutoFixUTF8:=AValue;
413
procedure TWikiPage.SetSrc(AValue: string);
415
if FSrc=AValue then Exit;
417
if AutoFixUTF8 then FixUTF8;
420
function TWikiPage.TokenIs(Tag: PChar): boolean;
425
while (p2^<>#0) and (UpChars[p2^]=UpChars[Tag^]) do begin
432
procedure TWikiPage.ClearStack;
434
ReAllocMem(FStack,0);
440
procedure TWikiPage.Push(Token: TWPTokenType; StartPos: PChar);
442
NewCapacity: Integer;
446
{$IFDEF VerboseWikiStack}
447
Log(['Push :',GetIndentStr(FStackPtr*2),dbgs(Token),' at ',PosToStr(FCurP)]);
449
if FStackPtr>=FStackCapacity then begin
450
NewCapacity:=FStackCapacity*2+8;
451
ReAllocMem(FStack,SizeOf(TWPStackItem)*NewCapacity);
452
FStackCapacity:=NewCapacity;
454
Item:=@FStack[FStackPtr];
456
Item^.StartPos:=StartPos;
457
if Token in [wptPre,wptPreTag] then
459
OpenRangeToken(Token);
462
function TWikiPage.Pop(Token: TWPTokenType): boolean;
464
procedure LogMissingClose;
468
Item:=@FStack[FStackPtr];
469
Log('TWikiPage.Pop WARNING: missing closing for '+dbgs(Item^.Token)+' at '+PosToStr(FCurP,true));
472
procedure LogNotOpen;
474
Log('TWikiPage.Pop Hint: tag was not open: '+dbgs(Token)+' at '+PosToStr(FCurP,true));
479
Group: TWPTokenGroup;
484
Group:=WPTokenInfos[Token].Group;
485
while (i>=0) and (ord(WPTokenInfos[FStack[i].Token].Group) <= ord(Group)) do
487
if FStack[i].Token=Token then begin
489
while FStackPtr>=i do begin
490
Item:=@FStack[FStackPtr];
491
if (Verbosity>=wpvWarning)
492
and (FStackPtr>i) and (wpifWarnOnAutoClose in WPTokenInfos[Item^.Token].Flags)
495
{$IFDEF VerboseWikiStack}
496
Log('Pop :'+GetIndentStr(FStackPtr*2)+dbgs(Item^.Token)+' at '+PosToStr(FCurP));
498
if Item^.Token in [wptPre,wptPreTag] then
500
CloseRangeToken(Item^.Token);
508
if Verbosity>=wpvHint then
512
procedure TWikiPage.Pop(Index: integer);
514
if Index<0 then Index:=0;
515
while FStackPtr>=Index do
519
function TWikiPage.TopToken: TWPTokenType;
522
Result:=FStack[FStackPtr].Token
527
function TWikiPage.FindGroupStackPos(Group: TWPTokenGroup; OrHigher: boolean
530
CurGroup: TWPTokenGroup;
533
while (Result>=0) do begin
534
CurGroup:=WPTokenInfos[FStack[Result].Token].Group;
535
if (ord(CurGroup)>=ord(Group)) then begin
536
if (not OrHigher) and (CurGroup<>Group) then
544
function TWikiPage.FindStackItem(Typ: TWPTokenType): integer;
547
while (Result>=0) and (FStack[Result].Token<>Typ) do dec(Result);
550
procedure TWikiPage.DoToken(Token: TWPToken);
551
{$IFDEF VerboseWikiOnToken}
556
Token.Token:=WPTokenInfos[Token.SubToken].BaseToken;
557
{$IFDEF VerboseWikiOnToken}
560
Log('Token:'+GetIndentStr(i*2)+dbgs(Token.Token)+' at '+PosToStr(FCurP));
565
procedure TWikiPage.EmitTextToken;
567
if FStackPtr<0 then begin
568
// highest level => skip space at start
569
while (FLastEmitPos<FCurP) and (FLastEmitPos^ in [#1..#32]) do
572
if FCurP<=FLastEmitPos then exit;
574
if (FStackPtr<0) or (TopToken=wptSection) then begin
575
// highest level => start a paragraph
576
Push(wptSubSection,FCurP);
578
// maybe: add an option and when enabled combine multiple spaces and replace line breaks with space
579
FTextToken.SubToken:=wptText;
580
FTextToken.Range:=wprNone;
581
FTextToken.StartPos:=StrPos(FLastEmitPos);
582
FTextToken.EndPos:=StrPos(FCurP);
587
procedure TWikiPage.ParseAttributes(StartPos, EndPos: PChar);
591
//Log('TWikiPage.ParseAttributes '+PosToStr(StartPos)+' '+PosToStr(EndPos)+' <'+dbgstr(StartPos,EndPos-StartPos),'>');
595
while p^ in [' ',#9,#10,#13] do inc(p);
596
if p>=EndPos then break;
598
if not IsIdentStartChar[p^] then break;
599
FNameValueToken.NameStartPos:=StrPos(p);
600
while IsIdentChar[p^] do inc(p);
601
FNameValueToken.NameEndPos:=StrPos(p);
603
while p^ in [' ',#9,#10,#13] do inc(p);
604
if p>=EndPos then break;
606
if p^<>'=' then break;
609
while p^ in [' ',#9,#10,#13] do inc(p);
610
if p>=EndPos then break;
612
if p^<>'"' then break;
614
FNameValueToken.ValueStartPos:=StrPos(p);
615
while not (p^ in ['"',#0]) do inc(p);
616
if p^<>'"' then break;
617
FNameValueToken.ValueEndPos:=StrPos(p);
618
if p>=EndPos then break;
619
FNameValueToken.SubToken:=wptAttribute;
620
DoToken(FNameValueToken);
623
//Log(['TWikiPage.ParseAttributes stopped at <'+dbgstr(StartPos,p-StartPos)+'>');
626
procedure TWikiPage.ParseNoWiki;
629
// this is not the same as pre (preformatted treats spaces and line breaks)
631
FCurP:=FindTagEnd(FCurP);
637
if TokenIs('</nowiki>') then
643
FCurP:=FindTagEnd(FCurP);
647
procedure TWikiPage.CloseTableCell;
651
while FStackPtr>=0 do begin
653
if (t in [wptTableCell,wptTableHeadCell])
654
or (WPTokenInfos[t].Group<wpgTable) then
661
function TWikiPage.TrimLink(const Link: string): string;
663
Result:=UTF8Trim(Link);
666
function TWikiPage.CurrentPos: integer;
668
Result:=StrPos(FCurP);
671
procedure TWikiPage.Log(Msg: string);
673
if Assigned(OnLog) then
679
procedure TWikiPage.CloseRangeToken(Typ: TWPTokenType);
681
FRangeToken.SubToken:=Typ;
682
FRangeToken.Range:=wprClose;
683
DoToken(FRangeToken);
686
procedure TWikiPage.OpenRangeToken(Typ: TWPTokenType);
688
FRangeToken.SubToken:=Typ;
689
FRangeToken.Range:=wprOpen;
690
DoToken(FRangeToken);
693
function TWikiPage.FindTagEnd(TagStart: PChar): PChar;
696
if Result^='<' then inc(Result);
697
if Result^='/' then inc(Result);
698
while IsWikiTagChar[Result^] do inc(Result);
699
while Result^<>#0 do begin
701
#0,'<','>','/': break;
705
until Result^ in ['"','>','<',#0];
709
until Result^ in ['''','>','<',#0];
714
if Result^='/' then inc(Result);
715
if Result^='>' then inc(Result);
718
procedure TWikiPage.HandleUnderScore;
720
if (FCurP[1]='_') and (AtLineStart(FCurP)) and TokenIs('__TOC__') then begin
722
inc(FCurP, length('__TOC__'));
728
procedure TWikiPage.HandleEqual;
735
if (FInPre>0) then begin
739
// header => close section(s), start new section
741
while (Depth<MaxHeaderDepth) and (FCurP[Depth]='=') do inc(Depth);
745
while (i<=FStackPtr) do begin
749
else if t=wptHeader then begin
750
// this is the end of the header
760
//Log(['HandleHeader START '+PosToStr(FCurP)+' '+AtLineStart(FCurP));
761
if not AtLineStart(FCurP) then begin
769
while (FStackPtr>=0) and (OldDepth>=Depth) do begin
770
if FStack[FStackPtr].Token=wptSection then
774
// start new section(s) (it is allowed to start a subsubsection without a subsection)
775
for i:=OldDepth+1 to Depth do
776
Push(wptSection,FCurP);
778
Push(wptHeader,FCurP);
783
procedure TWikiPage.HandleListChar;
785
function CharToListType(c: char): TWPTokenType;
788
'*': Result:=wptBulletList;
789
'#': Result:=wptNumberedList;
790
':',';': Result:=wptDefinitionList;
791
else Result:=wptText;
800
if (not AtLineStart(FCurP)) or (FInPre>0) then begin
806
while FCurP[NewDepth] in ['*','#',':',';'] do inc(NewDepth);
808
// a list closes all fonts and spans => skip all fonts and spans
809
i:=FindGroupStackPos(wpgList,true);
810
// check all lists with wiki syntax, keep lists with html syntax
812
and (FStack[i].Token in WPTWikiLists) do
816
while (CurDepth<NewDepth) do begin
817
// compare old list hierarchy with new list hierarchy
818
if (i>FStackPtr) or (FStack[i].Token<>CharToListType(FCurP[CurDepth])) then begin
819
{dbgout(['TWikiPage.HandleListChar listtype does not fit: i=',i,' CurDepth=',CurDepth,' should=',dbgs(CharToListType(FCurP[CurDepth]))]);
820
if i<=FStackPtr then dbgout(' is=',dbgs(FStack[i].Token));
825
EmitFlag(CharToListType(FCurP[CurDepth]),wprOpen,0);
829
if CurDepth=NewDepth then begin
830
// close fonts, spans and previous list item
831
//Log('TWikiPage.HandleListChar close fonts, spans, listitem');
834
if (i>FStackPtr) then
835
EmitFlag(wptListItem,wprOpen,0); // new list item
836
if FStack[i].Token<>wptListItem then
837
raise Exception.Create('broken list: should='+dbgs(wptListItem)+' is='+dbgs(FStack[i].Token));
845
procedure TWikiPage.HandleSpace;
849
if (not AtLineStart(FCurP)) or (FInPre>0) then begin
854
while (NonSpace^ in [' ',#9]) do inc(NonSpace);
855
if NonSpace^ in [#10,#13,#0] then begin
861
//Log('TWikiPage.HandleSpace start pre "'+dbgstr(GetLineInSrc(Src,StrPos(FCurP)))+'"');
863
EmitFlag(wptPre,wprOpen,1);
865
while not (FCurP^ in [#10,#13,#0]) do inc(FCurP);
867
if FCurP^=#0 then break;
868
if (FCurP[1] in [#10,#13]) and (FCurP^<>FCurP[1]) then
872
if FCurP^<>' ' then break;
873
// next line is also preformatted
875
//Log('TWikiPage.HandleSpace line break');
877
EmitFlag(wptLineBreak,wprNone,0);
879
//Log('TWikiPage.HandleSpace end pre');
881
EmitFlag(wptPre,wprClose,0);
884
procedure TWikiPage.HandleCurlyBracketOpen;
886
if (FCurP[1]='{') and (FInPre=0) then begin
887
// {{special}} or {{name|special}}
890
FNameValueToken.NameStartPos:=StrPos(FCurP);
894
'}': if FCurP[1]='}' then break;
898
if FCurP^='|' then begin
899
FNameValueToken.NameEndPos:=StrPos(FCurP);
901
FNameValueToken.ValueStartPos:=StrPos(FCurP);
903
FNameValueToken.NameEndPos:=FNameValueToken.NameStartPos;
904
FNameValueToken.ValueStartPos:=FNameValueToken.NameStartPos;
909
'}': if FCurP[1]='}' then break;
913
FNameValueToken.ValueEndPos:=StrPos(FCurP);
914
if FCurP^='}' then inc(FCurP,2);
916
FNameValueToken.SubToken:=wptSpecial;
917
DoToken(FNameValueToken);
918
end else if (FCurP[1]='|') and AtLineStart(FCurP) and (FInPre=0) then begin
921
EmitFlag(wptTable,wprOpen,2);
922
// rest of line are attributes
923
while not (FCurP^ in [#0,#10,#13]) do inc(FCurP);
924
ParseAttributes(FLastEmitPos,FCurP);
925
while FCurP^ in [#10,#13] do inc(FCurP);
926
if (FCurP^='|') and (FCurP[1]='+') then begin
929
EmitFlag(wptTableRow,wprOpen,2);
930
EmitFlag(wptTableHeadCell,wprOpen,0);
937
procedure TWikiPage.HandlePipe;
941
i:=FindGroupStackPos(wpgTable,false);
944
if AtLineStart(FCurP) then begin
945
if (FCurP[1]='-') then begin
948
if TopToken=wptTableRow then
950
EmitFlag(wptTableRow,wprOpen,2);
951
while FCurP^='-' do inc(FCurP);
954
while not (FCurP^ in [#0,#10,#13]) do inc(FCurP);
955
ParseAttributes(FLastEmitPos,FCurP);
958
end else if FCurP[1]='}' then begin
960
EmitFlag(wptTable,wprClose,2);
964
if AtLineStart(FCurP) or (FCurP[1]='|') then begin
972
procedure TWikiPage.HandleExclamationMark;
976
i:=FindGroupStackPos(wpgTable,false);
979
if AtLineStart(FCurP) then begin
987
procedure TWikiPage.HandleApostroph;
989
if FCurP[1]='''' then begin
990
if FCurP[2]='''' then begin
992
EmitToggle(wptBold, 3);
995
EmitToggle(wptItalic, 2);
1003
procedure TWikiPage.HandleEdgedBracketOpen;
1007
if FCurP[1] in [#0..#31,' '] then begin
1014
if FCurP^='[' then begin
1016
// for example [[url|caption]]
1018
FLinkToken.SubToken:=wptInternLink;
1019
FLinkToken.LinkStartPos:=StrPos(FCurP);
1020
while not (FCurP^ in [#0..#31, '|', ']']) do inc(FCurP);
1021
FLinkToken.LinkEndPos:=StrPos(FCurP);
1022
FLinkToken.Link:=TrimLink(copy(Src,FLinkToken.LinkStartPos,FLinkToken.LinkEndPos-FLinkToken.LinkStartPos));
1023
FLinkToken.CaptionStartPos:=FLinkToken.LinkStartPos;
1024
FLinkToken.CaptionEndPos:=FLinkToken.LinkEndPos;
1027
// for example [url|caption] or [url caption]
1029
if not IsIdentStartChar[p^] then exit; // not a valid scheme
1031
while IsIdentChar[p^] do inc(p);
1032
if (p^<>':') or (p[1]<>'/') or (p[2]<>'/') then exit; // not a valid scheme
1033
FLinkToken.SubToken:=wptExternLink;
1034
FLinkToken.LinkStartPos:=StrPos(FCurP);
1035
while not (FCurP^ in [#0..#31, ' ' , '|' , ']']) do inc(FCurP);
1036
FLinkToken.LinkEndPos:=StrPos(FCurP);
1037
FLinkToken.Link:=TrimLink(copy(Src,FLinkToken.LinkStartPos,FLinkToken.LinkEndPos-FLinkToken.LinkStartPos));
1038
if FCurP^=' ' then begin
1041
FLinkToken.CaptionStartPos:=StrPos(FCurP);
1042
while not (FCurP^ in [#0..#31, '|', ']']) do inc(FCurP);
1043
FLinkToken.CaptionEndPos:=StrPos(FCurP);
1046
FLinkToken.CaptionStartPos:=FLinkToken.LinkStartPos;
1047
FLinkToken.CaptionEndPos:=FLinkToken.LinkEndPos;
1052
and (LeftStr(FLinkToken.Link,length(BaseURL))=BaseURL) then begin
1053
// a link to a wiki page, but with full URL => shorten
1054
FLinkToken.SubToken:=wptInternLink;
1055
Delete(FLinkToken.Link,1,length(BaseURL));
1056
while (FLinkToken.Link<>'') and (FLinkToken.Link[1]='/') do
1057
Delete(FLinkToken.Link,1,1);
1060
if FCurP^='|' then begin
1061
// link with caption
1063
FLinkToken.CaptionStartPos:=StrPos(FCurP);
1064
while not (FCurP^ in [#0..#31, ']']) do inc(FCurP);
1065
FLinkToken.CaptionEndPos:=StrPos(FCurP);
1067
if FCurP^=']' then begin
1069
if (FLinkToken.SubToken=wptInternLink) and (FCurP^=']') then
1073
DoToken(FLinkToken);
1075
FLastEmitPos:=FCurP;
1077
// ToDo: implement postfix notation [[url]]caption and [[url]]''caption''
1081
procedure TWikiPage.ParseCell;
1085
// linestart | or linestart ! or ||
1087
// => close previous cell
1090
if TopToken=wptTable then
1091
EmitFlag(wptTableRow, wprOpen, 0);
1092
if AtLineStart(FCurP) then
1093
EmitFlag(wptTableCell, wprOpen, 1) // linestart | or linestart !
1095
EmitFlag(wptTableCell, wprOpen, 2); // ||
1097
while not (NextBar^ in [#0, #10, #13, '|']) do begin
1098
if NextBar^='[' then begin
1101
end else if (NextBar^='<') and IsIdentStartChar[NextBar[1]] then begin
1107
if (NextBar^='|') and (NextBar[1]<>'|') then begin
1108
// the text in front of the first single | are attributes
1109
ParseAttributes(FCurP, NextBar);
1112
FLastEmitPos:=FCurP;
1115
procedure TWikiPage.HandleAngleBracket;
1117
procedure UnknownTag;
1120
if Verbosity>=wpvWarning then begin
1121
if IsWikiTagStartChar[FCurP[1]] then begin
1122
{$IFDEF VerboseUnknownOpenTags}
1123
Log('WARNING: TWikiPage.Parse unknown opening tag: <'+GetIdentifier(FCurP+1)+'> at '+PosToStr(FCurP,true));
1125
end else if (FCurP[1]='/') and IsWikiTagStartChar[FCurP[2]] then
1126
Log('WARNING: TWikiPage.Parse unknown closing tag: </'+GetIdentifier(FCurP+2)+'> at '+PosToStr(FCurP,true))
1128
Log('WARNING: TWikiPage.Parse broken close tag at '+PosToStr(FCurP,true));
1135
Range: TWPTokenRange;
1139
if NameP^='/' then begin
1144
if IsWikiTagStartChar[NameP^] then begin
1145
TagEndP:=FindTagEnd(FCurP);
1146
if ((TagEndP-1)^='>') and ((TagEndP-2)^='/') then begin
1148
if CompareIdentifiers(NameP,'br')=0 then EmitTag(wptLineBreak,
1150
else if CompareIdentifiers(NameP,'p')=0 then EmitTag(wptPTag,
1154
else if CompareIdentifiers(NameP,'br')=0 then EmitTag(wptLineBreak, wprNone)
1155
else if CompareIdentifiers(NameP,'b')=0 then EmitTag(wptBoldTag,Range)
1156
else if CompareIdentifiers(NameP,'i')=0 then EmitTag(wptItalicTag,Range)
1157
else if CompareIdentifiers(NameP,'u')=0 then EmitTag(wptUnderlineTag,Range)
1158
else if CompareIdentifiers(NameP,'s')=0 then EmitTag(wptStrikeTagShort, Range)
1159
else if CompareIdentifiers(NameP,'strike')=0 then EmitTag(wptStrikeTagLong, Range)
1160
else if CompareIdentifiers(NameP,'tt')=0 then EmitTag(wptTT,Range)
1161
else if CompareIdentifiers(NameP,'sup')=0 then EmitTag(wptSup,Range)
1162
else if CompareIdentifiers(NameP,'sub')=0 then EmitTag(wptSub,Range)
1163
else if CompareIdentifiers(NameP,'small')=0 then EmitTag(wptSmall,Range)
1164
else if CompareIdentifiers(NameP,'em')=0 then EmitTag(wptEm,Range)
1165
else if CompareIdentifiers(NameP,'string')=0 then EmitTag(wptString, Range)
1166
else if CompareIdentifiers(NameP,'var')=0 then EmitTag(wptVar,Range)
1167
else if CompareIdentifiers(NameP,'key')=0 then EmitTag(wptKey,Range)
1168
else if CompareIdentifiers(NameP,'cmt')=0 then EmitTag(wptCmt,Range)
1169
else if CompareIdentifiers(NameP,'span')=0 then EmitTag(wptSpan,Range)
1170
else if CompareIdentifiers(NameP,'p')=0 then EmitTag(wptPTag,Range)
1171
else if CompareIdentifiers(NameP,'div')=0 then EmitTag(wptDivTag,Range)
1172
else if CompareIdentifiers(NameP,'pre')=0 then EmitTag(wptPreTag,Range)
1173
else if CompareIdentifiers(NameP,'center')=0 then EmitTag(wptCenter,Range)
1174
else if CompareIdentifiers(NameP,'ol')=0 then EmitTag(wptOrderedListTag,Range)
1175
else if CompareIdentifiers(NameP,'ul')=0 then EmitTag(wptUnorderedListTag,Range)
1176
else if (CompareIdentifiers(NameP,'li')=0) and (TopToken in [wptOrderedListTag,wptUnorderedListTag])
1177
then EmitTag(wptListItemTag, Range) // <li> inside <ol>/<ul> is a list item, not a nested unordered list
1178
else if CompareIdentifiers(NameP,'table')=0 then EmitTag(wptTableTag,Range)
1179
else if CompareIdentifiers(NameP,'tr')=0 then EmitTag(wptTableRowTag,Range)
1180
else if CompareIdentifiers(NameP,'td')=0 then EmitTag(wptTableCellTag,Range)
1181
else if CompareIdentifiers(NameP,'th')=0 then EmitTag(wptTableHeadCellTag,Range)
1182
else if CompareIdentifiers(NameP,'h1')=0 then EmitTag(wptHeader1,Range)
1183
else if CompareIdentifiers(NameP,'h2')=0 then EmitTag(wptHeader2,Range)
1184
else if CompareIdentifiers(NameP,'h3')=0 then EmitTag(wptHeader3,Range)
1185
else if (Range=wprOpen)
1186
and (FLanguageTags<>nil)
1187
and FLanguageTags.DoIdentifier(NameP)
1189
// special parse for different language
1190
//Log('TWikiPage.Parse code tag '+dbgs(Pointer(FCurP))+' tag='+GetIdentifier(NameP)+' '+FindTagEnd(FCurP)-FCurP);
1192
end else if TokenIs('<nowiki>') then begin
1203
procedure TWikiPage.HandleCode;
1208
if (FCurP^<>'<') or (not IsIdentStartChar[FCurP[1]]) then begin
1215
// by default the language is the tag name, e.g. "pascal" of <pascal>
1216
FNameValueToken.NameStartPos:=StrPos(p);
1217
while IsIdentChar[p^] do inc(p);
1218
FNameValueToken.NameEndPos:=StrPos(p);
1219
while p^ in [' ',#9,#10,#13] do inc(p);
1220
if CompareIdentifiers(p,'lang')=0 then begin
1221
// read language from lang attribute
1222
// e.g. <code lang="pascal">
1224
while p^ in [' ',#9,#10,#13] do inc(p);
1225
if p^='=' then begin
1227
while p^ in [' ',#9,#10,#13] do inc(p);
1228
if p^='"' then begin
1230
FNameValueToken.NameStartPos:=StrPos(p);
1231
while not (p^ in ['"',#0,'<','>']) do inc(p);
1232
FNameValueToken.NameEndPos:=StrPos(p);
1237
p:=FindTagEnd(FCurP);
1238
FNameValueToken.ValueStartPos:=StrPos(p);
1244
if (p[1]='/') and (CompareIdentifiers(NameP, p+2)=0) then
1249
FNameValueToken.ValueEndPos:=StrPos(p);
1250
FCurP:=FindTagEnd(p);
1251
FNameValueToken.SubToken:=wptCode;
1252
//Log('TWikiPage.HandleCode name="'+copy(Src,FNameValueToken.NameStartPos,FNameValueToken.NameEndPos-FNameValueToken.NameStartPos)+'"');
1253
DoToken(FNameValueToken);
1254
FLastEmitPos:=FCurP;
1257
procedure TWikiPage.EmitFlag(Typ: TWPTokenType; Range: TWPTokenRange;
1261
if ord(WPTokenInfos[Typ].Group)>ord(wpgFont) then begin
1262
// auto close paragraph
1263
while TopToken=wptP do
1265
end else if (Range=wprOpen) and (WPTokenInfos[Typ].Group=wpgFont) then begin
1267
if (FStackPtr<0) or (TopToken=wptSection) then begin
1268
// highest level => start a sub section
1269
Push(wptSubSection,FCurP);
1272
if Range=wprOpen then begin
1275
else if Range=wprClose then
1278
FRangeToken.SubToken:=Typ;
1279
FRangeToken.Range:=Range;
1280
DoToken(FRangeToken);
1283
FLastEmitPos:=FCurP;
1286
procedure TWikiPage.EmitToggle(Typ: TWPTokenType; TagLen: integer);
1292
while (i>=0) do begin
1293
if FStack[i].Token=Typ then begin
1298
if (WPTokenInfos[FStack[i].Token].Group<>wpgFont) then begin
1299
// toggles can only skip the font group
1310
FLastEmitPos:=FCurP;
1313
// Emits the token for an angle-bracket tag of type Typ.
// For anything but a closing tag (Range<>wprClose) the nested function
// GetAttributesStart is asked for the start of the tag's attributes. If it
// returns non-nil, p is advanced to the end of the tag (stopping at #0, a
// line break or '>'), an opening flag covering p-FCurP characters is emitted
// and the attributes between StartPos and p are passed to ParseAttributes.
// A tag without attributes is emitted with the length up to
// FindTagEnd(FCurP).
// NOTE(review): what happens after ParseAttributes when Range=wprNone is not
// visible here - confirm.
procedure TWikiPage.EmitTag(Typ: TWPTokenType; Range: TWPTokenRange);
1315
// Returns where the attributes of the current tag start; the early exits
// below apply when p is not at '<' or when no tag character follows the
// blanks after the tag name.
// NOTE(review): presumably the early exits return nil - confirm.
function GetAttributesStart: PChar;
1321
if p^<>'<' then exit;
1323
// skip the tag name
while IsWikiTagChar[p^] do inc(p);
1324
while p^ in [' ',#9] do inc(p); // wiki does not allow multiline attributes
1325
if not IsWikiTagChar[p^] then exit;
1333
if Range<>wprClose then begin
1334
StartPos:=GetAttributesStart;
1335
if StartPos<>nil then begin
1338
// find the end of the tag, but never run past the end of the line
while not (p^ in [#0,#10,#13,'>']) do inc(p);
1341
EmitFlag(Typ,wprOpen,p-FCurP);
1342
ParseAttributes(StartPos,p);
1343
if Range=wprNone then
1348
// has no attributes
1349
EmitFlag(Typ,Range,FindTagEnd(FCurP)-FCurP);
1352
// Emits a wptLineBreak token for the line break at FCurP.
// If the character after FCurP is a line break character (#10 or #13) the
// token covers two characters, otherwise one.
// NOTE(review): the check in Parse additionally requires FCurP^<>FCurP[1];
// the condition visible here does not, so two identical line break
// characters would count as a single two-character break - confirm intent.
procedure TWikiPage.EmitLineBreak;
1354
if FCurP[1] in [#10,#13] then
1355
EmitFlag(wptLineBreak,wprNone,2)
1357
EmitFlag(wptLineBreak,wprNone,1);
1361
// Creates a TWikiPage instance.
// NOTE(review): the constructor body is not visible here.
constructor TWikiPage.Create;
1367
// Destroys the TWikiPage instance.
// NOTE(review): the destructor body is not visible here.
destructor TWikiPage.Destroy;
1373
// Loads the page from the XML file Filename: the file is read into the XML
// document doc with ReadXMLFile.
// NOTE(review): presumably doc is then passed to LoadFromDoc and freed - those
// statements are not visible here; confirm.
procedure TWikiPage.LoadFromFile(Filename: string);
1379
ReadXMLFile(doc,Filename);
1386
// Extracts the page data from an XML tree.
// Every node delivered by doc.GetEnumeratorAllChildren is visited. For text
// nodes the tag names of the parent and grandparent elements decide what the
// text means:
//   page/id, page/title,
//   revision/id, revision/timestamp, revision/text,
//   siteinfo/base.
// For siteinfo/base the text up to (not including) the '/' found by the
// backward scan is stored in BaseURL.
// NOTE(review): the assignments of the other branches and the initial value
// of p are not visible here - confirm which fields are filled.
procedure TWikiPage.LoadFromDoc(doc: TDOMNode);
1389
ParentName: DOMString;
1390
GrandParentName: String;
1394
for Node in doc.GetEnumeratorAllChildren do begin
1396
GrandParentName:='';
1397
// determine the tag names of the enclosing elements
if Node.ParentNode is TDOMElement then begin
1398
ParentName:=TDOMElement(Node.ParentNode).TagName;
1399
if Node.ParentNode.ParentNode is TDOMElement then
1400
GrandParentName:=TDOMElement(Node.ParentNode.ParentNode).TagName;
1402
if Node is TDOMText then begin
1403
Data:=TDOMText(Node).Data;
1404
if (GrandParentName='page') then begin
1405
if ParentName='id' then
1407
else if ParentName='title' then
1409
end else if GrandParentName='revision' then begin
1410
if ParentName='id' then
1412
else if ParentName='timestamp' then
1414
else if ParentName='text' then
1416
end else if (ParentName='base') and (GrandParentName='siteinfo') then begin
1418
// search backwards for a '/' and keep everything in front of it
while (p>=1) and (Data[p]<>'/') do dec(p);
1419
BaseURL:=copy(Data,1,p-1);
1425
// Parses the wiki source of the page and reports it as a stream of tokens.
// Nothing happens when FSrc is empty. Four reusable token objects (text,
// range, link, name/value) are created with (Self, Data); the source is then
// scanned character by character until the terminating #0, dispatching on
// the current character to the matching Handle* method. After the loop the
// entries still on the stack are processed (while FStackPtr>=0) and the
// token objects are freed.
// NOTE(review): how OnToken is stored and invoked is not visible here -
// presumably DoToken calls it; confirm.
procedure TWikiPage.Parse(const OnToken: TWikiTokenEvent; Data: Pointer);
1427
if FSrc='' then exit;
1431
FLastEmitPos:=FCurP;
1434
// token objects are created once and reused for every emitted token
FTextToken:=TWPTextToken.Create(Self,Data);
1435
FRangeToken:=TWPToken.Create(Self,Data);
1436
FLinkToken:=TWPLinkToken.Create(Self,Data);
1437
FNameValueToken:=TWPNameValueToken.Create(Self,Data);
1438
while FCurP^<>#0 do begin
1443
// special character as normal character
1446
FLastEmitPos:=FCurP;
1447
if FCurP^<>#0 then inc(FCurP);
1453
// two different line break characters in a row (e.g. #13#10)
if (FCurP[1] in [#10,#13]) and (FCurP^<>FCurP[1]) then
1457
if FCurP^ in [#10,#13] then begin
1458
// one or more empty lines close lists, paragraphs and subsections
1459
while TopToken in ([wptP,wptSubSection]+WPTWikiLists) do
1461
while FCurP^ in [#10,#13] do inc(FCurP);
1463
// a line break closes a table head cell
1464
if TopToken=wptTableHeadCell then
1465
Pop(wptTableHeadCell);
1468
'''': HandleApostroph;
1469
'{': HandleCurlyBracketOpen;
1471
'!': HandleExclamationMark;
1473
'_': HandleUnderScore;
1474
'[': HandleEdgedBracketOpen;
1475
'<': HandleAngleBracket;
1476
'*','#',':',';': HandleListChar;
1480
// four dashes at the start of a line form a horizontal row
if (FCurP[1]='-') and AtLineStart(FCurP) and TokenIs('----') then
1481
EmitFlag(wptHorizontalRow,wprNone,4)
1490
// handle everything that is still open at the end of the source
while FStackPtr>=0 do
1493
FreeAndNil(FRangeToken);
1494
FreeAndNil(FTextToken);
1495
FreeAndNil(FLinkToken);
1496
FreeAndNil(FNameValueToken);
1501
// Passes the page source FSrc to UTF8FixBroken.
// NOTE(review): presumably this repairs invalid UTF-8 sequences in FSrc;
// further statements of this method are not visible here - confirm.
procedure TWikiPage.FixUTF8;
1503
UTF8FixBroken(FSrc);
1510
// Fill the character class lookup tables for every possible char value:
// a tag name may start with an ASCII letter, '_' or a byte in #192..#255;
// the following characters may also be digits or any byte in #128..#255.
// NOTE(review): the byte ranges presumably stand for UTF-8 lead bytes
// (#192..#255) and all non-ASCII bytes (#128..#255) - confirm.
for c:=low(char) to high(char) do begin
1511
IsWikiTagStartChar[c]:=c in ['a'..'z','A'..'Z','_',#192..#255];
1512
IsWikiTagChar[c]:=c in ['a'..'z','A'..'Z','_','0'..'9',#128..#255];
1516
// Converts the target of an internal wiki link into a page name.
// Visible parts: a case branch for control characters and the characters
// $ [ ] { } < > , and a loop that accumulates hexadecimal digits into Code
// and replaces the j+1 characters starting at i in Result with chr(Code).
// After a replacement the loop continues at the same position so that the
// new character is checked as well.
// NOTE(review): what the case branch does with the listed characters is not
// visible here - confirm.
function WikiInternalLinkToPage(Link: string): string;
1529
#0..#8,#10..#31,'$','[',']','{','}','<','>':
1535
while (i+j<=length(Result)) do begin
1538
// NOTE(review): the ranges 'a'..'z' and 'A'..'Z' also accept letters beyond
// 'f'/'F' as hex digits - confirm that this is intended.
'0'..'9': if Code<16 then Code:=Code*16+ord(c)-ord('0');
1539
'a'..'z': if Code<16 then Code:=Code*16+ord(c)-ord('a')+10;
1540
'A'..'Z': if Code<16 then Code:=Code*16+ord(c)-ord('A')+10;
1546
ReplaceSubstring(Result,i,j+1,chr(Code));
1547
continue; // check the new character
1554
// Tests whether Link begins with a URL scheme followed by '://'.
// The function exits early when Link is empty, when it does not start with
// at least one ASCII letter, or when the letters are not followed by ':',
// '/' and '/'.
// NOTE(review): presumably Result is False on the early exits and True
// otherwise - the assignments are not visible here; confirm.
function WikiIsExternalLink(Link: string): boolean;
1555
// check if Link starts with a scheme http://
1560
if Link='' then exit;
1562
// the scheme name: ASCII letters only
while p^ in ['a'..'z','A'..'Z'] do inc(p);
1563
if p=PChar(Link) then exit;
1564
if p^<>':' then exit;
1566
if p^<>'/' then exit;
1568
if p^<>'/' then exit;
1573
// Returns the page id stored in an XML tree: the data of a text node whose
// parent element is named 'id' and whose grandparent element is named
// 'page'. The nodes are taken from doc.GetEnumeratorAllChildren.
// NOTE(review): whether the search stops at the first match and what is
// returned when nothing matches is not visible here - confirm.
function GetWikiPageID(doc: TDOMNode): string;
1578
for Node in doc.GetEnumeratorAllChildren do begin
1579
if (Node is TDOMText)
1580
and (Node.ParentNode is TDOMElement)
1581
and (TDOMElement(Node.ParentNode).TagName='id')
1582
and (Node.ParentNode.ParentNode is TDOMElement)
1583
and (TDOMElement(Node.ParentNode.ParentNode).TagName='page') then begin
1584
Result:=TDOMText(Node).Data;
1589
// Stream variant of GetWikiPageID: returns the result of GetWikiPageID(doc).
// NOTE(review): presumably doc is read from the stream s and freed
// afterwards - those statements are not visible here; confirm.
function GetWikiPageID(s: TStream): string;
1598
Result:=GetWikiPageID(doc);
1606
// Computes an ID string that describes the upper/lower casing of the letters
// of the page name Page.
// Step 1: Page is compared with UTF8UpperCase(Page), one UTF-8 character at a
//   time. For every multi-byte character or ASCII letter a flag is appended
//   to CaseFlags: 'u' if the character equals its uppercase form, 'l'
//   otherwise. Percent-encoded characters ('%' plus up to two hex digits)
//   are skipped.
// Step 2: the flags are consumed in groups of up to five; each group yields a
//   number n that is appended to Result as one character, '0'..'9' for 0..9
//   and 'a'..'v' for 10..31.
// The function exits early for an empty Page.
// NOTE(review): how n is built from the five flags besides "n+=1" (e.g. a
// shift per flag) is not visible here - confirm.
function WikiPageToCaseID(Page: string): string;
1618
if Page='' then exit;
1620
// for each letter check if it is uppercased
1622
UpPage:=UTF8UpperCase(Page);
1624
PageUpP:=PChar(UpPage);
1625
while (PageP^<>#0) and (PageUpP^<>#0) do begin
1626
if PageP^='%' then begin
1627
// skip encoded characters, it does not matter if they are written lower or uppercase
1630
for i:=1 to 2 do begin
1631
if PageUpP^ in ['0'..'9','A'..'F'] then begin
1637
// the original and the uppercased character may differ in byte length
CharLen:=UTF8CharacterLength(PageP);
1638
UpCharLen:=UTF8CharacterLength(PageUpP);
1639
if (CharLen>1) or (PageP^ in ['a'..'z','A'..'Z']) then begin
1640
if (CharLen=UpCharLen) and CompareMem(PageP,PageUpP,CharLen) then
1641
CaseFlags:=CaseFlags+'u'
1643
CaseFlags:=CaseFlags+'l';
1646
inc(PageUpP,UpCharLen);
1650
// encode bit vector (one character per 5bit)
1651
while CaseFlags<>'' do begin
1653
for i:=1 to 5 do begin
1654
if i>length(CaseFlags) then break;
1656
if CaseFlags[i]='u' then n+=1;
1659
0..9: Result:=Result+chr(n+ord('0'));
1660
10..31: Result:=Result+chr(n-10+ord('a'));
1662
// drop the flags that have just been encoded
system.Delete(CaseFlags,1,5);
1666
// Debug helper: returns the Caption stored in WPTokenInfos for token type t.
function dbgs(t: TWPTokenType): string;
1668
Result:=WPTokenInfos[t].Caption;
1671
function dbgs(r: TWPTokenRange): string;
1673
Result:=WPTokenRangeNames[r];