1
{ Console utility to download the Lazarus wiki.
2
Maybe it also works for other MediaWiki sites.
4
Copyright (C) 2012 Mattias Gaertner mattias@freepascal.org
6
This source is free software; you can redistribute it and/or modify it under
7
the terms of the GNU General Public License as published by the Free
8
Software Foundation; either version 2 of the License, or (at your option)
11
This code is distributed in the hope that it will be useful, but WITHOUT ANY
12
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16
A copy of the GNU General Public License is available on the World Wide Web
17
at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
18
to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
22
- get more than 500 changes
32
Classes, SysUtils, LazFileUtils, laz2_XMLRead, laz2_DOM, laz2_XMLWrite,
33
LazUTF8, LazLogger, CodeToolsStructs, CustApp, AVL_Tree,
34
{$IF FPC_FULLVERSION<20701}
37
fphttpclient, HTTPDefs,
39
WikiParser, WikiFormat;
42
IgnorePrefixes: array[1..12] of string = (
60
TFetchWikiPage = class(TWikiPage)
67
TWikiGet = class(TCustomApplication)
71
FIgnoreFilesYoungerThanMin: integer;
75
FNeededPages: TStringToPointerTree; // PageName to TFetchWikiPage
76
FAllPages: TStringToPointerTree; // PageName to TFetchWikiPage
77
FAllImages: TStringToStringTree; // image name to filename
79
procedure DoRun; override;
80
procedure GetAll(Version: integer = 2; SaveTOC: boolean = false);
81
procedure GetRecent(Days: integer; Version: integer = 2);
82
procedure DownloadPage(Page: string);
83
procedure DownloadFirstNeededPage;
84
procedure CheckNotUsedPages(Show, Delete: boolean);
85
procedure DownloadImages;
86
procedure DownloadPageImages(Page: string);
87
procedure OnParseForImages(Token: TWPToken);
88
procedure CheckNotUsedImages(Show, Delete: boolean);
89
function LoadPageFromDisk(Page: string): TFetchWikiPage;
90
function AddWikiPage(Page: string): TFetchWikiPage;
91
function NeedWikiPage(Page: string): TFetchWikiPage;
92
function PageToFilename(Page: string; IsInternalLink: boolean): string;
93
function ImageToFilename(Image: string; IsInternalLink, KeepScheme: boolean): string;
94
function EscapeDocumentName(aName: string): string;
95
function IsIgnoredPage(Page: string): boolean;
98
constructor Create(TheOwner: TComponent); override;
99
destructor Destroy; override;
100
procedure WriteHelp; virtual;
101
property OutputDir: string read FOutputDir;
102
property ImagesDir: string read FImagesDir;
103
property BaseURL: string read FBaseURL;
104
property NoWrite: boolean read FNoWrite;
105
property IgnoreFilesYoungerThanMin: integer read FIgnoreFilesYoungerThanMin;
110
procedure TWikiGet.DoRun;
114
procedure E(Msg: string; DoWriteHelp: boolean = false);
116
if Msg<>'' then begin
117
writeln('ERROR: ',Msg);
130
NeedSinglePage: Boolean;
134
// quick check parameters
135
ErrorMsg:=CheckOptions('h','help dir: images: baseurl: page: allmissing recent: ignore-recent: nowrite'
136
+' shownotusedpages deletenotusedpages'
137
+' shownotusedimages deletenotusedimages');
142
if HasOption('h','help') then
145
FNoWrite:=HasOption('nowrite');
147
if HasOption('dir') then begin
148
fOutputDir:=GetOptionValue('dir');
149
if fOutputDir='' then
150
E('output directory missing',true);
152
fOutputDir:=CleanAndExpandDirectory(OutputDir);
154
if HasOption('images') then begin
155
FImagesDir:=GetOptionValue('images');
156
if FImagesDir='' then
157
E('images directory missing',true);
159
FImagesDir:=CleanAndExpandDirectory(ImagesDir);
161
if HasOption('baseurl') then
162
FBaseURL:=GetOptionValue('baseurl');
163
if HasOption('page') then
164
fFirstPage:=GetOptionValue('page');
167
if not DirectoryExistsUTF8(OutputDir) then
168
E('output directory not found "'+OutputDir+'"');
169
if not DirectoryExistsUTF8(ImagesDir) then
170
E('images directory not found "'+ImagesDir+'"');
171
if copy(BaseURL,1,7)<>'http://' then
172
E('invalid baseurl "'+BaseURL+'"');
174
if HasOption('ignore-recent') then begin
175
fIgnoreFilesYoungerThanMin:=StrToIntDef(GetOptionValue('ignore-recent'),-1);
176
if IgnoreFilesYoungerThanMin<0 then
177
E('invalid --ignore-recent value "'+GetOptionValue('ignore-recent')+'"');
180
NeedSinglePage:=true;
181
if HasOption('allmissing') or HasOption('recent') then begin
182
NeedSinglePage:=false;
184
if HasOption('recent') then begin
185
RecentDays:=StrToIntDef(GetOptionValue('recent'),-1);
187
E('invalid --recent value "'+GetOptionValue('recent')+'"');
191
GetRecent(RecentDays);
193
for i:=1 to GetParamCount do begin
195
//writeln('TWikiGet.DoRun Param="',Param,'"');
196
if copy(Param,1,length(pPage))=pPage then
197
NeedWikiPage(WikiInternalLinkToPage(copy(Param,length(pPage)+1,length(Param))));
199
if (NeedSinglePage) and (FNeededPages.Tree.Count=0) then
200
E('nothing to do',true);
202
while FNeededPages.Tree.Count>0 do
203
DownloadFirstNeededPage;
207
CheckNotUsedPages(HasOption('shownotusedpages'),HasOption('deletenotusedpages'));
208
CheckNotUsedImages(HasOption('shownotusedimages'),HasOption('deletenotusedimages'));
214
procedure TWikiGet.GetAll(Version: integer; SaveTOC: boolean);
216
Client: TFPHTTPClient;
217
Response: TMemoryStream;
228
URLs:=TStringList.Create;
230
Client:=TFPHTTPClient.Create(nil);
231
Response:=TMemoryStream.Create;
232
// get list of range pages
233
//URL:=BaseURL+'index.php?title=Special:AllPages&action=submit&namespace=0&from=';
235
URL:=BaseURL+'index.php?title=Special:Allpages'
237
URL:=BaseURL+'index.php?title=Special:AllPages';
238
writeln('getting page "',URL,'" ...');
239
Client.Get(URL,Response);
240
//Client.ResponseHeaders.SaveToFile('responseheaders.txt');
241
debugln(['TWikiGet.GetAll ',SaveTOC]);
242
if Response.Size>0 then begin
243
if SaveTOC then begin
244
Response.Position:=0;
245
Filename:='all.html';
246
writeln('saving page "',Filename,'" ...');
248
Response.SaveToFile(Filename);
250
Response.Position:=0;
251
SetLength(s,Response.Size);
252
Response.Read(s[1],length(s));
255
p:=Pos('<a href="/Special:Allpages/',s)
257
p:=Pos('<a href="/index.php?title=Special:AllPages&from=',s);
259
inc(p,length('<a href="'));
261
while (p<=length(s)) and (s[p]<>'"') do inc(p);
262
URL:=XMLValueToStr(copy(s,StartPos,p-StartPos));
263
if (URL<>'') and (URLs.IndexOf(URL)<0) then begin;
264
writeln('TWikiGet.GetAll URL="',URL,'"');
267
System.Delete(s,1,p);
271
// get all range pages
272
for i:=0 to URLs.Count-1 do begin
273
URL:=EscapeDocumentName(URLs[i]);
276
writeln('getting page "',URL,'" ...');
277
Client.Get(URL,Response);
278
//Client.ResponseHeaders.SaveToFile('responseheaders.txt');
279
if SaveTOC then begin
280
Response.Position:=0;
281
Filename:='all_'+IntToStr(i+1)+'.html';
282
writeln('saving page "',Filename,'" ...');
284
Response.SaveToFile(Filename);
286
if Response.Size>0 then begin
287
Response.Position:=0;
288
SetLength(s,Response.Size);
289
Response.Read(s[1],length(s));
291
p:=Pos('<a href="/',s);
293
inc(p,length('<a href="'));
295
while (p<=length(s)) and (s[p]<>'"') do inc(p);
296
Page:=copy(s,StartPos,p-StartPos);
297
while (Page<>'') and (Page[1]='/') do
298
System.Delete(Page,1,1);
299
if (Page<>'') and (not IsIgnoredPage(Page)) then begin;
300
//writeln('TWikiGet.GetAll Page="',Page,'"');
301
Filename:=PageToFilename(Page,false);
303
if not FileExistsUTF8(Filename) then begin
304
writeln('TWikiGet.GetAll missing Page="',Page,'"');
308
System.Delete(s,1,p);
319
procedure TWikiGet.GetRecent(Days: integer; Version: integer);
321
linksstart = '<a href="/index.php?title=';
323
Client: TFPHTTPClient;
324
Response: TMemoryStream;
333
CheckedPages: TStringToStringTree;
335
//writeln('TWikiGet.GetRecent Days=',Days);
337
CheckedPages:=TStringToStringTree.Create(true);
339
Client:=TFPHTTPClient.Create(nil);
340
Response:=TMemoryStream.Create;
342
URL:=BaseURL+'index.php?title=Special:Recentchanges&days='+IntToStr(Days)+'&limit=500'
344
URL:=BaseURL+'index.php?title=Special:RecentChanges&days='+IntToStr(Days)+'&limit=500';
345
writeln('getting page "',URL,'" ...');
346
Client.Get(URL,Response);
347
//Client.ResponseHeaders.SaveToFile('responseheaders.txt');
348
//Response.SaveToFile('test.html');
349
NowDate:=DateTimeToFileDate(Now);
350
if Response.Size>0 then begin
351
SetLength(s,Response.Size);
352
Response.Position:=0;
353
Response.Read(s[1],length(s));
355
// find next a href tag
356
p:=Pos(linksstart,s);
358
Delete(s,1,p+length(linksstart)-1);
359
// get href attribute
361
while (p<=length(s)) and (not (s[p] in ['"'])) do inc(p);
362
if p>length(s) then break;
363
href:=LeftStr(s,p-1);
364
//writeln('TWikiGet.GetRecent href="'+href+'"');
366
if Pos('&diff=',href)<1 then begin
367
// this is not a change
371
Page:=LeftStr(href,Pos('&',href)-1);
372
//writeln('TWikiGet.GetRecent page="'+Page+'"');
373
if not (FAllPages.Contains(Page)) then
374
continue; // deleted in the mean time
375
if CheckedPages.Contains(Page) then continue;
376
if IsIgnoredPage(Page) then continue;
377
if FNeededPages.Contains(Page) then continue;
378
CheckedPages[Page]:='1';
379
Filename:=PageToFilename(Page,false);
380
//writeln('TWikiGet.GetRecent recent diff page="'+Page+'" File="',Filename,'"');
381
if FileExistsUTF8(Filename) then begin
382
AgeInMin:=(NowDate-FileAgeUTF8(Filename)) div 60;
383
//writeln('TWikiGet.GetRecent FileAge=',AgeInMin,' Ignore=',IgnoreFilesYoungerThanMin,' File="',Filename,'"');
384
if AgeInMin<IgnoreFilesYoungerThanMin then continue;
386
writeln(' recently changed: "',Page,'" File="',Filename,'"');
398
procedure TWikiGet.DownloadPage(Page: string);
400
Response: TMemoryStream;
401
Client: TFPHTTPClient;
405
Filename:=PageToFilename(Page,false);
409
Client:=TFPHTTPClient.Create(nil);
410
Response:=TMemoryStream.Create;
411
URL:=BaseURL+'index.php?title=Special:Export&pages='+EscapeDocumentName(Page)+'&curonly=1&action=submit';
412
writeln('getting page "',URL,'" ...');
413
Client.Get(URL,Response);
414
//Client.ResponseHeaders.SaveToFile('responseheaders.txt');
415
Response.Position:=0;
416
writeln('saving page "',Filename,'" ...');
417
Response.Position:=0;
419
Response.SaveToFile(Filename);
426
procedure TWikiGet.DownloadFirstNeededPage;
431
Node:=FNeededPages.Tree.FindLowest;
432
if Node=nil then exit;
433
Page:=PStringMapItem(Node.Data)^.Name;
434
FNeededPages.Remove(Page);
438
procedure TWikiGet.CheckNotUsedPages(Show, Delete: boolean);
440
Item: PStringToPointerTreeItem;
441
Files: TFilenameToPointerTree;
442
FileInfo: TSearchRec;
443
Page: TFetchWikiPage;
446
Files:=TFilenameToPointerTree.Create(false);
448
for Item in FAllPages do begin
449
Page:=TFetchWikiPage(Item^.Value);
450
Files[Page.Filename]:=Page;
453
writeln('Not needed files in the output directory "',OutputDir,'":');
454
if FindFirstUTF8(OutputDir+AllFilesMask,faAnyFile,FileInfo)=0 then begin
456
if (FileInfo.Name='.') or (FileInfo.Name='..') or (FileInfo.Name='') then
458
if (faDirectory and FileInfo.Attr)<>0 then continue;
459
Filename:=OutputDir+FileInfo.Name;
460
if Files.Contains(Filename) then continue;
462
writeln('page:',FileInfo.Name);
464
writeln('deleting page: ',FileInfo.Name);
465
if (not NoWrite) and (not DeleteFileUTF8(Filename)) then
466
writeln('failed to delete page "',Filename,'"');
468
until FindNextUTF8(FileInfo)<>0;
470
FindCloseUTF8(FileInfo);
476
procedure TWikiGet.DownloadImages;
478
Item: PStringToPointerTreeItem;
480
writeln('checking images of ',FAllPages.Tree.Count,' pages ...');
481
for Item in FAllPages do
482
DownloadPageImages(Item^.Name);
483
writeln('total images: ',FAllImages.Tree.Count);
486
procedure TWikiGet.DownloadPageImages(Page: string);
490
p:=LoadPageFromDisk(Page);
492
//writeln('TWikiGet.DownloadPageImages ',p.Filename,' ',length(p.Src));
493
p.Verbosity:=wpvError;
494
p.Parse(@OnParseForImages,p);
497
procedure TWikiGet.OnParseForImages(Token: TWPToken);
500
LinkToken: TWPLinkToken;
503
Client: TFPHTTPClient;
504
Response: TMemoryStream;
516
p:=TFetchWikiPage(Token.UserData);
517
if Token.Token=wptInternLink then begin
518
LinkToken:=Token as TWPLinkToken;
519
SrcLink:=LinkToken.Link;
521
ColonPos:=Pos(':',Link);
522
//writeln('TWikiGet.OnParseForImages Link="',Link,'" ColonPos=',ColonPos);
523
if ColonPos<1 then exit;
524
if ColonPos=length(Link) then exit;
525
Prefix:=lowercase(copy(Link,1,ColonPos-1));
526
if Prefix<>'image' then exit;
527
Link:=UTF8Trim(copy(Link,ColonPos+1,length(Link)));
528
if Link='' then exit;
529
Filename:=ImageToFilename(Link,true,true);
530
//writeln('TWikiGet.OnParseForImages page="',p.Filename,'" Link="',Link,'" => ',Filename);
531
if FAllImages.Contains(Link) then exit; // already tried
532
FAllImages[Link]:=Filename;
533
if FileExistsUTF8(Filename) then exit;
534
//writeln('TWikiGet.OnParseForImages ',FileExists(Filename),' ',FileExistsUTF8(Filename),' "',Filename,'"');
535
// download image page
540
Client:=TFPHTTPClient.Create(nil);
541
Response:=TMemoryStream.Create;
542
URL:=BaseURL+EscapeDocumentName('Image:'+WikiInternalLinkToPage(Link));
543
writeln('getting image page "',URL,'" ...');
544
Client.Get(URL,Response);
545
//Client.ResponseHeaders.SaveToFile('responseheaders.txt');
546
Response.Position:=0;
547
SetLength(Data,Response.Size);
549
Response.Read(Data[1],length(Data));
550
i:=Pos('class="fullImageLink"',Data);
552
writeln('TWikiGet.OnParseForImages WARNING: image page has no fullImageLink marker: "',URL,'"');
553
writeln('saving responseheaders.txt ...');
555
Client.ResponseHeaders.SaveToFile('responseheaders.txt');
556
writeln('saving response.txt ...');
558
Response.SaveToFile('response.txt');
561
while i<=length(Data) do begin
562
if (copy(Data,i,5)='src="') then begin
563
//writeln('TWikiGet.OnParseForImages src found ...');
566
while (i<=length(Data)) and (Data[i]<>'"') do
568
ImageLink:=UTF8Trim(copy(Data,StartPos,i-StartPos));
569
if ImageLink='' then exit;
570
//writeln('TWikiGet.OnParseForImages Img="',ImageLink,'"');
571
URL:=BaseURL+EscapeDocumentName(ImageLink);
572
writeln('getting image "',URL,'" ...');
574
Client.Get(URL,Response);
575
for j:=0 to Client.ResponseHeaders.Count-1 do begin
576
Header:=Client.ResponseHeaders[j];
577
if LeftStr(Header,length('Content-Type:'))='Content-Type:' then begin
578
if Pos('image/',Header)<1 then begin
579
writeln('this is not an image: ',Header);
584
writeln('saving image to "',Filename,'" ...');
586
Response.SaveToFile(Filename);
592
on E: EHTTPClient do begin
593
writeln('TWikiGet.OnParseForImages WARNING: page="'+p.Filename+'" Link="'+Link+'" SrcLink="'+SrcLink+'" URL="'+URL+'": '+E.Message);
603
procedure TWikiGet.CheckNotUsedImages(Show, Delete: boolean);
605
Files: TFilenameToStringTree;
606
FileInfo: TSearchRec;
608
Item: PStringToStringTreeItem;
610
Files:=TFilenameToStringTree.Create(false);
612
for Item in FAllImages do begin
613
Filename:=Item^.Value;
614
Files[Filename]:='1';
617
writeln('Not needed files in the images directory "',ImagesDir,'":');
618
if FindFirstUTF8(ImagesDir+AllFilesMask,faAnyFile,FileInfo)=0 then begin
620
if (FileInfo.Name='.') or (FileInfo.Name='..') or (FileInfo.Name='') then
622
if (faDirectory and FileInfo.Attr)<>0 then continue;
623
Filename:=ImagesDir+FileInfo.Name;
624
if Files.Contains(Filename) then continue;
626
writeln('image:',FileInfo.Name);
628
writeln('deleting image: ',FileInfo.Name);
629
if (not NoWrite) and (not DeleteFileUTF8(Filename)) then
630
writeln('failed to delete image "',Filename,'"');
632
until FindNextUTF8(FileInfo)<>0;
634
FindCloseUTF8(FileInfo);
640
function TWikiGet.LoadPageFromDisk(Page: string): TFetchWikiPage;
644
Result:=AddWikiPage(Page);
645
if (Result=nil) or (Result.Src<>'') then exit;
646
Filename:=PageToFilename(Page,false);
647
//writeln('TWikiGet.LoadPageFromDisk ',Page,' File=',Filename);
648
if not FileExistsUTF8(Filename) then begin
649
writeln('TWikiGet.LoadPageFromDisk page "',Page,'": file not found "',Filename,'"');
652
Result.LoadFromFile(Filename);
655
function TWikiGet.AddWikiPage(Page: string): TFetchWikiPage;
657
if Page='' then exit(nil);
658
Result:=TFetchWikiPage(FAllPages[Page]);
659
if Result=nil then begin
660
Result:=TFetchWikiPage.Create;
661
Result.PageName:=Page;
662
Result.Filename:=PageToFilename(Page,false);
663
FAllPages[Page]:=Result;
667
function TWikiGet.NeedWikiPage(Page: string): TFetchWikiPage;
669
Result:=AddWikiPage(Page);
670
if Result=nil then exit;
671
FNeededPages[Page]:=Result;
674
function TWikiGet.PageToFilename(Page: string; IsInternalLink: boolean): string;
676
Result:=OutputDir+WikiPageToFilename(Page,IsInternalLink,true)+'.xml';
679
function TWikiGet.ImageToFilename(Image: string; IsInternalLink,
680
KeepScheme: boolean): string;
682
Result:=ImagesDir+WikiImageToFilename(Image,IsInternalLink,true,KeepScheme);
685
function TWikiGet.EscapeDocumentName(aName: string): string;
692
while i<=length(Result) do begin
695
':': s:='%'+HexStr(ord(s[1]),2);
698
ReplaceSubstring(Result,i,1,s);
702
if (Result<>'') and (Result[1]='/') then
706
function TWikiGet.IsIgnoredPage(Page: string): boolean;
710
for i:=low(IgnorePrefixes) to high(IgnorePrefixes) do begin
711
if LeftStr(Page,length(IgnorePrefixes[i]))=IgnorePrefixes[i] then
717
procedure TWikiGet.Test;
719
procedure w(URL: string);
724
debugln(['TWikiGet.Test [',URL,']']);
725
Page:=WikiInternalLinkToPage(URL);
726
debugln([' URL=[',dbgstr(URL),'] Page=[',Page,']']);
727
Filename:=WikiImageToFilename(Page,false,true);
728
debugln([' URL=[',dbgstr(URL),'] Filename="',Filename,'"']);
732
//w('Image:Acs_demos.jpg');
733
//w('Image:Acs demos.jpg');
734
w('Image:Acs%20demos.jpg');
735
//w('Image:Acs demos.JPG');
740
constructor TWikiGet.Create(TheOwner: TComponent);
742
inherited Create(TheOwner);
743
StopOnException:=True;
744
fOutputDir:='wikixml';
745
FImagesDir:='images';
746
FBaseURL:='http://wiki.lazarus.freepascal.org/';
747
fFirstPage:='Lazarus_Documentation';
748
FAllPages:=TStringToPointerTree.Create(true);
749
FNeededPages:=TStringToPointerTree.Create(true);
750
FAllImages:=TStringToStringTree.Create(true);
751
FIgnoreFilesYoungerThanMin:=60;
754
destructor TWikiGet.Destroy;
762
procedure TWikiGet.WriteHelp;
764
writeln('Usage: ',ExeName,' -h');
766
writeln('--dir=<directory> : directory where to store the files. Default: ',OutputDir);
767
writeln('--images=<directory> : directory where to store the images. Default: ',ImagesDir);
768
writeln('--baseurl=<URL> : URL of the wiki. Default: ',BaseURL);
769
writeln('--page=<pagename> : download this wiki page. Can be given multiple times.');
770
writeln('--allmissing : download all wiki pages, if file not already there.');
771
writeln('--recent=<days> : download pages again if changed in the last days on the site.');
772
writeln(' includes --allmissing.');
773
writeln(' ToDo: check more than last 500 changes.');
774
writeln('--ignore-recent=<minutes> : do not download again files younger than this on disk.');
775
writeln(' combine with --recent. Default: ',IgnoreFilesYoungerThanMin);
776
writeln('--shownotusedpages : show not used files in the output directory.');
777
writeln('--deletenotusedpages : delete the files in the output directory that are not used.');
778
writeln('--shownotusedimages : show not used files in the images directory.');
779
writeln('--deletenotusedimages : delete the files in the images directory that are not used.');
780
writeln('--nowrite : do not write files, just print what would be written.');
782
writeln('Example: download one page');
783
writeln(' ',ExeName,' --dir=html --images=images --page=Install_Packages');
784
writeln('Example: download the whole wiki');
785
writeln(' ',ExeName,' --allmissing');
786
writeln('Example: call this to download new files once per week');
787
writeln(' ',ExeName,' --recent=8 --deletenotusedpages --deletenotusedimages');
791
Application: TWikiGet;
793
Application:=TWikiGet.Create(nil);
794
Application.Title:='Wiki Get';