2
This file is part of libextractor.
3
(C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
5
libextractor is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published
7
by the Free Software Foundation; either version 2, or (at your
8
option) any later version.
10
libextractor is distributed in the hope that it will be useful, but
11
WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
General Public License for more details.
15
You should have received a copy of the GNU General Public License
16
along with libextractor; see the file COPYING. If not, write to the
17
Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18
Boston, MA 02111-1307, USA.
20
Portions of this code were adapted from libhtmlparse by
21
Mooneer Salem (mooneer@translator.cs). The main changes
22
to libhtmlparse were the removal of globals to make the
26
#include "extractor.h"
29
#include <sys/types.h>
38
/* struct holding the arguments of tags */
45
* libhtmlparse has the callbacks defined as globals,
46
* which is bad for making libextractor re-entrant.
47
* We now put them all in one big table that is passed
48
* around inside the parser.
51
* You may call one or several or even all callbacks. Except for the
52
* XHTMLCallBack, all CallBacks will work as expected and described
55
* The XHTMLCallBack is a special case, because you can decide, if the
56
* XHTML specific tags should be handled as a start- AND endtag, or
57
* as an XHTML tag. If you call nothing, except start and endtag, the
58
* behaviour is, that you'll get a start AND an endtag called back.
59
* If you call XHTMLCallBack, it will only give you the XHTML call back.
61
* If you are in doubt or simply confused now, call XHTMLCallBack()
64
/* handle comments and javascript */
65
int (*commentCallBack) (char *comment, struct PC_ * pc);
66
int (*commentStartCallBack) (struct PC_ * pc);
67
int (*commentEndCallBack) (struct PC_ * pc);
69
/* Declaration e.g. <!DOCTYPE HTML ... */
70
int (*declCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
72
/* Start tag e.g. <html>, with arguments, args may be NULL, numargs may be 0 */
73
int (*startCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
75
/* End tag e.g. </html>*/
76
int (*endCallBack) (char *tag, struct PC_ * pc);
78
/* handle plain text */
79
int (*textCallBack) (char *text, struct PC_ * pc);
80
int (*textStartCallBack) (struct PC_ * pc);
81
int (*textEndCallBack) (struct PC_ * pc);
83
/* PHP inserts. BUG(?): if someone prints another PHP function from this PHP function
84
our lib will get confused. */
85
int (*phpCallBack) (char *text, struct PC_ * pc);
87
/* empty tags like <hr/>, <br/>, with arguments, args may be NULL, numargs may be 0 */
88
int (*XHTMLCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
90
/* XML tags <?xml>, with arguments, args may be NULL, numargs may be 0 */
91
int (*xmlCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
93
/* entities like &auml;,&#228; text will inherit all chars between '&' and ';' */
94
int (*entityCallBack) (char *text, struct PC_ * pc);
96
/* and we also put some formerly static variables in this */
98
/* needed to pass text in <script> tags verbatim */
99
unsigned int lhtml_script_passthru;
107
* 0: ignore, 1: add keyword
112
* If nextTextAction == 1, this gives the type of the
115
EXTRACTOR_KeywordType nextKeywordType;
118
* Result of the current pass.
120
struct EXTRACTOR_Keywords * result;
125
/**********************************************************************/
128
/* argument caching (e.g width="80%") */
129
static struct ArgvTable *addArgToTable(struct ArgvTable *args, char *arg, char *val,
133
args = (struct ArgvTable*) calloc(1, sizeof(struct ArgvTable)*(pc->numArgs+1));
135
args = (struct ArgvTable*) realloc(args, sizeof(struct ArgvTable)*(pc->numArgs+1));
139
"FATAL: could not allocate: %s at %s:%d\n",
144
args[pc->numArgs-1].arg = arg;
145
args[pc->numArgs-1].val = val;
149
/* clean up memory */
150
static void freeArgs (struct ArgvTable *args,
155
for(i=0; i<pc->numArgs; i++) {
166
static const char *parseForEntities(const char *, struct PC_ * pc);
169
static const char *parseText(const char *html, struct PC_ * pc) {
174
while( (*html != '\0') && isspace((int) *html)) html++;
176
if (*html == '<') return html;
179
while ( (*html != '\0') && (*html != '<') ) html++;
181
tmp = (char *)calloc(1, (size_t)(html-tmp2+1));
184
memcpy(tmp, tmp2, (size_t)(html-tmp2));
186
if (strlen(tmp) > 0) {
187
if (pc->textStartCallBack) {
188
ret = pc->textStartCallBack(pc);
194
if (pc->textCallBack) {
195
if (pc->entityCallBack){ /* that is textCallBack(text)
196
with entityCallBack(entity) as an extrabonus */
197
/*printf("entity is here\n");*/
198
parseForEntities(tmp, pc);
200
ret = pc->textCallBack(tmp, pc);
207
if (pc->textEndCallBack) {
208
ret = pc->textEndCallBack(pc);
217
if (*(html+1) == '>') html += 2;
221
static const char *parseComment (const char *html, struct PC_ * pc) {
226
while ( (*html == '-') || isspace((int)*html))html++;
229
while ( (*html != '\0') && !(*html == '-' && *(html+1) == '-' && *(html+2) == '>')) html++;
231
tmp = (char *)calloc(1, (size_t)(html-tmp2+1));
234
memcpy(tmp, tmp2, (size_t)(html-tmp2));
236
if (*(html+3) != '\0') html += 3;
238
if (pc->commentStartCallBack) {
239
ret = pc->commentStartCallBack(pc);
245
if (pc->commentCallBack) {
246
ret = pc->commentCallBack(tmp, pc);
252
if (pc->commentEndCallBack) {
253
ret = pc->commentEndCallBack(pc);
263
static const char *parseEndTag(const char *html, struct PC_ * pc) {
273
while(*html != '\0' && *html != '>') html++;
275
tmp =(char *) calloc(1, (size_t)(html-tmp2+1));
278
memcpy(tmp, tmp2, (size_t)(html-tmp2));
280
if (pc->endCallBack) {
281
ret = pc->endCallBack(tmp,pc);
287
if (*html == '>') html++;
292
static const char *parsePHP(const char *html, struct PC_ * pc) {
298
while(*html != '\0' && isspace((int)*html)) html++;
302
while(*html != '\0' && !(*html == '?' && *(html+1) == '>')) html++;
303
tmp2 = (char *)calloc(1, (size_t)(html-tmp+1));
304
if (!tmp2) return "";
306
memcpy(tmp2, tmp, (size_t)(html-tmp));
308
if (pc->phpCallBack) {
309
ret = pc->phpCallBack(tmp2, pc);
320
/* parse the XML tag itself */
321
static const char *parseXMLtag(const char *html, struct PC_ * pc) {
322
char *tag, *name, *value;
325
struct ArgvTable *tmp2 = NULL;
329
while(*html != '\0' && !isspace((int)*html) && *html != '>') html++;
331
/* you may want to upper/lower tags, so I leave the tag itself untouched */
332
tag = (char *)calloc(1, (size_t)(html-tmp+1));
336
memcpy(tag, tmp, (size_t)(html-tmp));
339
if (pc->xmlCallBack != NULL) {
340
ret = pc->xmlCallBack(tag, NULL, 0, pc);
342
if (*html == '>') html++;
343
return((ret != 0) ? (char *) "" : html);
346
while(*html != '\0' && isspace((int)*html)) html++;
348
while(*html != '\0' && *html != '>' ) {
349
while(isspace((int)*html)) html++;
350
if (*html == '>') break;
353
while(*html != '\0' && !isspace((int)*html) && *html != '=' && *html != '>') html++;
354
name = (char *)calloc(1, (size_t)(html-tmp+1));
360
memcpy(name, tmp, (size_t)(html-tmp));
361
if (isspace((int)*html)) {
362
tmp2 = addArgToTable(tmp2, name, NULL, pc);
363
while(*html != '\0' && isspace((int)*html) && *html != '>') html++;
366
tmp2 = addArgToTable(tmp2, name, NULL, pc);
370
if (*html == '=') html++;
371
if (*html != '"' && *html != '\'') {
373
while(*html != '\0' && *html != '>' && !isspace((int)*html)) html++;
374
value = (char *)calloc(1, (size_t)(html-tmp+1));
387
memcpy(value, tmp, (size_t)(html-tmp));
388
tmp2 = addArgToTable(tmp2, name, value, pc);
389
} else if (*html == '"') {
392
while(*html != '\0' && !(*html == '"' && *(html-1) != '\\')) html++;
393
value = (char *) calloc(1, (size_t)(html-tmp+1));
406
memcpy(value, tmp, (size_t)(html-tmp));
409
tmp2 = addArgToTable(tmp2, name, value, pc);
410
} else if (*html == '\'') {
413
while(*html != '\0' && !(*html == '\'' && *(html-1) != '\\')) html++;
415
value = (char *)calloc(1, (size_t)(html-tmp+1));
427
memcpy(value, tmp, (size_t)(html-tmp));
430
tmp2 = addArgToTable(tmp2, name, value, pc);
436
if (*html != '\0') html++;
437
ret = pc->xmlCallBack(tag, tmp2, pc->numArgs, pc);
445
return (ret != 0 ? "" : html);
448
/* cannibalistic function, munches the actual tag */
449
static const char *eatUp(const char *html){
450
while ( (*html != '>') &&
459
/* cannibalistic function, munches the actual text */
460
static const char *eatUpText(const char *html){
461
while(*html != '\0' && *html != '<')
467
/* decides, if a found '?' leads to PHP or XML if requested
468
otherwise it gormandizes them up. *burps* */
469
static const char *parseXML(const char *html, struct PC_ * pc) {
470
/* conditional expressions inside a conditional expression
471
don't try _this_ at home kids! ;-) */
472
html=(((tolower((int)(*(html+1))))==(int)('p')) ?
473
( (pc->phpCallBack) ? parsePHP (html, pc) : eatUp(html) ) :
474
( (pc->xmlCallBack) ? parseXMLtag(html, pc) : eatUp(html) ) );
478
static const char *parseStartTag (const char *html, struct PC_ * pc) {
479
char *tag, *name, *value;
482
struct ArgvTable *tmp2 = NULL;
486
while(*html != '\0' && !isspace((int)*html) &&
487
*html != '>' && *html != '/') html++;
489
tag = (char *)calloc(1, (size_t)(html-tmp+1));
493
memcpy(tag, tmp, (size_t)(html-tmp));
495
if (strncasecmp("script", tag, 6) == 0) {
496
pc->lhtml_script_passthru = 1;
498
else if (strncasecmp("pre", tag, 3) == 0) {
499
pc->lhtml_script_passthru = 2;
503
if (pc->startCallBack) {
504
ret = pc->startCallBack(tag, NULL, 0, pc);
508
/* this check is redundant */
509
/* if (*html == '>') */ html++;
510
return((ret != 0) ? "" : html);
513
else if (*html == '/' ) { /* XHTML empty tag like <hr/>, <br/>*/
514
/**********************************************
515
* You may choose now between two behaviors *
516
* of libhtmlparse to handle XHTML empty tags: *
517
* a) call XHTMLCallBack *
518
* b) call start- AND endCallBack *
519
***********************************************/
520
if (pc->startCallBack != NULL && !(pc->XHTMLCallBack)) {
521
ret = pc->startCallBack(tag, NULL, 0, pc);
523
if (pc->endCallBack != NULL && ret==0 && !(pc->XHTMLCallBack)) {
524
ret = pc->endCallBack(tag, pc);
526
if(pc->XHTMLCallBack){
527
ret = pc->XHTMLCallBack(tag, NULL, 0, pc);
534
return((ret != 0) ? "" : html);
537
while(*html != '\0' && isspace((int)*html)) html++;
539
while(*html != '\0' && *html != '>' ) {
540
while(isspace((int)*html)) html++;
541
if (*html == '>') break;
543
if (*html == '/' && *(html+1) == '>') {
548
while(*html != '\0' && !isspace((int)*html) &&
549
*html != '=' && *html != '>') html++;
550
name = (char *)calloc(1, (size_t)(html-tmp+1));
557
memcpy(name, tmp, (size_t)(html-tmp));
559
if (isspace((int)*html)) {
560
const char *x = html;
561
while(*x != '\0' && *x != '>' && *x != '=') x++;
566
tmp2 = addArgToTable(tmp2, name, NULL, pc);
567
while(*html != '\0' && isspace((int)*html) &&
569
!(*html == '/' && *(html+1) == '>'))
578
/* html++ is repeated after the while loop
579
* and may cause deletion of important info */
581
tmp2 = addArgToTable(tmp2, name, NULL, pc);
587
if (*html == '=') html++;
589
while(isspace(*html)) html++;
591
if (*html != '"' && *html != '\'') {
593
while(*html != '\0' && *html != '>' &&
594
!isspace((int)*html) &&
595
!(*html == '/' && *(html+1) == '>'))
597
value = (char *)calloc(1, (size_t)(html-tmp+1));
607
memcpy(value, tmp, (size_t)(html-tmp));
608
tmp2 = addArgToTable(tmp2, name, value, pc);
609
} else if (*html == '"') {
612
while(*html != '\0' &&
613
!(*html == '"' && *(html-1) != '\\'))
615
value = (char *) calloc(1, (size_t)(html-tmp+1));
626
memcpy(value, tmp, (size_t)(html-tmp));
629
tmp2 = addArgToTable(tmp2, name, value, pc);
630
} else if (*html == '\'') {
633
while(*html != '\0' && !(*html == '\'' &&
634
*(html-1) != '\\')) html++;
636
value = (char *)calloc(1, (size_t)(html-tmp+1));
647
memcpy(value, tmp, (size_t)(html-tmp));
650
tmp2 = addArgToTable(tmp2, name, value, pc);
655
if (*html != '\0') html++;
657
if (pc->startCallBack != NULL && *(html-2)!='/' ) {
658
ret = pc->startCallBack(tag, tmp2, pc->numArgs, pc);
660
if (pc->endCallBack != NULL && ret==0 && *(html-2)=='/'
661
&& !(pc->XHTMLCallBack)) {
662
ret = pc->endCallBack(tag, pc);
664
/* these tags may have arguments too, e.g. <hr noshade/> */
665
if (pc->XHTMLCallBack != NULL && *(html-2)=='/') {
666
ret = pc->XHTMLCallBack(tag, tmp2, pc->numArgs, pc);
676
/* this is a bad hack, feel free to write a better one (maybe a more readable one? ;-)*/
678
(pc->XHTMLCallBack != NULL) ?
680
((ret != 0) ? "" : html);
683
static const char *parseDecl(const char *html, struct PC_ * pc) {
684
char *tag, *name, *value;
687
struct ArgvTable *tmp2 = NULL;
691
while(*html != '\0' && !isspace((int)*html) && *html != '>') html++;
693
tag = (char *)calloc(1, (size_t)(html-tmp+1));
698
memcpy(tag, tmp, (size_t)(html-tmp));
701
if (pc->declCallBack) {
702
ret = pc->declCallBack(tag, NULL, 0, pc);
706
if (*html == '>') html++;
707
return((ret != 0) ? "" : html);
711
while(*html != '\0' && isspace((int)*html)) html++;
713
while(*html != '\0' && *html != '>') {
714
while(isspace((int)*html)) html++;
715
if (*html == '>') break;
721
while(*html != '\0' && !(*html == '\'' && *html != '\\'))
727
while(*html != '\0' && !(*html == '"' && *html != '\\'))
731
while(*html != '\0' && !isspace((int)*html) && *html != '=' && *html != '>')
736
name = (char *) calloc(1, (size_t)(html-tmp+1));
743
memcpy(name, tmp, (size_t)(html-tmp));
745
if (isspace((int)*html)) {
746
tmp2 = addArgToTable(tmp2, name, NULL, pc);
747
while(*html != '\0' && isspace((int)*html) && *html != '>')
753
tmp2 = addArgToTable(tmp2, name, NULL, pc);
758
if (*(html+1) == '>') {
759
tmp2 = addArgToTable(tmp2, name, NULL, pc);
764
if (*html == '=') html++;
769
while(*html != '\0' && !(*html == '\'' && *(html-1) != '\\'))
772
value = (char *) calloc(1, (size_t)(html-tmp+1));
783
memcpy(value, tmp, (size_t)(html-tmp));
786
tmp2 = addArgToTable(tmp2, name, value, pc);
791
while (*html != '\0' && !(*html == '"' && *(html-1) != '\\'))
793
value = (char *)calloc(1, (size_t)(html-tmp+1));
804
memcpy(value, tmp, (size_t)(html-tmp));
807
tmp2 = addArgToTable(tmp2, name, value, pc);
812
while(*html != '\0' && *html != '>' && !isspace((int)*html))
814
value = (char *) calloc(1, (size_t)(html-tmp+1));
825
memcpy(value, tmp, (size_t)(html-tmp));
826
tmp2 = addArgToTable(tmp2, name, value, pc);
832
if (*html != '\0') html++;
834
if (pc->declCallBack) {
835
ret = pc->declCallBack(tag, tmp2, pc->numArgs, pc);
839
return((ret != 0) ? "" : html);
847
static const char *parseForEntities (const char *tmp, struct PC_ * pc){
848
char *entity, *text ;
849
const char *tmp1, *tmp2;
853
while(*tmp != '\0' && *tmp != '&')tmp++;
855
text = (char *)calloc(1, (size_t)(tmp-tmp1+1));
860
memcpy(text, tmp1, (size_t)(tmp-tmp1));
861
/* the chunk of text before the first entity will
862
not be called, if it starts with an entity*/
863
if(strlen(text)>0 && (!(isspace((int)*text)))){
864
if (pc->textCallBack) {
865
ret = pc->textCallBack(text, pc);
874
/* sometimes the ';' is absent, it's a bad hack, just to avoid more trouble */
875
while( *tmp != '\0' && (*tmp != ';' && count != 9) ){
879
entity = (char *)calloc(1, (size_t)(tmp-tmp2+1));
883
memcpy(entity, tmp2, (size_t)(tmp-tmp2));
884
if (*tmp == ';' || count == 9){ /* should I add an errortrap here? */
885
ret = pc->entityCallBack(entity, pc);
893
if (*tmp != '\0') tmp++;
898
static void parse (const char *html, struct PC_ * pc) {
899
while(*html != '\0') {
900
/* while(isspace(*html)){html++;} there may be leading blanks in some autogenerated files
901
add this or not, that is the question ;-)) */
903
if (pc->lhtml_script_passthru != 0) {
908
if (pc->lhtml_script_passthru == 1 ){
909
while(*text != '\0') {
911
if (*(text+2) == 's' || *(text+2) == 'S') {
912
if (*(text+7) == 't' || *(text+7) == 'T') {
917
if(*text != '\0') text++;
919
if (pc->lhtml_script_passthru == 2 ){
920
while(*text != '\0') {
922
if (*(text+2) == 'p' || *(text+2) == 'P') {
923
if (*(text+4) == 'e' || *(text+4) == 'E') {
928
if(*text != '\0') text++;
931
tmp = (char *) malloc((size_t)(text-html+1));
933
fprintf(stderr, "WARNING [libhtmlparse]: memory error\n");
937
strncpy(tmp, html, (size_t)(text-html));
939
if (pc->textCallBack != NULL) {
940
int ret = pc->textCallBack(tmp, pc);
952
pc->lhtml_script_passthru = 0;
963
/* I must admit, I like conditional expressions,
964
they are so obviously obfuscated ;-) */
966
html = (*html == '-') ?
967
((pc->commentCallBack) ? parseComment(html, pc) : eatUp(html)) :
968
((pc->declCallBack) ? parseDecl(html, pc) : eatUp(html)) ;
970
case '?' : /* XML/PHP tag */
971
html = (pc->xmlCallBack != NULL || pc->phpCallBack != NULL) ?
975
case '/' : /* HTML end tag */
976
html = (pc->endCallBack) ?
977
parseEndTag(html, pc) :
980
default : /* HTML start tag */
981
html = (pc->XHTMLCallBack != NULL || pc->startCallBack != NULL) ?
982
parseStartTag(html, pc) :
986
} else { /* All other text */
987
/* while(isspace(*html))html++; it seems to be faster inside the function */
988
html = (pc->textCallBack) ?
998
/* ******************* now: LE specifics *************** */
1004
static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,
1006
struct EXTRACTOR_Keywords * next) {
1007
EXTRACTOR_KeywordList * result;
1009
if (keyword == NULL)
1011
result = (EXTRACTOR_KeywordList*)malloc(sizeof(EXTRACTOR_KeywordList));
1012
result->next = next;
1013
result->keyword = strdup(keyword);
1014
result->keywordType = type;
1019
* Called by the parser whenever we see text.
1021
static int texts (char *comment, struct PC_ * pc) {
1022
if (pc->nextTextAction) {
1023
pc->result = addKeyword(pc->nextKeywordType,
1026
pc->nextTextAction = 0;
1031
static int hasTag(char * arg,
1033
struct ArgvTable * args,
1036
for (i=0;i<numargs;i++)
1037
if (0 == strcasecmp(args[i].arg, arg))
1038
if (0 == strcasecmp(args[i].val, val))
1043
static char * getTag(char * arg,
1044
struct ArgvTable * args,
1047
for (i=0;i<numargs;i++)
1048
if (0 == strcasecmp(args[i].arg, arg))
1055
EXTRACTOR_KeywordType type;
1057
{ "author" , EXTRACTOR_AUTHOR},
1058
{ "description" , EXTRACTOR_DESCRIPTION},
1059
{ "language", EXTRACTOR_LANGUAGE},
1060
{ "rights", EXTRACTOR_COPYRIGHT},
1061
{ "publisher", EXTRACTOR_PUBLISHER},
1062
{ "date", EXTRACTOR_DATE},
1063
{ "keywords", EXTRACTOR_KEYWORDS},
1064
{NULL, EXTRACTOR_UNKNOWN},
1069
static int starttag (char *tag,
1070
struct ArgvTable *args,
1075
if (0 == strcasecmp(tag,"title")) {
1076
pc->nextTextAction = 1;
1077
pc->nextKeywordType = EXTRACTOR_TITLE;
1080
if (0 == strcasecmp(tag,"meta")) {
1082
while (tagmap[i].name != NULL) {
1083
if (hasTag("name",tagmap[i].name,args, numargs))
1084
pc->result = addKeyword(tagmap[i].type,
1091
/* Don't do this, you can't be certain...*/
1093
if (0 == strcasecmp(tag,"html")) {
1094
pc->result = addKeyword(EXTRACTOR_MIMETYPE,
1103
static int endtag (char *tag, struct PC_ * pc) {
1104
pc->nextTextAction = 0;
1109
/* mimetype = text/html */
1110
struct EXTRACTOR_Keywords * libextractor_html_extract(const char * filename,
1113
struct EXTRACTOR_Keywords * prev) {
1122
sizeof(ParserContext));
1124
pc.textCallBack = &texts;
1125
pc.startCallBack = &starttag;
1126
pc.endCallBack = &endtag;
1127
if (size > 1024 * 32)
1131
/* the parser requires 0-termination. We just
1132
overwrite the last character in data and
1133
restore it later, assuming that it can
1134
hardly be a keyword in a valid HTML text...
1136
backup = data[xsize-1];
1137
data[xsize-1] = '\0';
1139
data[xsize-1] = backup;
1144
int main(int argc, char **argv) {
1147
struct stat fstatbuf;
1152
"Call with filename as argument\n");
1155
file = OPEN(filename,O_RDONLY);
1158
if (-1 == FSTAT(file, &fstatbuf)) {
1162
size = fstatbuf.st_size;
1163
buffer = mmap(NULL, size, PROT_READ, MAP_SHARED, file, 0);
1166
EXTRACTOR_printKeywords(stdout,
1167
libextractor_html_extract(argv[1],