2
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
3
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
5
** This program and library is free software; you can redistribute it and/or
6
** modify it under the terms of the GNU (Library) General Public License
7
** as published by the Free Software Foundation; either version 2
8
** of the License, or any later version.
10
** This program is distributed in the hope that it will be useful,
11
** but WITHOUT ANY WARRANTY; without even the implied warranty of
12
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
** GNU (Library) General Public License for more details.
15
** You should have received a copy of the GNU (Library) General Public License
16
** along with this program; if not, write to the Free Software
17
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
**---------------------------------------------------------
19
** ** ** PATCHED 5/13/96, CJC
20
** Added MatchAndChange for regex in replace rule G.Hill 2/10/98
22
** change sprintf to snprintf to avoid corruption
23
** added safestrcpy() macro to avoid corruption from strcpy overflow
26
** fixed cast to int problems pointed out by "gcc -Wall"
36
/* My own case-insensitive strstr().
45
for (i = 0; s[i]; i++) {
46
for (j = 0, l = k = i; s[k] && t[j] &&
47
tolower(s[k]) == tolower(t[j]); j++, k++)
55
/* Gets the next word in a line. If the word's in quotes,
56
** include blank spaces in the word or phrase.
59
char *getword(line, skiplen)
66
static char *word=NULL;
68
if(!lenword) word = (char *)emalloc((lenword=MAXWORDLEN) + 1);
72
while (isspace((int)*line)) line++;
81
for (i = 0; *line && ((inquotes) ? (*line != '\"') : (!isspace((int)*line))); line++) {
84
word = (char *)erealloc(word,lenword+1);
91
if ((p=strpbrk(word,"\r\n"))) *p='\0';
96
*skiplen = line - start;
101
/* Gets the value of a variable in a line of the configuration file.
102
** Basically, anything in quotes or an argument to a variable.
105
char *getconfvalue(line, var)
111
static int lentmpvalue=0;
112
static char *tmpvalue=NULL;
114
if(!lentmpvalue) tmpvalue = (char *) emalloc((lentmpvalue=MAXSTRLEN) + 1);
115
if ((c = (char *) lstrstr(line, var)) != NULL) {
119
while (isspace((int)*c) || *c == '\"')
123
for (i = 0; *c != '\0' && *c != '\"' && *c != '\n' && *c!= '\r' ; c++) {
126
tmpvalue= (char *) erealloc(tmpvalue,lentmpvalue +1);
137
/* Extracts anything in <title> tags from an HTML file and returns it.
138
** Otherwise, only the file name without its path is returned.
141
char *parsetitle(filename, alttitle)
148
static int lentitle=0;
149
static char *title=NULL;
150
static int lenshorttitle=0;
151
static char *shorttitle=NULL;
152
int i, j, lines, status, tagbuflen, totaltaglen, curlentitle;
155
if(!lentitle) title = (char *) emalloc((lentitle=MAXTITLELEN) +1);
156
if(!lenshorttitle) shorttitle = (char *) emalloc((lenshorttitle=MAXTITLELEN) +1);
157
tag = (char *) emalloc(1);
163
if ((q=strrchr(alttitle, '/')))
167
shorttitle = SafeStrCopy(shorttitle,q,&lenshorttitle);
169
fp = fopen(filename, "r");
174
for (; lines < TITLETOPLINES ; ) {
187
tag = (char *) emalloc((tagbuflen=MAXSTRLEN)+1);
189
tag[totaltaglen++] = '<';
199
if(totaltaglen==tagbuflen) {
201
tag=erealloc(tag,tagbuflen+1);
203
tag[totaltaglen++] = d;
205
tag[totaltaglen]='\0';
210
if (lstrstr(tag, "</title>")) {
214
for (i = 0; title[i]; i++)
215
if (title[i] == '\n')
217
for (i = 0; isspace((int)title[i]) ||
218
title[i] == '\"'; i++)
220
for (j = 0; title[i]; j++)
221
title[j] = title[i++];
223
for (j = strlen(title) - 1;
224
(j && isspace((int)title[j]))
225
|| title[j] == '\0' || title[j] == '\"'; j--)
227
for (j = 0; title[j]; j++)
228
if (title[j] == '\"')
231
return *title ? title : shorttitle;
234
if (lstrstr(tag, "<title>"))
239
if (status == TI_FOUND) {
241
if(curlentitle==lentitle) {
243
title = (char *)erealloc(title,lentitle +1);
244
p = title + curlentitle;
250
if (status == TI_CLOSE) {
263
/* Is a character a valid word character?
266
/* Old version - now a macro with a lookup table for better performance
267
int iswordchar(char c)
273
for (i = 0; wordchars[i] != '\0'; i++)
275
if ((char)d == wordchars[i])
282
/* In a string, replaces all occurrences of "oldpiece" with "newpiece".
283
** This is not really bulletproof yet.
288
char *replace(string, oldpiece, newpiece)
293
int limit, curpos, lennewpiece, lenoldpiece, curnewlen;
295
static int lennewstring=0;
296
static char *newstring=NULL;
298
if(!lennewstring) newstring = (char *) emalloc((lennewstring=MAXSTRLEN) + 1);
300
lennewpiece = strlen(newpiece);
301
lenoldpiece = strlen(oldpiece);
305
while ((p = (char *) strstr(c, oldpiece))) {
307
curnewlen += (limit + lennewpiece);
308
if(curnewlen > lennewstring) {
309
curpos = q - newstring;
310
lennewstring = curnewlen + 200;
311
newstring = (char *) erealloc(newstring,lennewstring+1);
316
memcpy(q,newpiece,lennewpiece);
320
curnewlen +=strlen(c);
321
if(curnewlen > lennewstring) {
322
curpos = q - newstring;
323
lennewstring = curnewlen + 200;
324
newstring = (char *) erealloc(newstring,lennewstring+1);
331
/* Just for A.P. and K.H. 2/5/98 by G.Hill - not really used now */
332
char* replaceWild (char* fileName, char* pattern, char* subs)
335
for (i = 0; pattern[i] != '*' && fileName[i] != '\0'; i++)
337
if (fileName[i] != pattern[i])
343
/* Like strcmp(), but the order of sorting the first char is
344
** determined by the order of the characters in the wordchars array.
348
int wordcompare(s1, s2)
354
if (s1[0] != s2[0]) {
355
for (i = 0; wordchars[i] != '\0'; i++)
356
if (s1[0] == wordchars[i])
358
for (j = 0; wordchars[j] != '\0'; j++)
360
if (s2[0] == wordchars[j])
369
return strcmp(s1, s2);
373
/* This converts HTML numbered entities (such as &#169;)
374
** to strings (like &copy;). Much of this function is
375
** simply adding semicolons in the right places.
376
** This and the functions it calls are not very fast
377
** and could be made faster.
380
char *convertentities(s)
385
static char *ent=NULL;
386
static int lennewword=0;
387
static char *newword=NULL;
388
char *newwordconvert;
391
if(!lennewword) newword = (char *) emalloc((lennewword=MAXWORDLEN) +1);
392
if(!lenent) ent = (char *) emalloc((lenent=MAXENTLEN) +1);
394
if (!(p=strchr(s, '&'))) return s;
396
if(lens>lennewword) {
397
lennewword = lens + 200;
398
newword = (char *) erealloc(newword,lennewword + 1);
400
if ((int)strlen(s) > maxwordlimit) return s;
405
for (s=p; *s != '\0'; s++) {
407
ent = SafeStrCopy(ent,getent(s, &skip),&lenent);
408
if (ent[0] == '\0') {
426
/* Jose Ruiz 06/00 Do not call converttonamed
427
** here. converttoascii does all the work
429
newwordconvert = (char *) converttoascii(newword);
430
newword = SafeStrCopy(newword,newwordconvert,&lennewword);
432
newwordconvert = (char *) converttonamed(newword);
433
newword = SafeStrCopy(newword,newwordconvert,&lennewword);
439
/* Returns the entity that matches the beginning of a string, if any.
442
char *getent(s, skip)
448
static char *ent=NULL;
449
static int lentestent=0;
450
static char *testent=NULL;
452
if(!lenent) ent = (char *)emalloc((lenent=MAXENTLEN) +1);
453
if(!lentestent) testent = (char *)emalloc((lentestent=MAXENTLEN) +1);
456
ent = SafeStrCopy(ent,s,&lenent);
458
if (isdigit((int)ent[5]))
460
for (i = 2; ent[i] != '\0' && isdigit((int)ent[i]); i++)
462
while (ent[i] != '\0' && !isdigit((int)ent[i]))
468
for (i = 0; entities[i] != NULL; i += 3) {
469
testent = SafeStrCopy(testent, entities[i],&lentestent);
470
if (testent[0] != '\0') {
471
if (!strncmp(testent, ent, strlen(testent))) {
472
ent = SafeStrCopy(ent, testent,&lenent);
483
/* This is the real function called by convertentities() that
484
** changes numbered to named entities.
487
char *converttonamed(s)
490
int i, hasnumbered, ilen;
491
static int lentestent=0;
492
static char *testent=NULL;
493
static int lennewent=0;
494
static char *newent=NULL;
495
static int lennewword=0;
496
static char *newword=NULL;
497
char *newwordreplaced;
499
if(!lennewword) newword = (char *) emalloc((lennewword=MAXWORDLEN) + 1);
500
if(!lentestent) testent = (char *) emalloc((lentestent=MAXENTLEN) + 1);
501
if(!lennewent) newent = (char *) emalloc((lennewent=MAXENTLEN) + 1);
503
newword=SafeStrCopy(newword, s,&lennewword);
505
for (i = 0, hasnumbered = 0; entities[i] != NULL; i += 3) {
506
ilen=strlen(entities[i+1]);
507
if((ilen+1)>=lentestent) {
508
lentestent=ilen+1+100;
509
testent=erealloc(testent,lentestent+1);
511
memcpy(testent, entities[i + 1],ilen);
513
testent[ilen+1]='\0';
514
if (strstr(newword, testent) != NULL &&
515
(entities[i])[0] != '\0') {
517
ilen=strlen(entities[i]);
518
if((ilen+1)>=lennewent) {
519
lennewent=ilen+1+100;
520
newent=erealloc(newent,lennewent+1);
522
memcpy(newent,entities[i],ilen);
525
newwordreplaced = (char *) replace(newword, testent, newent);
526
newword=SafeStrCopy(newword,newwordreplaced,&lennewword);
527
strcpy(newword,newwordreplaced);
530
} while (hasnumbered);
534
/* This function converts all convertible named and numbered
535
** entities to their ASCII equivalents, if they exist.
538
char *converttoascii(s)
541
int i, hasnonascii,ilen;
543
static int lenwrdent=0;
544
static char *wrdent=NULL;
545
static int lennument=0;
546
static char *nument=NULL;
547
static int lennewword=0;
548
static char *newword=NULL;
549
char *newwordreplaced;
551
if(!lennewword) newword = (char *) emalloc((lennewword=MAXWORDLEN) + 1);
552
if(!lenwrdent) wrdent = (char *) emalloc((lenwrdent=MAXENTLEN) + 1);
553
if(!lennument) nument = (char *) emalloc((lennument=MAXENTLEN) + 1);
555
newword=SafeStrCopy(newword, s, &lennewword);
558
for (i = 0, hasnonascii = 0; entities[i] != NULL; i += 3) {
559
ilen=strlen(entities[i]);
560
if((ilen+1)>=lenwrdent) {
561
lenwrdent=ilen+1+200;
562
wrdent=erealloc(wrdent,lenwrdent+1);
564
memcpy(wrdent,entities[i],ilen);
567
ilen=strlen(entities[i+1]);
568
if((ilen+1)>=lennument) {
569
lennument=ilen+1+200;
570
nument=erealloc(nument,lennument+1);
572
memcpy(nument,entities[i+1],ilen);
577
if ((entities[i])[0] != '\0')
578
c = (char *) strstr(newword, wrdent);
579
if ((entities[i + 1])[0] != '\0')
580
d = (char *) strstr(newword, nument);
581
if ((entities[i + 2])[0] != '\0' && (c!=NULL || d!=NULL)) {
583
if (c != NULL && d==NULL) {
584
newwordreplaced = (char *) replace(newword, wrdent, entities[i + 2]);
585
newword=SafeStrCopy(newword,newwordreplaced,&lennewword);
586
} else if (d != NULL && c==NULL) {
587
newwordreplaced = (char *) replace(newword, nument, entities[i + 2]);
588
newword=SafeStrCopy(newword,newwordreplaced,&lennewword);
590
newwordreplaced = (char *) replace(newword, wrdent, entities[i + 2]);
591
newword=SafeStrCopy(newword,newwordreplaced,&lennewword);
592
newwordreplaced = (char *) replace(newword, nument, entities[i + 2]);
593
newword=SafeStrCopy(newword,newwordreplaced,&lennewword);
597
} while (hasnonascii);
602
/* That regular expression matching and replacing thing */
603
char * matchAndChange (char *str, char *pattern, char *subs)
605
int status, lenSub, lenBeg, lenTmp;
607
regmatch_t pmatch[MAXPAR];
609
char begin[MAXSTRLEN];
610
static int lennewstr=0;
611
static char *newstr=NULL;
612
static int lenoldStr=0;
613
static char *oldStr=NULL;
615
if(!lenoldStr) oldStr = (char *)emalloc((lenoldStr=MAXSTRLEN) +1);
616
if(!lennewstr) newstr = (char *)emalloc((lennewstr=MAXSTRLEN) +1);
618
/* Save the old string just in case */
619
oldStr = SafeStrCopy(oldStr,str,&lenoldStr);
621
status = regcomp(&re, pattern, REG_EXTENDED);
623
regfree(&re); /* Richard Beebe */
627
status = regexec(&re,str,(size_t)MAXPAR,pmatch,0);
629
regfree(&re); /* Richard Beebe */
635
/* Stuff the new piece in where needed */
636
strncpy(begin,tmpstr,pmatch[0].rm_so); /* get the beginning */
637
begin[pmatch[0].rm_so] = '\0'; /* Null terminate */
639
lenBeg = strlen(begin);
640
lenSub = strlen(subs);
641
lenTmp = strlen(&(tmpstr[pmatch[0].rm_eo]));
642
if ( (lenTmp + lenSub + lenBeg) >= lennewstr)
644
lennewstr = lenTmp + lenSub + lenBeg + 200;
645
newstr = (char *)erealloc(newstr,lennewstr +1);
648
memcpy(newstr,begin,lenBeg);
649
memcpy(newstr+lenBeg,subs,lenSub);
650
memcpy(newstr+lenBeg+lenSub,&(tmpstr[pmatch[0].rm_eo]),lenTmp);
651
newstr[lenBeg+lenSub+lenTmp]='\0';
653
/* Copy the newstr into the tmpstr */
655
safestrcpy(MAXSTRLEN,tmpstr,newstr); /* CAREFUL! tmpstr points to an arg of unknown length */
656
/* Position the pointer to the end of the subs string */
657
tmpstr = &(tmpstr[pmatch[0].rm_so+lenSub]);
658
status = regexec(&re,tmpstr,(size_t)5,pmatch,0);
663
/*---------------------------------------------------------*/
664
/* Match a regex and a string */
666
int matchARegex( char *str, char *pattern)
671
status = regcomp(&re, pattern, REG_EXTENDED);
673
regfree(&re); /* Richard Beebe */
677
status = regexec(&re,str,(size_t)0,NULL,0);
678
regfree(&re); /** Marc Perrin ## 18Jan99 **/
684
/*-----------------------------------------------------*/
685
void makeItLow (char *str)
688
int len = strlen(str);
689
for (i = 0; i < len; i++)
690
str[i] = tolower(str[i]);
692
/*----------------------------------------------------*/
694
/* Check if a file with a particular suffix should be indexed
695
** according to the settings in the configuration file.
698
int isoksuffix(filename, rulelist)
700
struct swline *rulelist;
704
static int lensuffix=0;
705
static char *suffix=NULL;
706
static int lenchecksuffix=0;
707
static char *checksuffix=NULL;
708
struct swline *tmplist;
710
if(!lensuffix) suffix = (char *) emalloc((lensuffix=MAXSUFFIXLEN) + 1);
711
if(!lenchecksuffix) checksuffix = (char *) emalloc((lenchecksuffix=MAXSUFFIXLEN) + 1);
717
if ((c = (char *) strrchr(filename, '.')) == NULL)
720
if ((int)strlen(c+1) >= MAXSUFFIXLEN)
724
checksuffix = SafeStrCopy(checksuffix, c + 1,&lenchecksuffix);
725
while (tmplist != NULL) {
726
if ((c = (char *) strrchr(tmplist->line, '.')) == NULL)
727
{ suffix = SafeStrCopy(suffix, tmplist->line,&lensuffix); }
729
{ suffix = SafeStrCopy(suffix, c + 1,&lensuffix); }
730
if (lstrstr(suffix, checksuffix) && strlen(suffix) ==
733
tmplist = tmplist->next;
739
** Function to copy strings
740
** Reallocate memory if needed
741
** Returns the string copied
743
char *SafeStrCopy(dest, orig, initialsize)
752
*initialsize=len + 200;
754
dest = (char *) erealloc(dest,*initialsize + 1);
756
dest = (char *) emalloc(*initialsize + 1);
763
/* Comparison routine to sort a string - See sortstring */
764
int ccomp(const void *s1,const void *s2)
766
return (*(unsigned char *)s1 - *(unsigned char *)s2);
769
/* Sort a string removing dups */
770
void sortstring(char *s)
774
qsort(s,len,1,&ccomp);
775
for(i=1,j=1;i<len;i++) if(s[i]!=s[j-1]) s[j++]=s[i];
780
/* Merges two strings removing dups and ordering results */
781
char *mergestrings(char *s1, char *s2)
783
int i,j,ilen1,ilen2,ilent;
790
if (ilen1) memcpy(s,s1,ilen1);
791
if (ilen2) memcpy(s+ilen1,s2,ilen2);
792
if (ilent) qsort(s,ilent,1,&ccomp);
793
for(i=1,j=1,p[0]=s[0];i<ilent;i++) if(s[i]!=p[j-1]) p[j++]=s[i];
799
void makelookuptable(char* s,int *l)
802
for(i=0;i<256;i++) l[i]=0;
803
for(;*s;s++)l[(int)((unsigned char)*s)]=1;
806
void makeallstringlookuptables(void)
808
makelookuptable("aeiouAEIOU",isvowellookuptable);
811
/* 06/00 Jose Ruiz- Parses a line into a StringList
813
StringList *parse_line(char *line)
816
int cursize,skiplen,maxsize;
818
if(!line) return(NULL);
819
if((p=strchr(line,'\n')))
822
sl=(StringList *)emalloc(sizeof(StringList));
823
sl->word=(char **)emalloc((maxsize=1)*sizeof(char *));
826
while(skiplen && *(p=(char *)getword(line,&skiplen)))
829
sl->word=(char **)erealloc(sl->word,(maxsize*=2)*sizeof(char *));
830
sl->word[cursize++]=(char *)estrdup(p);
837
/* Frees memory used by a StringList
839
void freeStringList(StringList *sl)
843
efree(sl->word[--sl->n]);