2
$Id: xml.c,v 1.58 2003/03/28 16:31:46 whmoseley Exp $
5
** This program and library is free software; you can redistribute it and/or
6
** modify it under the terms of the GNU (Library) General Public License
7
** as published by the Free Software Foundation; either version 2
8
** of the License, or any later version.
10
** This program is distributed in the hope that it will be useful,
11
** but WITHOUT ANY WARRANTY; without even the implied warranty of
12
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
** GNU (Library) General Public License for more details.
16
** 2001-03-17 rasc save real_filename as title (instead full real_path)
17
** was: compatibility issue to v 1.x.x
18
** 2001-05-09 rasc entities changed (new module)
20
** 2001-07-25 moseley complete rewrite to use James Clark's Expat parser
23
** UndefinedMetaTags ignore is not coded
34
#include "metanames.h"
36
#include "expat/xmlparse/xmlparse.h" // James Clark's Expat
38
#define BUFFER_CHUNK_SIZE 20000
41
/* Members of the growable text accumulator; they are accessed through
   CHAR_BUFFER pointers in append_buffer() and flush_buffer(). */
char *buffer; // storage for collected text; (re)allocated with erealloc() in append_buffer()
42
int cur; // offset in buffer where append_buffer() copies new text and flush_buffer() writes the NUL (an index, not a pointer)
43
int max; // size bookkeeping for buffer; append_buffer() allocates max+1 bytes
44
int defaultID; // default ID for no meta names. (not referenced by the code visible in this file section)
48
// I think that the property system can deal with StoreDescription in a cleaner way.
49
// This code shouldn't need to know about that StoreDescription.
52
/* Members describing the StoreDescription (summary) state for the current document. */
struct metaEntry *meta; /* property entry from getPropNameByName(header, AUTOPROPERTY_SUMMARY); set in countwords_XML() */
53
int save_size; /* meta->max_len as it was before this document; restored at the end of countwords_XML() when non-zero */
54
char *tag; /* summary tag: stordesc->field, compared case-insensitively against tag names in start_hndl()/end_hndl() */
55
int active; /* inside summary: when non-zero, flush_buffer() also adds the text to the summary property */
60
/* Members of the per-document parse state passed to every Expat callback as user data.
   NOTE(review): other members used by the code (sw, fprop, parser, filenum, word_pos,
   total_words) are not visible in this listing. */
CHAR_BUFFER text_buffer; // buffer for collecting text between tags; emptied by flush_buffer()
62
// CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props
64
SUMMARY_INFO summary; // StoreDescription state (argh.)
66
char *ignore_tag; // tag that triggered ignore (currently used for both); set in start_hndl(), cleared in end_hndl(); points at the config entry, so never freed
71
INDEXDATAHEADER *header; // index header of sw->indexlist; used for metaname and property lookups
74
FileRec *thisFileEntry; // file record given to countwords_XML(); its docProperties receive the properties
80
/* Forward declarations: the Expat callbacks registered in countwords_XML()
   (start_hndl, end_hndl, char_hndl and, optionally, comment_hndl) plus the
   text-buffer and ignore-list helpers defined later in this file. */
static void start_hndl(void *data, const char *el, const char **attr);
81
static void end_hndl(void *data, const char *el);
82
static void char_hndl(void *data, const char *txt, int txtlen);
83
static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen );
84
static void flush_buffer( PARSE_DATA *parse_data );
85
static void comment_hndl(void *data, const char *txt);
86
static char *isIgnoreMetaName(SWISH * sw, char *tag);
91
/*********************************************************************
*   countwords_XML - entry point for indexing one XML document.
*
*   Creates an Expat XML_Parser, registers the handlers defined in this
*   file, and hands the whole in-memory document to XML_Parse() in a
*   single call.
*
*   Parameters:
*     sw     - SWISH handle; sw->indexlist supplies the index header and
*              sw->indexComments enables the comment handler
*     fprop  - file properties; real_path, fsize and stordesc are read here
*     fi     - file record; fi->filenum identifies the document and the
*              record is kept in parse_data.thisFileEntry
*     buffer - document text; fprop->fsize bytes are passed to XML_Parse()
*
*   Returns:
*     Count of words indexed (parse_data.total_words)
*
*   ToDo:
*     This is a stream parser, so could avoid loading entire document
*     into RAM before parsing
*
*   NOTE(review): this listing is incomplete (braces and some statements
*   are absent, and stray line-number lines are interleaved), so the
*   comments below describe only the statements that are visible.
*********************************************************************/
104
int countwords_XML (SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
106
PARSE_DATA parse_data;
107
XML_Parser p = XML_ParserCreate(NULL);
108
IndexFILE *indexf = sw->indexlist;
109
struct StoreDescription *stordesc = fprop->stordesc;
113
/* Start from an all-zero parse state: NULL text buffer, no ignore tag, zero word count */
memset(&parse_data, 0, sizeof(parse_data));
115
parse_data.header = &indexf->header;
116
parse_data.parser = p;
118
parse_data.fprop = fprop;
119
parse_data.filenum = fi->filenum;
120
parse_data.word_pos= 1; /* compress doesn't like zero */
121
parse_data.thisFileEntry = fi;
124
/* Don't really like this, as mentioned above */
125
/* Summary setup happens only when this file has a StoreDescription setting
   and the AUTOPROPERTY_SUMMARY property exists in the header */
if ( stordesc && (parse_data.summary.meta = getPropNameByName(parse_data.header, AUTOPROPERTY_SUMMARY)))
127
/* Set property limit size for this document type, and store previous size limit */
128
parse_data.summary.save_size = parse_data.summary.meta->max_len;
129
parse_data.summary.meta->max_len = stordesc->size;
130
parse_data.summary.tag = stordesc->field;
134
addCommonProperties(sw, fprop, fi, NULL,NULL, 0);
139
/* NOTE(review): the condition guarding this call is not visible in this listing;
   presumably it fires only when XML_ParserCreate() returned NULL - confirm */
progerr("Failed to create XML parser object for '%s'", fprop->real_path );
142
/* Set event handlers */
143
XML_SetUserData( p, (void *)&parse_data ); // local data to pass around
144
XML_SetElementHandler(p, start_hndl, end_hndl);
145
XML_SetCharacterDataHandler(p, char_hndl);
147
/* Comments are indexed only when the indexComments setting is on */
if( sw->indexComments )
148
XML_SetCommentHandler( p, comment_hndl );
150
//XML_SetProcessingInstructionHandler(p, proc_hndl);
152
/* Final argument 1 marks this as the last (and only) piece of the document.
   A parse failure is reported through progwarn() with the file, line and Expat message. */
if ( !XML_Parse(p, buffer, fprop->fsize, 1) )
153
progwarn("XML parse error in file '%s' line %d. Error: %s",
154
fprop->real_path, XML_GetCurrentLineNumber(p),XML_ErrorString(XML_GetErrorCode(p)));
160
/* NOTE(review): no XML_ParserFree(p) is visible in this listing - confirm the
   parser is released before returning */
/* Flush any text left in the buffer, and free the buffer */
161
flush_buffer( &parse_data );
163
if ( parse_data.text_buffer.buffer )
164
efree( parse_data.text_buffer.buffer );
167
/* Restore the size in the StoreDescription property */
168
/* save_size doubles as the "summary was set up" flag, so a saved max_len of 0 is never restored */
if ( parse_data.summary.save_size )
169
parse_data.summary.meta->max_len = parse_data.summary.save_size;
171
return parse_data.total_words;
174
/*********************************************************************
*   Start Tag Event Handler
*
*   Registered as the start-element callback in countwords_XML().
*   The visible code flushes the text collected so far, copies and
*   lowercases the tag name, bumps the word position, and then checks the
*   tag against the ignore list, the metanames and the property names
*   to see if it should be indexed and/or added as a property.
*
*   Parameters:
*     data - user data pointer; the PARSE_DATA set with XML_SetUserData()
*     el   - tag name
*     attr - tag attributes; not referenced by the visible code
*
*   To Do:
*     deal with attributes!
*
*   NOTE(review): this listing is incomplete (braces, early returns, the
*   declaration of m and the bodies of several branches are absent), so
*   the comments below describe only the statements that are visible.
*********************************************************************/
186
static void start_hndl(void *data, const char *el, const char **attr)
188
PARSE_DATA *parse_data = (PARSE_DATA *)data;
190
SWISH *sw = parse_data->sw;
191
char tag[MAXSTRLEN + 1];
194
/* return if within an ignore block */
195
if ( parse_data->ignore_tag )
198
/* Flush any text in the buffer */
199
flush_buffer( parse_data );
202
/* tag[] has room for MAXSTRLEN characters plus the NUL; names of MAXSTRLEN
   characters or more are reported here (end_hndl tests with > instead of >=) */
if(strlen(el) >= MAXSTRLEN) // easy way out
204
progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
208
/* NOTE(review): unbounded copy - safe only if over-long names leave the function
   after the warning above; that early return is not visible in this listing */
strcpy(tag,(char *)el);
209
strtolower( tag ); // $$$ swish ignores case in xml tags!
213
/* Bump on all meta names, unless overridden */
214
/* Done before the ignore tag check since still need to bump */
216
if (!isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
217
parse_data->word_pos++;
220
/* check for ignore tag (should probably remove char handler for speed) */
221
/* isIgnoreMetaName() returns the matching config entry; storing it makes the
   handlers treat following input as ignored until end_hndl clears it */
if ( (parse_data->ignore_tag = isIgnoreMetaName( sw, tag )))
225
/* Check for metaNames */
227
if ( (m = getMetaNameByName( parse_data->header, tag)) )
232
/* NOTE(review): presumably reached only when the tag is not an existing metaname;
   the branch structure is not visible in this listing */
if (sw->UndefinedMetaTags == UNDEF_META_AUTO)
235
/* Announce on stdout, then add the tag as a META_INDEX metaname and increment its in_tag counter */
printf("!!!Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);
237
addMetaEntry( parse_data->header, tag, META_INDEX, 0)->in_tag++;
241
/* If set to "error" on undefined meta tags, then error */
242
if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
243
progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
247
/* Check property names */
249
if ( (m = getPropNameByName( parse_data->header, tag)) )
253
/* Look to enable StoreDescription */
255
SUMMARY_INFO *summary = &parse_data->summary;
256
/* Case-insensitive match against the configured summary tag; the statement
   executed on a match is not visible in this listing */
if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
263
/*********************************************************************
*   End Tag Event Handler
*
*   Registered as the end-element callback in countwords_XML().
*   The visible code copies the tag name, ends an ignore block when the
*   tag that started it closes, flushes the collected text, bumps the
*   word position, and looks the tag up as a metaname and as a property.
*
*   Parameters:
*     data - user data pointer; the PARSE_DATA set with XML_SetUserData()
*     el   - tag name
*
*   NOTE(review): this listing is incomplete (braces, early returns, the
*   declaration of m and the bodies of several branches are absent), so
*   the comments below describe only the statements that are visible.
*********************************************************************/
271
static void end_hndl(void *data, const char *el)
273
PARSE_DATA *parse_data = (PARSE_DATA *)data;
274
char tag[MAXSTRLEN + 1];
277
/* tag[] has room for MAXSTRLEN characters plus the NUL (start_hndl tests with >= instead of >) */
if(strlen(el) > MAXSTRLEN)
279
progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
283
/* NOTE(review): unbounded copy - safe only if over-long names leave the function
   after the warning above; that early return is not visible in this listing.
   Also, no lowercasing of tag is visible here although start_hndl lowercases
   before its lookups - confirm the tag is lowercased here too */
strcpy(tag,(char *)el);
286
/* Inside an ignore block: only the closing tag that matches the stored ignore tag ends it */
if ( parse_data->ignore_tag )
288
if (strcmp( parse_data->ignore_tag, tag ) == 0)
289
parse_data->ignore_tag = NULL; // don't free since it's a pointer to the config setting
293
/* Flush any text in the buffer */
294
flush_buffer( parse_data );
297
/* Don't allow matching across tag boundary */
298
if (!isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
299
parse_data->word_pos++;
303
/* Flag that we are not in tag anymore - tags must be balanced, of course. */
305
/* The statements executed when these lookups succeed are not visible in this listing */
if ( ( m = getMetaNameByName( parse_data->header, tag) ) )
310
if ( ( m = getPropNameByName( parse_data->header, tag) ) )
315
/* Look to disable StoreDescription */
317
SUMMARY_INFO *summary = &parse_data->summary;
318
/* Case-insensitive match against the configured summary tag; the statement
   executed on a match is not visible in this listing */
if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
324
/*********************************************************************
*   Character Data Event Handler
*
*   Registered as the character-data callback in countwords_XML().
*   The visible code only appends the text to parse_data->text_buffer;
*   the words are added to the index, and the properties stored, later
*   by flush_buffer().
*
*   Parameters:
*     data   - user data pointer; the PARSE_DATA set with XML_SetUserData()
*     txt    - character data; txtlen bytes are appended
*     txtlen - number of bytes at txt
*********************************************************************/
333
static void char_hndl(void *data, const char *txt, int txtlen)
335
PARSE_DATA *parse_data = (PARSE_DATA *)data;
338
/* If currently in an ignore block, then return */
339
/* (the return statement itself is not visible in this listing) */
if ( parse_data->ignore_tag )
342
/* Buffer the text */
343
append_buffer( &parse_data->text_buffer, txt, txtlen );
345
/* Some day, might want to have a separate property buffer if need to collect more than plain text */
346
// append_buffer( parse_data->prop_buffer, txt, txtlen );
350
/*********************************************************************
*   Append character data to the end of the buffer
*
*   Buffer is extended/created if needed
*
*   Parameters:
*     buf    - text buffer to append to
*     txt    - bytes to append
*     txtlen - number of bytes at txt
*
*   ToDo: Flush buffer if it gets too large
*
*   NOTE(review): this listing is incomplete; the statement that advances
*   buf->cur after the copy, and the action taken for a zero txtlen, are
*   not visible.
*********************************************************************/
360
static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen )
363
if ( !txtlen ) // shouldn't happen
367
/* (re)allocate buf if needed */
369
if ( buf->cur + txtlen >= buf->max )
371
/* Grow by one chunk (BUFFER_CHUNK_SIZE+1); if even that is smaller than what
   is needed, size to exactly cur+txtlen+1 instead */
buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < buf->cur + txtlen )
372
? buf->cur + txtlen+1
373
: buf->max + BUFFER_CHUNK_SIZE+1;
375
/* One byte more than max is allocated, which leaves room for the NUL that
   flush_buffer() writes at buffer[cur] */
buf->buffer = erealloc( buf->buffer, buf->max+1 );
379
/* Copy the new text in at the current end offset; no terminator is written here */
memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
386
/*********************************************************************
*   Flush buffer - adds words to index, and properties
*
*   NUL-terminates the collected text, indexes it with indexstring(),
*   passes it to addDocProperties(), and, when the summary is active,
*   also adds it to the summary property.
*
*   Parameters:
*     parse_data - parse state holding the text buffer, file number,
*                  word position and running word count
*
*   2001-08 jmruiz Change structure from IN_FILE | IN_META to IN_FILE
*   Since structure does not make much sense in XML, if we use only IN_FILE
*   we will save memory and disk space (one byte per location)
*
*   NOTE(review): this listing is incomplete; the "anything to do?" test
*   and the statement that clears the buffer are not visible.
*********************************************************************/
395
static void flush_buffer( PARSE_DATA *parse_data )
397
CHAR_BUFFER *buf = &parse_data->text_buffer;
398
SWISH *sw = parse_data->sw;
400
/* anything to do? */
404
/* Terminate the text so it can be handed over as a C string; append_buffer()
   allocates one byte beyond max for this */
buf->buffer[buf->cur] = '\0';
408
/* Index the text as IN_FILE; word_pos is passed by address so indexstring() can update it */
parse_data->total_words +=
409
indexstring( sw, buf->buffer, parse_data->filenum, IN_FILE, 0, NULL, &(parse_data->word_pos) );
412
/* Add the properties */
413
addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path );
416
/* yuck. Ok, add to summary, if active */
418
SUMMARY_INFO *summary = &parse_data->summary;
419
if ( summary->active )
420
addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 );
424
/* clear the buffer */
430
/*********************************************************************
*   Comment Event Handler
*
*   Registered with XML_SetCommentHandler() in countwords_XML() only when
*   sw->indexComments is set.  Indexes the comment text directly as
*   IN_COMMENTS, bumping the word position before and after.
*
*   Parameters:
*     data - user data pointer; the PARSE_DATA set with XML_SetUserData()
*     txt  - comment text, passed to indexstring() as a C string
*
*   Should be able to call the char_hndl
*
*   Can't use DontBump with comments. Might need a config variable for that.
*
*   NOTE(review): this listing is incomplete; statements between the
*   visible lines may be missing.
*********************************************************************/
439
static void comment_hndl(void *data, const char *txt)
441
PARSE_DATA *parse_data = (PARSE_DATA *)data;
442
SWISH *sw = parse_data->sw;
445
/* Bump position around comments - hard coded, always done to prevent phrase matching */
446
parse_data->word_pos++;
449
/* The const is cast away only to match the indexstring() parameter type */
parse_data->total_words +=
450
indexstring( sw, (char *)txt, parse_data->filenum, IN_COMMENTS, 0, NULL, &(parse_data->word_pos) );
453
/* Second bump, after the comment text */
parse_data->word_pos++;
459
/*********************************************************************
*   check if a tag is an IgnoreTag
*
*   Compares tag (exact, case-sensitive strcmp) against the entries of
*   sw->ignoremetalist.
*
*   Parameters:
*     sw  - SWISH handle; sw->ignoremetalist is the list searched
*     tag - tag name to look up
*
*   Returns:
*     The matching list entry's line on a match.
*
*   Note: this returns a pointer to the config set tag, so don't free it!
*
*   NOTE(review): this listing is incomplete; the loop header, the
*   handling of an empty list and the no-match return are not visible.
*********************************************************************/
467
static char *isIgnoreMetaName(SWISH * sw, char *tag)
469
struct swline *tmplist = sw->ignoremetalist;
476
if (strcmp(tag, tmplist->line) == 0)
477
return tmplist->line;
479
/* Advance to the next configured ignore tag */
tmplist = tmplist->next;