1
/* Implementation of some simple, context-free lexers. */
3
/* Copyright (C) 1997 Andrew McCallum
5
Written by: Andrew Kachites McCallum <mccallum@cs.cmu.edu>
7
This file is part of the Bag-Of-Words Library, `libbow'.
9
This library is free software; you can redistribute it and/or
10
modify it under the terms of the GNU Library General Public License
11
as published by the Free Software Foundation, version 2.
13
This library is distributed in the hope that it will be useful,
14
but WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
Library General Public License for more details.
18
You should have received a copy of the GNU Library General Public
19
License along with this library; if not, write to the Free Software
20
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */
23
#include <ctype.h> /* for isalpha() */
28
/* Shorthand that casts the identifier `self' in the current scope to
   an `ifile_lexer_simple *'. */
#define SELF ((ifile_lexer_simple*)self)
30
/* This function is defined in scan.c.  In this file it is called to
   move FP forward to a lexer's document start pattern. */
31
extern int ifile_scan_fp_for_string (FILE *fp, const char *string, int oneline);
33
/* Argument record defined in another translation unit.  The code
   visible in this file reads only its `max_length' member, to cap the
   length of a document. */
extern arguments args;
35
/* Create and return an IFILE_LEX, filling the document buffer from
   characters in FP, starting after the DOCUMENT_START_PATTERN, and
   ending with the DOCUMENT_END_PATTERN (or at EOF). */
39
/* Build the IFILE_LEX for one document read from FP.

   As far as the visible statements show, this allocates the lex object
   (SELF->SIZEOF_LEX bytes) and a document buffer, moves FP forward to
   just after SELF->DOCUMENT_START_PATTERN, and then copies bytes from
   FP into RET->DOCUMENT until EOF or until SELF->DOCUMENT_END_PATTERN
   has been read.  The buffer is NUL-terminated, RET->DOCUMENT_POSITION
   is reset to 0, and RET->DOCUMENT_LENGTH is capped by ARGS.MAX_LENGTH
   when that is non-zero.

   Preconditions (checked with assert): SELF->DOCUMENT_START_PATTERN is
   non-NULL, and SELF->DOCUMENT_END_PATTERN is either NULL or a
   non-empty string. */
ifile_lexer_simple_open_text_fp (ifile_lexer *self,
42
int document_size = 2048; /* the initial size of the document buffer */
43
int len; /* an index into RET->DOCUMENT */
44
ifile_lex *ret; /* the IFILE_LEX we will return. */
45
const char *end_pattern_ptr; /* next character of the end pattern to match */
46
int byte; /* a character read from FP */
51
/* Create space for the document buffer. */
52
ret = ifile_malloc (self->sizeof_lex); /* size is supplied by the lexer itself */
53
ret->document = ifile_malloc (document_size);
55
/* Make sure DOCUMENT_START_PATTERN is not NULL; this would cause
56
it to scan forward to EOF. */
57
assert (self->document_start_pattern);
59
/* Scan forward in the file until we find the start pattern. */
60
ifile_scan_fp_for_string (fp, self->document_start_pattern, 0); /* final 0 is the ONELINE argument */
62
/* Make sure the DOCUMENT_END_PATTERN isn't the empty string; this
63
would cause it to match and finish filling immediately. */
64
assert (!self->document_end_pattern || self->document_end_pattern[0]);
66
/* Fill the document buffer until we get EOF, or until we get to the
67
DOCUMENT_END_PATTERN. */
68
for (len = 0, end_pattern_ptr = self->document_end_pattern;
70
(((byte = fgetc (fp)) != EOF)
71
/* We found the DOCUMENT_END_PATTERN */
73
&& *end_pattern_ptr == byte && *(end_pattern_ptr+1) == '\0'));
76
if (len >= document_size-1)
78
/* The RET->DOCUMENT buffer must grow to accommodate more chars. */
79
/* We need `DOCUMENT_SIZE-1' in the above test, because we
80
must have room for the terminating '\0'! */
82
/* NOTE(review): DOCUMENT_SIZE is presumably enlarged just before this
   call; that statement is not visible here -- confirm. */
ret->document = ifile_realloc (ret->document, document_size);
85
/* Put the byte in the document buffer. */
86
ret->document[len] = byte;
88
/* If the byte matches the next character of the DOCUMENT_END_PATTERN
89
then prepare to match the next character of the pattern,
90
otherwise reset to the beginning of the pattern. */
93
if (byte == *end_pattern_ptr)
95
else if (byte == self->document_end_pattern[0])
96
end_pattern_ptr = self->document_end_pattern+1; /* mismatch, but BYTE itself starts a new match */
98
end_pattern_ptr = self->document_end_pattern; /* back to the first pattern character */
104
ifile_free (ret->document);
110
/* If we found the DOCUMENT_END_PATTERN, push it back into the input
111
stream, so we'll see it next time we read from this file. */
112
/* xxx Will this work for stdin? */
115
int end_pattern_len = (self->document_end_pattern
116
? strlen (self->document_end_pattern)
118
/* Rewind FP by the length of the pattern; a failing fseek is reported
   with perror. */
if (end_pattern_len && fseek (fp, -end_pattern_len, SEEK_CUR) != 0)
119
perror (__PRETTY_FUNCTION__);
120
len -= end_pattern_len; /* the pattern bytes were copied into the buffer; drop them */
124
/* Remember, it may be the case that LEN is zero. */
125
ret->document_position = 0;
126
ret->document_length = len;
127
/* A non-zero ARGS.MAX_LENGTH truncates the reported document length. */
if (args.max_length != 0 && args.max_length < ret->document_length)
128
ret->document_length = args.max_length;
129
assert (ret->document_length < document_size);
130
/* NUL-terminate the buffer at the (possibly capped) length. */
((char*)ret->document)[ret->document_length] = '\0';
134
/* Close the LEX buffer, freeing the memory held by it.  The visible
   code releases LEX->DOCUMENT with ifile_free; SELF is not referenced
   by that statement.
   NOTE(review): whether LEX itself is also released cannot be seen
   here -- confirm. */
136
ifile_lexer_simple_close (ifile_lexer *self, ifile_lex *lex)
138
ifile_free (lex->document);
142
/* Get the raw token from the document buffer by scanning forward
   until we get a start character, and filling the buffer until we get
   an ending character.  The resulting token in the buffer is
   NUL-terminated.  Return the length of the token.

   Characters are taken from LEX->DOCUMENT at LEX->DOCUMENT_POSITION,
   which is advanced as they are consumed.  SELF->TRUE_TO_START decides
   which character may begin a word and SELF->FALSE_TO_END is consulted
   for every following character.  At most BUFLEN characters are
   stored in BUF; reaching that limit is reported through ifile_error.
   NOTE(review): the statements that terminate BUF and return the
   length are not visible here -- confirm. */
147
ifile_lexer_simple_get_raw_word (ifile_lexer_simple *self, ifile_lex *lex,
148
char *buf, int buflen)
150
int byte; /* a character read from LEX->DOCUMENT */
151
int wordlen; /* number of characters in the word so far */
153
/* Ignore characters until we get a beginning character. */
156
byte = lex->document[lex->document_position++];
160
while (! self->true_to_start (byte));
162
/* Add the first alphabetic character to the word. */
163
buf[0] = (self->case_sensitive) ? byte : tolower (byte);
165
/* Add all the following satisfying characters to the word. */
166
for (wordlen = 1; wordlen < buflen; wordlen++)
168
byte = lex->document[lex->document_position++];;
171
if (! self->false_to_end (byte))
173
/* NOTE(review): unlike the first character above, this lowercases
   unconditionally and ignores SELF->CASE_SENSITIVE -- confirm whether
   that is intended. */
buf[wordlen] = tolower (byte);
176
if (wordlen >= buflen)
177
ifile_error ("Encountered word longer than buffer length=%d", buflen);
179
/* Back up to point at the character that caused the end of the word. */
180
lex->document_position--;
188
/* Perform all the necessary postprocessing after the initial token
   boundaries have been found.  In the order the checks appear below:
   toss words longer than SELF->TOSS_WORDS_LONGER_THAN, strip
   non-alphas from the end, toss words containing non-alphas, toss
   words containing too many digits, toss words appearing in the stop
   list, stem the word, check the stoplist again, toss words of length
   one.  Each step other than stemming and the length-one check is
   guarded by its SELF field being non-zero.  If the word is tossed,
   return zero, otherwise return the length of the word.

   BUF holds the NUL-terminated word; its length is taken with strlen.
   LEX and BUFLEN are not referenced by the visible statements. */
195
ifile_lexer_simple_postprocess_word (ifile_lexer_simple *self, ifile_lex *lex,
196
char *buf, int buflen)
198
int wordlen = strlen (buf);
200
/* Toss words that are longer than SELF->TOSS_WORDS_LONGER_THAN */
201
if (self->toss_words_longer_than)
203
if (wordlen > self->toss_words_longer_than)
207
if (self->strip_non_alphas_from_end)
209
/* Strip any non-alphabetic characters off the end of the word */
210
/* NOTE(review): BUF holds plain `char'; the <ctype.h> functions expect
   a value representable as unsigned char (or EOF) -- confirm that the
   input is 7-bit, or that a cast is not needed. */
while (wordlen && !isalpha(buf[wordlen-1]))
218
if (self->toss_words_containing_non_alphas)
220
/* If the word contains any non-alphabetic characters, get
221
another word instead. */
224
for (bufp = buf; *bufp; bufp++)
226
if (!isalpha (*bufp))
232
/* If the word contains TOSS_WORDS_CONTAINING_THIS_MANY_DIGITS
233
number of digits, get another word instead. (Here the
234
variable BYTE holds the count of the number of digits.) */
235
if (self->toss_words_containing_this_many_digits)
239
for (bufp = buf, byte = 0; *bufp; bufp++)
242
/* NOTE(review): with `>' the test fires only once the count exceeds
   the configured number -- confirm whether `>=' was meant. */
if (++byte > self->toss_words_containing_this_many_digits)
247
/* First stoplist check, on the unstemmed word. */
if (self->stoplist_func && self->stoplist_func (buf))
250
/* Apply the stemming algorithm to the word. */
252
/* NOTE(review): the lexer tables in this file set this member to 0;
   a guard for that case is not visible here -- confirm one exists. */
self->stem_func (buf);
254
/* If the result of stemming is on the stoplist, go back and start again. */
255
if (self->stoplist_func && self->stoplist_func (buf))
258
/* If the result of stemming is only one letter long, go back and
263
/* Return the length of the word we found. */
267
/* Scan a single token from the LEX buffer, placing it in BUF, and
   returning the length of the token.  BUFLEN is the maximum number of
   characters that will fit in BUF.  If the token won't fit in BUF,
   an error is raised.

   SELF is cast to `ifile_lexer_simple *' and handed to
   ifile_lexer_simple_get_raw_word and then to
   ifile_lexer_simple_postprocess_word; the loop condition re-tests the
   length reported by the postprocessing step.
   NOTE(review): the loop structure and the handling of an exhausted
   document are only partly visible here -- confirm. */
272
ifile_lexer_simple_get_word (ifile_lexer *self, ifile_lex *lex,
273
char *buf, int buflen)
275
int wordlen; /* number of characters in the word so far */
279
wordlen = ifile_lexer_simple_get_raw_word ((ifile_lexer_simple*)self,
284
while ((wordlen = ifile_lexer_simple_postprocess_word
285
((ifile_lexer_simple*)self, lex, buf, buflen))
290
/* The end of the ifile_lexer_simple_ functions. */
294
/* A function wrapper around the <ctype.h> `isalpha' macro, so that it
   can be stored as a function pointer in the lexer tables of this
   file.  Returns whatever `isalpha' returns for CHARACTER. */
296
ifile_isalpha (int character)
298
return isalpha (character);
301
/* A function wrapper around the <ctype.h> `isgraph' macro, so that it
   can be stored as a function pointer in the lexer tables of this
   file.  Returns whatever `isgraph' returns for CHARACTER. */
303
ifile_isgraph (int character)
305
return isgraph (character);
309
/* A lexer that keeps all alphabetic strings, delimited by
   non-alphabetic characters.  For example, the string
   `http://www.cs.cmu.edu' will result in the tokens `http', `www',
   `cs', `cmu', `edu'. */
313
const ifile_lexer_simple _ifile_alpha_lexer =
317
ifile_lexer_simple_open_text_fp, /* fills the document buffer from a FILE */
318
ifile_lexer_simple_get_word, /* returns the next postprocessed token */
319
ifile_lexer_simple_close, /* frees the document buffer */
320
"", /* document start pattern begins right away */
321
NULL /* document end pattern goes to end */
323
ifile_isalpha, /* begin words with an alphabetic char */
324
ifile_isalpha, /* end words with any non-alphabetic char */
325
ifile_stoplist_present, /* use the default stoplist */
326
0, /* don't use the Porter stemming algorithm */
327
NO, /* be case-INsensitive */
328
NO, /* don't strip non-alphas from end */
329
NO, /* don't toss words w/ non-alphas */
330
0, /* don't toss words with digits */
331
59 /* toss words longer than 59 chars, uuenc=60 */
333
/* Pointer to the table above. */
const ifile_lexer_simple *ifile_alpha_lexer = &_ifile_alpha_lexer;
335
/* A lexer that throws out all space-delimited strings that have any
   non-alphabetical characters.  For example, the string `obtained
   from http://www.cs.cmu.edu' will result in the tokens `obtained'
   and `from', but the URL will be skipped. */
339
const ifile_lexer_simple _ifile_alpha_only_lexer =
343
ifile_lexer_simple_open_text_fp, /* fills the document buffer from a FILE */
344
ifile_lexer_simple_get_word, /* returns the next postprocessed token */
345
ifile_lexer_simple_close, /* frees the document buffer */
346
"", /* document start pattern begins right away */
347
NULL /* document end pattern goes to end */
349
ifile_isalpha, /* begin words with an alphabetic char */
350
ifile_isgraph, /* end words with space */
351
ifile_stoplist_present, /* use the default stoplist */
352
0, /* don't use the Porter stemming algorithm */
353
NO, /* be case-INsensitive */
354
YES, /* strip non-alphas from end */
355
YES, /* toss words w/ non-alphas */
356
3, /* toss words with 3 digits */
357
59 /* toss words longer than 59 chars, uuenc=60 */
359
/* Pointer to the table above. */
const ifile_lexer_simple *ifile_alpha_only_lexer = &_ifile_alpha_only_lexer;
361
/* A lexer that keeps all strings that begin and end with alphabetic
   characters, delimited by white-space.  For example,
   the string `http://www.cs.cmu.edu' will be a single token. */
364
const ifile_lexer_simple _ifile_white_lexer =
368
ifile_lexer_simple_open_text_fp, /* fills the document buffer from a FILE */
369
ifile_lexer_simple_get_word, /* returns the next postprocessed token */
370
ifile_lexer_simple_close, /* frees the document buffer */
371
"", /* document start pattern begins right away */
372
NULL /* document end pattern goes to end */
374
ifile_isalpha, /* begin words with an alphabetic char */
375
ifile_isgraph, /* end words with any non-graphic char, e.g. white-space */
376
ifile_stoplist_present, /* use the default stoplist */
377
0, /* don't use the Porter stemming algorithm */
378
NO, /* be case-INsensitive */
379
YES, /* strip non-alphas from end */
380
NO, /* don't toss words w/ non-alphas */
381
4, /* toss words with 4 digits */
382
59 /* toss words longer than 59 chars, uuenc=60 */
384
/* Pointer to the table above. */
const ifile_lexer_simple *ifile_white_lexer = &_ifile_white_lexer;