~ubuntu-branches/ubuntu/raring/ifile/raring

« back to all changes in this revision

Viewing changes to lex-simple.c

Committer: Bazaar Package Importer
Author(s): Jens Peter Secher
Date: 2004-11-19 23:30:24 UTC
Revision ID: james.westby@ubuntu.com-20041119233024-3s7sqpy963jx22eu

Tags: upstream-1.3.4

Import upstream version 1.3.4

files added:

COPYING

ChangeLog

INSTALL

Makefile.in

NOTES

README

Version

argp

argp/ChangeLog

argp/INSTALL

argp/Makefile.in

argp/README

argp/argp-ba.c

argp/argp-ex1.c

argp/argp-ex2.c

argp/argp-ex3.c

argp/argp-ex4.c

argp/argp-fmtstream.c

argp/argp-fmtstream.h

argp/argp-fs-xinl.c

argp/argp-help.c

argp/argp-namefrob.h

argp/argp-parse.c

argp/argp-pv.c

argp/argp-pvh.c

argp/argp-test.c

argp/argp-xinl.c

argp/argp.c

argp/argp.h

argp/argp1.h

argp/configure

argp/configure.in

argp/getopt.c

argp/getopt.h

argp/getopt1.c

argp/install-sh

argp/mkinstalldirs

argp/pin.c

argp/strndup.c

argp/tester.c

configure

database.c

error.c

hash_table.c

ifile.1

ifile.c

include

include/extendable_array.h

include/hash_table.h

include/ifile.h

install-sh

int4str.c

istext.c

lex-define.c

lex-email.c

lex-indirect.c

lex-simple.c

mkinstalldirs

opts.c

primes.c

scan.c

stem.c

stoplist.c

stopwords.c

test.sh

util.c

Show diffs side-by-side

added added

removed removed

lex-simple.c

/* Implementation of some simple, context-free lexers. */

/* Written by: Andrew Kachites McCallum <mccallum@cs.cmu.edu>

This file is part of the Bag-Of-Words Library, `libbow'.

This library is free software; you can redistribute it and/or

modify it under the terms of the GNU Library General Public License

as published by the Free Software Foundation, version 2.

This library is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

Library General Public License for more details.

You should have received a copy of the GNU Library General Public

License along with this library; if not, write to the Free Software

Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <ifile.h>

#include <ctype.h> /* for isalpha() */

#define NO 0

#define YES 1

#define SELF ((ifile_lexer_simple*)self)

/* This function is defined in scan.c */

extern int ifile_scan_fp_for_string (FILE *fp, const char *string, int oneline);

extern arguments args;

/* Create and return a IFILE_LEX, filling the document buffer from

characters in FP, starting after the START_PATTERN, and ending with

the END_PATTERN. */

/* Returns NULL if FP is already at EOF, or if no bytes could be read
   into the document.  Otherwise the returned IFILE_LEX owns a
   NUL-terminated document buffer; release it with
   ifile_lexer_simple_close().

   Note that when a DOCUMENT_END_PATTERN is matched, the final byte of
   the pattern is consumed from FP but not stored, while the preceding
   bytes of the pattern remain in the document. */
ifile_lex *
ifile_lexer_simple_open_text_fp (ifile_lexer *self,
                                 FILE *fp)
{
  int document_size = 2048;     /* the current size of the document buffer */
  int len;                      /* an index into RET->DOCUMENT */
  ifile_lex *ret;               /* the IFILE_LEX we will return. */
  const char *end_pattern_ptr;  /* next end-pattern char we hope to match */
  int byte;                     /* a character read from FP */

  if (feof (fp))
    return NULL;

  /* Create space for the document buffer. */
  ret = ifile_malloc (self->sizeof_lex);
  ret->document = ifile_malloc (document_size);

  /* Make sure DOCUMENT_START_PATTERN is not NULL; this would cause
     it to scan forward to EOF. */
  assert (self->document_start_pattern);

  /* Scan forward in the file until we find the start pattern. */
  ifile_scan_fp_for_string (fp, self->document_start_pattern, 0);

  /* Make sure the DOCUMENT_END_PATTERN isn't the empty string; this
     would cause it to match and finish filling immediately. */
  assert (!self->document_end_pattern || self->document_end_pattern[0]);

  /* Fill the document buffer until we get EOF, or until we get to the
     DOCUMENT_END_PATTERN. */
  for (len = 0, end_pattern_ptr = self->document_end_pattern;
       /* Keep going unless we got EOF ... */
       (((byte = fgetc (fp)) != EOF)
        /* ... or BYTE completes the DOCUMENT_END_PATTERN. */
        && !(end_pattern_ptr
             && *end_pattern_ptr == byte && *(end_pattern_ptr+1) == '\0'));
       len++)
    {
      if (len >= document_size-1)
        {
          /* The RET->DOCUMENT buffer must grow to accommodate more chars. */
          /* We need `DOCUMENT_SIZE-1' in the above test, because we
             must have room for the terminating '\0'! */
          document_size *= 2;
          ret->document = ifile_realloc (ret->document, document_size);
        }

      /* Put the byte in the document buffer. */
      ret->document[len] = byte;

      /* If the byte matches the next character of the DOCUMENT_END_PATTERN
         then prepare to match the next character of the pattern,
         otherwise reset to the beginning of the pattern.
         NOTE(review): this is a simple restart, not a full failure
         function, so patterns with a repeated prefix can be missed. */
      if (end_pattern_ptr)
        {
          if (byte == *end_pattern_ptr)
            end_pattern_ptr++;
          else if (byte == self->document_end_pattern[0])
            end_pattern_ptr = self->document_end_pattern+1;
          else
            end_pattern_ptr = self->document_end_pattern;
        }
    }

  /* Nothing was read: there is no document to return. */
  if (len == 0)
    {
      ifile_free (ret->document);
      ifile_free (ret);
      return NULL;
    }

#if 0
  /* If we found the DOCUMENT_END_PATTERN, push it back into the input
     stream, so we'll see it next time we read from this file. */
  /* xxx Will this work for stdin? */
  if (byte != EOF)
    {
      int end_pattern_len = (self->document_end_pattern
                             ? strlen (self->document_end_pattern)
                             : 0);
      if (end_pattern_len && fseek (fp, -end_pattern_len, SEEK_CUR) != 0)
        perror (__PRETTY_FUNCTION__);
      len -= end_pattern_len;
    }
#endif

  /* LEN is at least one here; the empty case returned above. */
  ret->document_position = 0;
  ret->document_length = len;

  /* Truncate to the user-requested maximum.  Only a positive limit is
     honoured: a negative value would otherwise yield a negative
     length and an out-of-bounds write of the terminator below. */
  if (args.max_length > 0 && args.max_length < ret->document_length)
    ret->document_length = args.max_length;
  assert (ret->document_length < document_size);
  ((char*)ret->document)[ret->document_length] = '\0';

  return ret;
}

133

134

/* Close the LEX buffer, freeing the memory held by it. */

135

void

136

ifile_lexer_simple_close (ifile_lexer *self, ifile_lex *lex)

137

{

138

ifile_free (lex->document);

139

ifile_free (lex);

140

}

141

142

/* Get the raw token from the document buffer by scanning forward

143

until we get a start character, and filling the buffer until we get

144

an ending character. The resulting token in the buffer is

145

NULL-terminated. Return the length of the token. */

146

int

147

ifile_lexer_simple_get_raw_word (ifile_lexer_simple *self, ifile_lex *lex,

148

char *buf, int buflen)

149

{

150

int byte; /* characters read from the FP */

151

int wordlen; /* number of characters in the word so far */

152

153

/* Ignore characters until we get a beginning character. */

154

155

{

156

byte = lex->document[lex->document_position++];

157

if (byte == 0)

158

return 0;

159

}

160

while (! self->true_to_start (byte));

161

162

/* Add the first alphabetic character to the word. */

163

buf[0] = (self->case_sensitive) ? byte : tolower (byte);

164

165

/* Add all the following satisfying characters to the word. */

166

for (wordlen = 1; wordlen < buflen; wordlen++)

167

{

168

byte = lex->document[lex->document_position++];;

169

if (byte == 0)

170

break;

171

if (! self->false_to_end (byte))

172

break;

173

buf[wordlen] = tolower (byte);

174

}

175

176

if (wordlen >= buflen)

177

ifile_error ("Encountered word longer than buffer length=%d", buflen);

178

179

/* Back up to point at the character that caused the end of the word. */

180

lex->document_position--;

181

182

/* Terminate it. */

183

buf[wordlen] = '\0';

184

185

return wordlen;

186

}

187

188

/* Perform all the necessary postprocessing after the initial token

189

boundaries have been found: strip non-alphas from end, toss words

190

containing non-alphas, toss words containing certaing many digits,

191

toss words appearing in the stop list, stem the word, check the

192

stoplist again, toss words of length one. If the word is tossed,

193

return zero, otherwise return the length of the word. */

194

int

195

ifile_lexer_simple_postprocess_word (ifile_lexer_simple *self, ifile_lex *lex,

196

char *buf, int buflen)

197

{

198

int wordlen = strlen (buf);

199

200

/* Toss words that are longer than SELF->TOSS_WORDS_LONGER_THAN */

201

if (self->toss_words_longer_than)

202

{

203

if (wordlen > self->toss_words_longer_than)

204

return 0;

205

}

206

207

if (self->strip_non_alphas_from_end)

208

{

209

/* Strip any non-alphabetic characters off the end of the word */

210

while (wordlen && !isalpha(buf[wordlen-1]))

211

wordlen--;

212

/* Terminate it. */

213

buf[wordlen] = '\0';

214

if (wordlen == 0)

215

return 0;

216

}

217

218

if (self->toss_words_containing_non_alphas)

219

{

220

/* If the word contains any non-alphabetic characters, get

221

another word instead. */

222

{

223

char *bufp;

224

for (bufp = buf; *bufp; bufp++)

225

{

226

if (!isalpha (*bufp))

227

return 0;

228

}

229

}

230

}

231

232

/* If the word contain TOSS_WORDS_CONTAINING_THIS_MANY_DIGITS

233

number of digits, get another word instead. (Here the

234

variable BYTE holds the count of the number of digits.) */

235

if (self->toss_words_containing_this_many_digits)

236

{

237

int byte;

238

char *bufp;

239

for (bufp = buf, byte = 0; *bufp; bufp++)

240

{

241

if (isdigit (*bufp))

242

if (++byte > self->toss_words_containing_this_many_digits)

243

return 0;

244

}

245

}

246

247

if (self->stoplist_func && self->stoplist_func (buf))

248

return 0;

249

250

/* Apply the stemming algorithm to the word. */

251

if (self->stem_func)

252

self->stem_func (buf);

253

254

/* If the result of stemming is on the stoplist, go back and start again. */

255

if (self->stoplist_func && self->stoplist_func (buf))

256

return 0;

257

258

/* If the result of stemming is only one letter long, go back and

259

start again. */

260

if (buf[1] == '\0')

261

return 0;

262

263

/* Return the length of the word we found. */

264

return strlen (buf);

265

}

266

267

/* Scan a single token from the LEX buffer, placing it in BUF, and

268

returning the length of the token. BUFLEN is the maximum number of

269

characters that will fit in BUF. If the token won't fit in BUF,

270

an error is raised. */

271

int

272

ifile_lexer_simple_get_word (ifile_lexer *self, ifile_lex *lex,

273

char *buf, int buflen)

274

{

275

int wordlen; /* number of characters in the word so far */

276

277

278

{

279

wordlen = ifile_lexer_simple_get_raw_word ((ifile_lexer_simple*)self,

280

lex, buf, buflen);

281

if (wordlen == 0)

282

return 0;

283

}

284

while ((wordlen = ifile_lexer_simple_postprocess_word

285

((ifile_lexer_simple*)self, lex, buf, buflen))

286

== 0);

287

return wordlen;

288

}

/* The end of the ifile_lex_simple_ functions. */
#undef SELF


/* A function wrapper around POSIX's `isalpha' macro, usable as a
   function pointer.  CHARACTER may be EOF, an `unsigned char' value,
   or a plain `char' that was sign-extended to a negative int; the
   latter is folded back into the `unsigned char' range, because
   passing any other negative value to isalpha() is undefined. */
int
ifile_isalpha (int character)
{
  if (character != EOF)
    character = (unsigned char) character;
  return isalpha (character);
}

/* A function wrapper around POSIX's `isgraph' macro, usable as a
   function pointer.  CHARACTER may be EOF, an `unsigned char' value,
   or a plain `char' that was sign-extended to a negative int; the
   latter is folded back into the `unsigned char' range, because
   passing any other negative value to isgraph() is undefined. */
int
ifile_isgraph (int character)
{
  if (character != EOF)
    character = (unsigned char) character;
  return isgraph (character);
}

307

308

309

/* A lexer that keeps all alphabetic strings, delimited by

310

non-alphabetic characters. For example, the string

311

`http://www.cs.cmu.edu' will result in the tokens `http', `www',

312

`cs', `cmu', `edu'. */

313

const ifile_lexer_simple _ifile_alpha_lexer =

314

{

315

{

316

sizeof (ifile_lex),

317

ifile_lexer_simple_open_text_fp,

318

ifile_lexer_simple_get_word,

319

ifile_lexer_simple_close,

320

"", /* document start pattern begins right away */

321

NULL /* document end pattern goes to end */

322

323

ifile_isalpha, /* begin words with an alphabetic char */

324

ifile_isalpha, /* end words with any non-alphabetic char */

325

ifile_stoplist_present, /* use the default stoplist */

326

0, /* don't use the Porter stemming algorithm */

327

NO, /* be case-INsensitive */

328

NO, /* don't strip non-alphas from end */

329

NO, /* don't toss words w/ non-alphas */

330

0, /* don't toss words with digits */

331

59 /* toss words longer than 59 chars, uuenc=60 */

332

};

333

const ifile_lexer_simple *ifile_alpha_lexer = &_ifile_alpha_lexer;

334

335

/* A lexer that throws out all space-delimited strings that have any

336

non-alphabetical characters. For example, the string `obtained

337

from http://www.cs.cmu.edu' will result in the tokens `obtained'

338

and `from', but the URL will be skipped. */

339

const ifile_lexer_simple _ifile_alpha_only_lexer =

340

{

341

{

342

sizeof (ifile_lex),

343

ifile_lexer_simple_open_text_fp,

344

ifile_lexer_simple_get_word,

345

ifile_lexer_simple_close,

346

"", /* document start pattern begins right away */

347

NULL /* document end pattern goes to end */

348

349

ifile_isalpha, /* begin words with an alphabetic char */

350

ifile_isgraph, /* end words with space */

351

ifile_stoplist_present, /* use the default stoplist */

352

0, /* don't use the Porter stemming algorithm */

353

NO, /* be case-INsensitive */

354

YES, /* strip non-alphas from end */

355

YES, /* toss words w/ non-alphas */

356

3, /* toss words with 3 digits */

357

59 /* toss words longer than 59 chars, uuenc=60 */

358

};

359

const ifile_lexer_simple *ifile_alpha_only_lexer = &_ifile_alpha_only_lexer;

360

361

/* A lexer that keeps all strings that begin and end with alphabetic

362

characters, delimited by white-space. For example,

363

the string `http://www.cs.cmu.edu' will be a single token. */

364

const ifile_lexer_simple _ifile_white_lexer =

365

{

366

{

367

sizeof (ifile_lex),

368

ifile_lexer_simple_open_text_fp,

369

ifile_lexer_simple_get_word,

370

ifile_lexer_simple_close,

371

"", /* document start pattern begins right away */

372

NULL /* document end pattern goes to end */

373

374

ifile_isalpha, /* begin words with an alphabetic char */

375

ifile_isgraph, /* end words with any non-alphabetic char */

376

ifile_stoplist_present, /* use the default stoplist */

377

0, /* don't use the Porter stemming algorithm */

378

NO, /* be case-INsensitive */

379

YES, /* strip non-alphas from end */

380

NO, /* don't toss words w/ non-alphas */

381

4, /* toss words with 4 digits */

382

59 /* toss words longer than 59 chars, uuenc=60 */

383

};

384

const ifile_lexer_simple *ifile_white_lexer = &_ifile_white_lexer;

Older »