1
/* $Id: Keys.cpp,v 1.4 2003/06/23 14:38:41 terpstra Exp $
3
* Keys.cpp - Digest a hunk of string into keywords.
5
* Copyright (C) 2002 - Wesley W. Terpstra
9
* Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
11
* This program is free software; you can redistribute it and/or modify
12
* it under the terms of the GNU General Public License as published by
13
* the Free Software Foundation; version 2.
15
* This program is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
20
* You should have received a copy of the GNU General Public License
21
* along with this program; if not, write to the Free Software
22
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
#define _XOPEN_SOURCE 500
26
#define _FILE_OFFSET_BITS 64
34
/*------------------------------------------------ Private global vars */
36
/* These are characters which should be interpreted as both part of the word
37
* and as a word separator. e.g. 'maul.sith.vpn' should be indexed as 'maul',
38
* 'sith', 'vpn', and 'maul.sith.vpn' because '.' is listed here.
40
static const char my_keyword_word_splits[] = "$@./:\\-_~&=%?#+";
41
/* Flag table indexed by byte value: my_keyword_init() zeroes it and then sets
 * the entry for every character of my_keyword_word_splits to 1. */
static char my_keyword_is_split[256];
43
/* These are characters which should be interpreted as word breaks.
44
* No known language should use these as letters in a word.
45
* All chars 000-037 fall in this category too.
47
static const char my_keyword_word_divs[] = " !\"'()*,;<>[]^`{|}";
48
/* Flag table indexed by byte value: my_keyword_init() zeroes it, sets the
 * entry for every character of my_keyword_word_divs to 1, and also sets
 * entries 0 through 037 (octal). */
static char my_keyword_is_div[256];
50
/* These tables are the conversion for characters being written to keywords.
52
static const char my_keyword_orig[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
53
static const char my_keyword_dest[] = "abcdefghijklmnopqrstuvwxyz";
54
/* Conversion table indexed by byte value: my_keyword_init() first makes it the
 * identity mapping and then overrides the entries for the characters of
 * my_keyword_orig. my_keyword_index_hunk() passes every byte it copies
 * through this table. */
static char my_keyword_conv[256];
56
/* Initialisation flag: starts at 1 (lookup tables not built yet).
 * my_keyword_init() sets it to 0, and my_keyword_digest_string() tests it
 * before it starts scanning. */
57
static int my_keyword_initd = 1;
59
/*------------------------------------------------ Private helper methods */
61
/* Combine the prefix with the substring.
 *
 * Builds one keyword in the local buffer `out`: the prefix string first, then
 * the bytes of [buf, eos) passed through my_keyword_conv. Once the keyword is
 * built it is handed to writefn together with arg, and writefn's result is
 * returned.
 *
 * NOTE(review): the prefix is copied with strcpy() and no length check is
 * shown next to it, so the prefix has to fit into LU_KEYWORD_LEN characters -
 * confirm that every caller guarantees this.
 */
62
static int my_keyword_index_hunk(
63
const unsigned char* buf,
64
const unsigned char* eos,
66
int (*writefn)(const char* keyword, void* arg),
69
char out[LU_KEYWORD_LEN+1];
74
{ /* Don't index nothing */
78
/* A quick check to avoid function calls */
81
strcpy(&out[0], prefix);
82
/* w is the write position: it starts right after the copied prefix */
w = &out[strlen(prefix)];
89
/* e is the last element of out; the copy loop never writes to it
 * (presumably it is kept for the terminating NUL - TODO confirm) */
e = &out[sizeof(out) - 1];
91
/* Copy the range into the buffer while converting it; the loop ends when the
 * input is exhausted or when w reaches e, so a long range is cut short */
92
while (w != e && buf != eos)
94
*w++ = my_keyword_conv[*buf++];
100
/* Ignore this keyword */
104
return writefn(&out[0], arg);
107
/* Look at a section of non-whitespace chars and decide what to do with it.
 *
 * The whole range [buf, eos) is passed to my_keyword_index_hunk() first.
 * When do_div is zero nothing else is indexed and 0 is returned. Otherwise
 * the range is scanned byte by byte: at each byte flagged in
 * my_keyword_is_split the piece from start up to that byte is passed to
 * my_keyword_index_hunk() as well, and finally the piece from start to eos.
 * Each my_keyword_index_hunk() result is compared against 0.
 */
108
static int my_keyword_digest_hunk(
109
const unsigned char* buf,
110
const unsigned char* eos,
112
int (*writefn)(const char* keyword, void* arg),
116
const unsigned char* start;
117
const unsigned char* scan;
119
/*!!! Make me work with non-Roman-script languages (e.g. Japanese) */
120
/* Japanese has no spaces to delineate words */
122
/* Don't index vapour.
127
/* Firstly, index the entire chunk, with leading and trailing chars.
130
/* Index the entire hunk. */
131
if (my_keyword_index_hunk(buf, eos, prefix, writefn, arg) != 0)
134
/* The caller did not ask for sub-division: the whole-hunk keyword is enough */
if (!do_div) return 0;
136
/* Now, divide the chunk into bits which we will keyword index */
138
for (scan = buf; scan != eos; scan++)
140
/* A split character ends the piece that began at start */
if (my_keyword_is_split[*scan])
144
if (my_keyword_index_hunk(start, scan,
145
prefix, writefn, arg) != 0)
161
/* Index the last piece, which runs up to the end of the hunk */
if (my_keyword_index_hunk(start, eos, prefix, writefn, arg) != 0)
168
/* Build the character lookup tables (my_keyword_is_split, my_keyword_is_div,
 * my_keyword_conv) and then set my_keyword_initd to 0. */
static void my_keyword_init(void)
172
/* Clear the lookup tables */
173
memset(&my_keyword_is_split[0], 0, sizeof(my_keyword_is_split));
174
memset(&my_keyword_is_div [0], 0, sizeof(my_keyword_is_div));
176
/* Bootstrap the lookup tables: flag every listed character. The "-1" on each
 * sizeof leaves out the string literal's terminating NUL. */
177
for (i = 0; i < sizeof(my_keyword_word_splits)-1; i++)
178
my_keyword_is_split[((int)my_keyword_word_splits[i])] = 1;
179
for (i = 0; i < sizeof(my_keyword_word_divs)-1; i++)
180
my_keyword_is_div[((int)my_keyword_word_divs[i])] = 1;
182
/* All control characters divide words (040 is octal, so this covers the
 * byte values 0 through 31) */
183
for (i = 0; i < 040; i++)
184
my_keyword_is_div[i] = 1;
186
/* Initialize conversion table: every byte value maps to itself */
187
for (i = 0; i < 256; i++)
188
my_keyword_conv[i] = i;
190
/* Fill the conversion entries: override the slot of each my_keyword_orig
 * character */
191
for (i = 0; i < sizeof(my_keyword_orig)-1; i++)
192
my_keyword_conv[((int)my_keyword_orig[i])] =
195
my_keyword_initd = 0;
198
/*------------------------------------------------- Public component methods */
200
/* Run through a buffer looking for segments of non-divide characters.
202
int my_keyword_digest_string(
206
int (*writefn)(const char* keyword, void* arg),
210
const unsigned char* start;
211
const unsigned char* scan;
212
/* eos points one past the last of the len input bytes */
const unsigned char* eos = (const unsigned char*)buf + len;
214
/* my_keyword_initd is non-zero until my_keyword_init() has built the tables */
if (my_keyword_initd)
218
/* Walk the input byte by byte; the bytes are read as unsigned char so they
 * can index the 256-entry lookup tables directly */
for (scan = (const unsigned char*)buf; scan != eos; scan++)
220
/* A divider byte ends the current segment, which is handed to
 * my_keyword_digest_hunk() */
if (my_keyword_is_div[*scan])
224
/* NOTE(review): the value returned by my_keyword_digest_hunk() is not used in
 * this statement - confirm that failures reported by writefn are meant to be
 * ignored here. */
my_keyword_digest_hunk(start, scan,
225
prefix, writefn, arg, do_div);
240
/* The final segment runs from start to the end of the buffer */
my_keyword_digest_hunk(start, eos, prefix, writefn, arg, do_div);