2
/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
3
Written by James Clark (jjc@jclark.com)
5
This file is part of groff.
7
groff is free software; you can redistribute it and/or modify it under
8
the terms of the GNU General Public License as published by the Free
9
Software Foundation; either version 2, or (at your option) any later
version.
12
groff is distributed in the hope that it will be useful, but WITHOUT ANY
13
WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
17
You should have received a copy of the GNU General Public License along
18
with groff; see the file COPYING. If not, write to the Free Software
19
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24
// Number of slots in token_table.  Hash values are reduced modulo this
// size to pick a starting slot.  1009 is prime.
#define TOKEN_TABLE_SIZE 1009
25
// I believe that in Icelandic, thorn sorts after z.  "{" is the ASCII
// character immediately after "z".
26
#define THORN_SORT_KEY "{"
28
struct token_table_entry {
34
token_table_entry token_table[TOKEN_TABLE_SIZE];
37
static void skip_name(const char **ptr, const char *end)
57
int get_token(const char **ptr, const char *end)
62
if (c == '\\' && *ptr < end) {
81
token_info::token_info()
82
: type(TOKEN_OTHER), sort_key(0), other_case(0)
86
void token_info::set(token_type t, const char *sk, const char *oc)
88
assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
94
void token_info::sortify(const char *start, const char *end, string &result)
99
else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
100
for (; start < end; start++)
102
result += cmlower(*start);
106
int token_info::sortify_non_empty(const char *start, const char *end) const
109
return *sort_key != '\0';
110
if (type != TOKEN_UPPER && type != TOKEN_LOWER)
112
for (; start < end; start++)
119
void token_info::lower_case(const char *start, const char *end,
120
string &result) const
122
if (type != TOKEN_UPPER) {
127
result += other_case;
130
result += cmlower(*start++);
134
void token_info::upper_case(const char *start, const char *end,
135
string &result) const
137
if (type != TOKEN_LOWER) {
142
result += other_case;
145
result += cmupper(*start++);
149
token_table_entry::token_table_entry()
154
// Enter `tok` into token_table with type `typ`, sort key `sk` and
// other-case form `oc` (the last two default to 0).
//
// The table keeps the `tok` pointer itself rather than a copy, so the
// string must stay alive for as long as the table is used.
//
// NOTE(review): this excerpt does not show every line of the function
// (the probe loop's control lines and the action taken when the table
// fills up are missing), so the comments below describe only the
// statements that are visible.
static void store_token(const char *tok, token_type typ,
155
const char *sk = 0, const char *oc = 0)
157
// Starting slot: hash of the whole string, reduced to a table index.
unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
159
// A null `tok` pointer marks an unused slot.
if (token_table[n].tok == 0) {
160
// Count the new entry and test whether that count reaches the table size.
if (++ntokens == TOKEN_TABLE_SIZE)
162
token_table[n].tok = tok;
165
// The slot already holds this exact token.
if (strcmp(tok, token_table[n].tok) == 0)
168
// Moves the probe index to the last slot -- presumably the wrap-around
// step of a downward linear probe; confirm against the full source.
n = TOKEN_TABLE_SIZE - 1;
172
// Record the token's classification in slot n.
token_table[n].ti.set(typ, sk, oc);
176
token_info default_token_info;
178
// Look up the token spelled by the characters in [start, end) and
// return a pointer to its token_info.  The last visible statement
// returns &default_token_info instead.
//
// NOTE(review): the probe loop's control lines are not visible in this
// excerpt; the comments below describe only the statements shown.
const token_info *lookup_token(const char *start, const char *end)
180
// Starting slot: same hash-and-reduce scheme that store_token uses.
unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
182
// A null `tok` pointer marks an unused slot.
if (token_table[n].tok == 0)
184
// Stored keys are NUL-terminated strings while the query is a pointer
// range, so match the lengths first and then compare the bytes.
if (strlen(token_table[n].tok) == size_t(end - start)
185
&& memcmp(token_table[n].tok, start, end - start) == 0)
186
return &(token_table[n].ti);
188
// Moves the probe index to the last slot -- presumably the wrap-around
// step of a downward linear probe; confirm against the full source.
n = TOKEN_TABLE_SIZE - 1;
192
return &default_token_info;
195
// Registers the plain ASCII tokens: the letters a-z in both cases, the
// digits, six punctuation characters and the hyphen.
//
// NOTE(review): the lines that declare `p` and fill `buf` are not
// visible in this excerpt; presumably `buf` holds the current character
// as a one-character string -- confirm against the full source.
static void init_ascii()
198
// Letters: register the lower-case form, then upper-case buf[0] in
// place and register that too.  No sort key or other-case form is
// passed, so store_token's defaults of 0 apply.
for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
202
store_token(strsave(buf), TOKEN_LOWER);
203
buf[0] = cmupper(buf[0]);
204
store_token(strsave(buf), TOKEN_UPPER);
206
// Digits: TOKEN_OTHER, with the saved string passed both as the token
// and as its own sort key.
for (p = "0123456789"; *p; p++) {
210
const char *s = strsave(buf);
211
store_token(s, TOKEN_OTHER, s);
213
// Punctuation characters . , : ; ? !
for (p = ".,:;?!"; *p; p++) {
217
store_token(strsave(buf), TOKEN_PUNCT);
219
// The hyphen gets its own token type.
store_token("-", TOKEN_HYPHEN);
222
// Registers a lower-case/upper-case pair of tokens.  `lower` is stored
// as TOKEN_LOWER and `upper` as TOKEN_UPPER; both get the same
// `sort_key` (default 0), and each is given the other as its
// other-case form.
static void store_letter(const char *lower, const char *upper,
223
const char *sort_key = 0)
225
store_token(lower, TOKEN_LOWER, sort_key, upper);
226
store_token(upper, TOKEN_UPPER, sort_key, lower);
229
static void init_letter(unsigned char uc_code, unsigned char lc_code,
230
const char *sort_key)
238
store_letter(strsave(lbuf), strsave(ubuf), sort_key);
241
// Registers the ISO 8859-1 (Latin-1) letters in the range 0xc0-0xff.
// Each init_letter call passes the upper-case code, the lower-case
// code, and the sort key to use for both.
static void init_latin1()
243
// 0xc0-0xc5 / 0xe0-0xe5: A with grave, acute, circumflex, tilde,
// diaeresis and ring.
init_letter(0xc0, 0xe0, "a");
244
init_letter(0xc1, 0xe1, "a");
245
init_letter(0xc2, 0xe2, "a");
246
init_letter(0xc3, 0xe3, "a");
247
init_letter(0xc4, 0xe4, "a");
248
init_letter(0xc5, 0xe5, "a");
249
// 0xc6 / 0xe6: AE ligature, given the two-letter sort key "ae".
init_letter(0xc6, 0xe6, "ae");
250
// 0xc7 / 0xe7: C with cedilla.
init_letter(0xc7, 0xe7, "c");
251
// 0xc8-0xcb / 0xe8-0xeb: E with grave, acute, circumflex, diaeresis.
init_letter(0xc8, 0xe8, "e");
252
init_letter(0xc9, 0xe9, "e");
253
init_letter(0xca, 0xea, "e");
254
init_letter(0xcb, 0xeb, "e");
255
// 0xcc-0xcf / 0xec-0xef: I with grave, acute, circumflex, diaeresis.
init_letter(0xcc, 0xec, "i");
256
init_letter(0xcd, 0xed, "i");
257
init_letter(0xce, 0xee, "i");
258
init_letter(0xcf, 0xef, "i");
260
// 0xd0 / 0xf0: eth.
init_letter(0xd0, 0xf0, "d");
261
// 0xd1 / 0xf1: N with tilde.
init_letter(0xd1, 0xf1, "n");
262
// 0xd2-0xd6 / 0xf2-0xf6: O with grave, acute, circumflex, tilde,
// diaeresis.
init_letter(0xd2, 0xf2, "o");
263
init_letter(0xd3, 0xf3, "o");
264
init_letter(0xd4, 0xf4, "o");
265
init_letter(0xd5, 0xf5, "o");
266
init_letter(0xd6, 0xf6, "o");
267
// 0xd7 / 0xf7 do not appear: in Latin-1 those codes are the
// multiplication and division signs rather than letters.
// 0xd8 / 0xf8: O with stroke.
init_letter(0xd8, 0xf8, "o");
268
// 0xd9-0xdc / 0xf9-0xfc: U with grave, acute, circumflex, diaeresis.
init_letter(0xd9, 0xf9, "u");
269
init_letter(0xda, 0xfa, "u");
270
init_letter(0xdb, 0xfb, "u");
271
init_letter(0xdc, 0xfc, "u");
272
// 0xdd / 0xfd: Y with acute.
init_letter(0xdd, 0xfd, "y");
273
// 0xde / 0xfe: thorn.
init_letter(0xde, 0xfe, THORN_SORT_KEY);
275
// 0xdf (sharp s) and 0xff (y with diaeresis) have no single-byte
// capital in Latin-1, so they are registered as lower case only, with
// plain ASCII strings as their other-case forms.
store_token("\337", TOKEN_LOWER, "ss", "SS");
276
store_token("\377", TOKEN_LOWER, "y", "Y");
279
static void init_two_char_letter(char l1, char l2, char u1, char u2,
288
const char *p = strsave(buf);
291
store_letter(p, strsave(buf), sk);
298
store_letter(strsave(buf), p, sk);
302
// Registers tokens for troff special-character escapes: accented
// vowels, a set of two-character letter names, and the hyphen and
// en-dash escapes.
//
// NOTE(review): most of init_two_char_letter is not visible in this
// excerpt.  Presumably it builds the \(xy and \[xy] escape spellings
// from the two characters it is given -- confirm against the full
// source.
static void init_special_chars()
305
// Pair each of the five marks ' : ^ ` ~ with each of a e i o u y.  The
// upper-case form keeps the mark and upper-cases the letter.
for (p = "':^`~"; *p; p++)
306
for (const char *q = "aeiouy"; *q; q++) {
307
// Use a variable to work around bug in gcc 2.0
308
char c = cmupper(*q);
309
init_two_char_letter(*p, *q, *p, c);
311
// The string is consumed two characters at a time, giving the names
// "/l", "/o", "~n", ",c", "oe", "ae" and "ij".  Both characters are run
// through cmupper to form the upper-case name.
for (p = "/l/o~n,coeaeij"; *p; p += 2) {
312
// Use variables to work around bug in gcc 2.0
313
char c0 = cmupper(p[0]);
314
char c1 = cmupper(p[1]);
315
init_two_char_letter(p[0], p[1], c0, c1);
317
// Names whose upper-case form changes only the second character; the
// first three pass a sort key equal to their lower-case second character.
init_two_char_letter('v', 's', 'v', 'S', "s");
318
init_two_char_letter('v', 'z', 'v', 'Z', "z");
319
init_two_char_letter('o', 'a', 'o', 'A', "a");
320
init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
321
init_two_char_letter('-', 'd', '-', 'D');
323
// "ss" is registered as lower case only, in both escape spellings, with
// the plain string "SS" as its other-case form and no sort key.
store_token("\\(ss", TOKEN_LOWER, 0, "SS");
324
store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
326
// "Sd" is registered as lower case only; its other-case form is the
// "-D" name in the matching escape spelling.
store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
327
store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
328
// Hyphen escape, both spellings.
store_token("\\(hy", TOKEN_HYPHEN);
329
store_token("\\[hy]", TOKEN_HYPHEN);
330
// En-dash escape, both spellings, classified as a range separator.
store_token("\\(en", TOKEN_RANGE_SEP);
331
store_token("\\[en]", TOKEN_RANGE_SEP);
334
// Registers tokens for troff string escapes (\*x, \*(xx, \*[xx]):
// accent strings and the -ms special letters.
//
// NOTE(review): the lines that declare and fill `buf` inside the loop
// are not visible in this excerpt; presumably they build one spelling
// of the accent string before each store_token call -- confirm against
// the full source.
static void init_strings()
339
// One iteration per character of the list ("^" appears twice in it);
// each iteration makes two TOKEN_ACCENT registrations from `buf`.
for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
342
store_token(strsave(buf), TOKEN_ACCENT);
347
store_token(strsave(buf), TOKEN_ACCENT);
350
// -ms special letters
// Each pair is registered in both the \*(xx and \*[xx] spellings.
351
store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
352
store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
353
store_letter("\\*(d-", "\\*(D-");
354
store_letter("\\*[d-]", "\\*[D-]");
355
store_letter("\\*(ae", "\\*(Ae", "ae");
356
store_letter("\\*[ae]", "\\*[Ae]", "ae");
357
store_letter("\\*(oe", "\\*(Oe", "oe");
358
store_letter("\\*[oe]", "\\*[Oe]", "oe");
360
// Single-character strings, registered as lower case only with plain
// ASCII other-case forms.  Presumably these are the -ms yogh (\*3),
// sharp s (\*8) and hooked o (\*q) -- see groff_ms(7) to confirm.
store_token("\\*3", TOKEN_LOWER, "y", "Y");
361
store_token("\\*8", TOKEN_LOWER, "ss", "SS");
362
store_token("\\*q", TOKEN_LOWER, "o", "O");
365
struct token_initer {
369
static token_initer the_token_initer;
371
token_initer::token_initer()
375
init_special_chars();
377
default_token_info.set(TOKEN_OTHER);