2
/* Copyright (C) 1989-1992, 2000, 2001 Free Software Foundation, Inc.
3
Written by James Clark (jjc@jclark.com)
5
This file is part of groff.
7
groff is free software; you can redistribute it and/or modify it under
8
the terms of the GNU General Public License as published by the Free
9
Software Foundation; either version 2, or (at your option) any later
12
groff is distributed in the hope that it will be useful, but WITHOUT ANY
13
WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17
You should have received a copy of the GNU General Public License along
18
with groff; see the file COPYING. If not, write to the Free Software
19
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
31
#include "stringclass.h"
40
// Version text printed by main() in the "GNU indxbib (groff) version" banner.
extern "C" const char *Version_string;
42
// mkstemp() is declared here only when HAVE_MKSTEMP_PROTO is not defined.
#ifndef HAVE_MKSTEMP_PROTO
44
extern int mkstemp(char *);
48
// Initial value of hash_table_size; main() can replace it via the -h argument.
#define DEFAULT_HASH_TABLE_SIZE 997
49
// File-name template from which main() builds temp_index_file.
#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
51
// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
53
#define MALLOC_OVERHEAD 16
59
// Capacity, in ints, of one block (store_key() starts a new block once
// `used' reaches this value).  The formula subtracts the malloc overhead,
// one block pointer and one int from 1024 bytes before dividing by
// sizeof(int).
const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
60
- sizeof(int)) / sizeof(int));
66
// Constructor of block: links the new, empty block in front of p.
block(block *p = 0) : next(p), used(0) { }
80
// Constructor declaration of word_list; defined further down in this file.
word_list(const char *, int, word_list *);
83
// The key hash table; allocated by init_hash_table() with hash_table_size
// entries.
table_entry *hash_table;
84
int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
85
// We make this the same size as hash_table so we only have to do one
// modulo reduction per key: store_key() uses the same index for both tables.
87
static word_list **common_words_table = 0;
93
// Name of the temporary index file; unlinked by fatal_error_exit().
char *temp_index_file = 0;
95
// Field letters tested with strchr() in do_file(); main() may replace the
// default from an option argument.
const char *ignore_fields = "XYZ";
96
// File read by read_common_words_file().
const char *common_words_file = COMMON_WORDS_FILE;
97
// Upper bound on the number of words read_common_words_file() counts;
// that function overwrites it with the count actually reached.
int n_ignore_words = 100;
100
// Upper bound on keys accepted per reference (see possibly_store_key() and
// do_whole_file()).
int max_keys_per_item = 100;
102
static void usage(FILE *stream);
103
static void write_hash_table();
104
static void init_hash_table();
105
static void read_common_words_file();
106
static int store_key(char *s, int len);
107
static void possibly_store_key(char *s, int len);
108
static int do_whole_file(const char *filename);
109
static int do_file(const char *filename);
110
static void store_reference(int filename_index, int pos, int len);
111
static void check_integer_arg(char opt, const char *arg, int min, int *res);
112
static void store_filename(const char *);
113
static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
114
static char *get_cwd();
118
// Not static: NOTE(review): presumably defined in another translation
// unit -- confirm.
void catch_fatal_signals();
119
void ignore_fatal_signals();
122
// Program entry point.  Parses the command line with getopt_long(), runs
// the selected parser (do_file unless `parser' is switched to do_whole_file
// during option processing) over every pathname listed in the -f file and
// every file operand, and finally moves the temporary index file to
// basename + INDEX_SUFFIX.
int main(int argc, char **argv)
124
program_name = argv[0];
125
// Give stderr a static BUFSIZ-byte buffer.
static char stderr_buf[BUFSIZ];
126
setbuf(stderr, stderr_buf);
128
const char *basename = 0;
129
// Both parsers take a file name and return an int that is tested with `!'
// below.
typedef int (*parser_t)(const char *);
130
parser_t parser = do_file;
131
const char *directory = 0;
132
const char *foption = 0;
134
static const struct option long_options[] = {
135
{ "help", no_argument, 0, CHAR_MAX + 1 },
136
{ "version", no_argument, 0, 'v' },
139
// NOTE(review): "c:" appears twice in the short-option string below.
while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
144
common_words_file = optarg;
153
check_integer_arg('h', optarg, 1, &hash_table_size);
154
// A non-prime table size is bumped to the next prime, with a warning that
// shows both the argument as typed and the size actually used.
if (!is_prime(hash_table_size)) {
155
while (!is_prime(++hash_table_size))
157
warning("%1 not prime: using %2 instead", optarg, hash_table_size);
161
ignore_fields = optarg;
164
check_integer_arg('k', optarg, 1, &max_keys_per_item);
167
check_integer_arg('l', optarg, 0, &shortest_len);
170
check_integer_arg('n', optarg, 0, &n_ignore_words);
176
check_integer_arg('t', optarg, 1, &truncate_len);
179
parser = do_whole_file;
182
printf("GNU indxbib (groff) version %s\n", Version_string);
185
case CHAR_MAX + 1: // --help
197
// There must be something to index: a file operand or a -f list.
if (optind >= argc && foption == 0)
198
fatal("no files and no -f option");
200
char *path = get_cwd();
201
store_filename(path);
205
store_filename(directory);
207
store_filename(common_words_file);
208
store_filename(ignore_fields);
209
// One buffer of truncate_len bytes is shared by all the key-reading
// routines in this file.
key_buffer = new char[truncate_len];
210
read_common_words_file();
212
// The first file operand, if there is one, else DEFAULT_INDEX_NAME.
basename = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
213
// p is the last occurrence in basename of DIR_SEPS[0]; p1 is the last
// occurrence of a further separator character taken from DIR_SEPS.
const char *p = strrchr(basename, DIR_SEPS[0]), *p1;
214
const char *sep = &DIR_SEPS[1];
216
p1 = strrchr(basename, *sep);
217
if (p1 && (!p || p1 > p))
223
// Ask file_name_max() about the directory part of basename: a copy of
// basename cut off at p.
char *dir = strsave(basename);
224
dir[p - basename] = '\0';
225
name_max = file_name_max(dir);
229
name_max = file_name_max(".");
230
// basename without its directory part.
const char *filename = p ? p + 1 : basename;
232
long(strlen(filename) + sizeof(INDEX_SUFFIX) - 1) > name_max)
233
fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
236
// Temporary name = the first (p - basename) bytes of basename followed by
// TEMP_INDEX_TEMPLATE; sizeof() accounts for the terminating NUL.
temp_index_file = new char[p - basename + sizeof(TEMP_INDEX_TEMPLATE)];
237
memcpy(temp_index_file, basename, p - basename);
238
strcpy(temp_index_file + (p - basename), TEMP_INDEX_TEMPLATE);
241
temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
244
if (!mktemp(temp_index_file) || !temp_index_file[0])
245
fatal("cannot create file name for temporary file");
247
catch_fatal_signals();
249
int fd = mkstemp(temp_index_file);
251
int fd = creat(temp_index_file, S_IRUSR|S_IRGRP|S_IROTH);
254
fatal("can't create temporary index file: %1", strerror(errno));
255
indxfp = fdopen(fd, FOPEN_WB);
257
fatal("fdopen failed");
258
// Skip over the space of one index_header at the start of the file;
// write_hash_table() later seeks back to offset 0 and writes a header
// there.
if (fseek(indxfp, sizeof(index_header), 0) < 0)
259
fatal("can't seek past index header: %1", strerror(errno));
263
// A -f argument other than "-" names a file to open.
if (strcmp(foption, "-") != 0) {
265
fp = fopen(foption, "r");
267
fatal("can't open `%1': %2", foption, strerror(errno));
273
// Consume one line (up to newline or EOF) of the list.
for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
275
error_with_file_and_line(foption, lineno,
276
"nul character in pathname ignored");
280
// Empty pathnames are skipped; every other one is handed to the parser.
if (path.length() > 0) {
282
if (!(*parser)(path.contents()))
293
// Now the file operands from the command line.
for (int i = optind; i < argc; i++)
294
if (!(*parser)(argv[i]))
297
if (fclose(indxfp) < 0)
298
fatal("error closing temporary index file: %1", strerror(errno));
299
// Final name: basename with INDEX_SUFFIX appended (sizeof() includes the
// terminating NUL).
char *index_file = new char[strlen(basename) + sizeof(INDEX_SUFFIX)];
300
strcpy(index_file, basename);
301
strcat(index_file, INDEX_SUFFIX);
303
if (rename(temp_index_file, index_file) < 0) {
305
// RENAME could fail on plain MSDOS filesystems because
306
// INDEX_FILE is an invalid filename, e.g. it has multiple dots.
307
char *fname = p ? index_file + (p - basename) : 0;
310
// Replace the dot with an underscore and try again.
312
&& (dot = strchr(fname, '.')) != 0
313
&& strcmp(dot, INDEX_SUFFIX) != 0)
315
if (rename(temp_index_file, index_file) < 0)
317
fatal("can't rename temporary index file: %1", strerror(errno));
319
#else /* not HAVE_RENAME */
320
// Without rename(): unlink any existing index, link the temporary file to
// the final name, then unlink the temporary name.
ignore_fatal_signals();
321
if (unlink(index_file) < 0) {
323
fatal("can't unlink `%1': %2", index_file, strerror(errno));
325
if (link(temp_index_file, index_file) < 0)
326
fatal("can't link temporary index file: %1", strerror(errno));
327
if (unlink(temp_index_file) < 0)
328
fatal("can't unlink temporary index file: %1", strerror(errno));
329
#endif /* not HAVE_RENAME */
334
// Emit the command-line synopsis shown below.
// NOTE(review): presumably the text is written to `stream' with the
// program name substituted for %s -- confirm against the full definition.
static void usage(FILE *stream)
337
"usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
338
" [-l n] [-n n] [-o base] [-t n] [files...]\n",
342
// Validate arg, the argument given to option -opt, as a base-10 integer
// using strtol().  Problems are reported through error(): no digits at
// all, a value below min, a value above the maximum int, or trailing
// characters after the number.
// NOTE(review): presumably *res receives the value when every check
// passes -- confirm against the full definition.
static void check_integer_arg(char opt, const char *arg, int min, int *res)
345
long n = strtol(arg, &ptr, 10);
346
// strtol() consumed nothing: arg does not start with a number.
if (n == 0 && ptr == arg)
347
error("argument to -%1 not an integer", opt);
349
error("argument to -%1 must not be less than %2", opt, min);
352
error("argument to -%1 greater than maximum integer", opt);
353
// Something other than the terminating NUL follows the digits.
else if (*ptr != '\0')
354
error("junk after integer argument to -%1", opt);
359
// Obtain the current working directory with getcwd() in a buffer
// allocated with new[].  Calls fatal() when getcwd() fails with an error
// that is reported via strerror(errno), or when the path would need a
// buffer larger than INT_MAX.
// NOTE(review): presumably the buffer size is doubled and the call
// retried when the buffer is too small -- confirm against the full
// definition.
static char *get_cwd()
365
buf = new char[size];
366
if (getcwd(buf, size))
369
fatal("cannot get current working directory: %1", strerror(errno));
372
fatal("current working directory longer than INT_MAX");
373
// Guard for the size computation: beyond INT_MAX/2 the size cannot be
// doubled without exceeding INT_MAX.
if (size > INT_MAX/2)
381
// Construct a word_list node.  read_common_words_file() calls this with
// the key bytes, the key length and the current head of the hash bucket.
// NOTE(review): presumably s/n are copied into str/len and p becomes
// next -- the initializers are not visible here.
word_list::word_list(const char *s, int n, word_list *p)
388
// Load words from common_words_file into common_words_table, a chained
// hash table with hash_table_size buckets.  A word is a run of csalnum()
// characters; it is lowercased with cmlower() and at most truncate_len
// bytes of it are kept in key_buffer.  Only words of at least
// shortest_len bytes are inserted.  Calls fatal() if the file cannot be
// opened.  On the way out n_ignore_words is set to the count reached.
static void read_common_words_file()
390
// NOTE(review): presumably returns without reading anything in this
// case, leaving common_words_table null -- confirm.
if (n_ignore_words <= 0)
393
FILE *fp = fopen(common_words_file, "r");
395
fatal("can't open `%1': %2", common_words_file, strerror(errno));
396
common_words_table = new word_list * [hash_table_size];
397
for (int i = 0; i < hash_table_size; i++)
398
common_words_table[i] = 0;
403
// Skip everything that cannot be part of a word.
while (c != EOF && !csalnum(c))
408
if (key_len < truncate_len)
409
key_buffer[key_len++] = cmlower(c);
411
} while (c != EOF && csalnum(c));
412
if (key_len >= shortest_len) {
413
// Same hash and modulus that store_key() uses for its lookup.
int h = hash(key_buffer, key_len) % hash_table_size;
414
// Push the new word onto the front of its bucket's list.
common_words_table[h] = new word_list(key_buffer, key_len,
415
common_words_table[h]);
417
if (++count >= n_ignore_words)
423
n_ignore_words = count;
427
// Index `filename' as one single reference: keys collected from the
// file's text are offered to store_key(), and a reference with position 0
// and length 0 is recorded for the file.  Reports through error() when
// the file cannot be opened.
// NOTE(review): the return statements are not visible; main() treats a
// zero result as failure.
static int do_whole_file(const char *filename)
430
FILE *fp = fopen(filename, "r");
432
error("can't open `%1': %2", filename, strerror(errno));
438
while ((c = getc(fp)) != EOF) {
442
while ((c = getc(fp)) != EOF) {
445
// Bytes beyond truncate_len are not stored in the key.
if (key_len < truncate_len)
446
key_buffer[key_len++] = c;
448
// Only keys that store_key() accepts count toward max_keys_per_item.
if (store_key(key_buffer, key_len)) {
449
if (++count >= max_keys_per_item)
456
// The reference's filename index is the length of `filenames' taken
// before this file's name is stored.
store_reference(filenames.length(), 0, 0);
457
store_filename(filename);
462
// Index the references in `filename'.  The file is opened in binary mode
// and scanned with a state machine whose states are listed below.  Each
// reference is recorded with store_reference(filename_index, ref_start,
// length) and each completed key is handed to possibly_store_key().
// Reports through error() when the file cannot be opened.
// NOTE(review): the return statements are not visible; main() treats a
// zero result as failure.
static int do_file(const char *filename)
465
// Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
466
// byte counts to be consistent with fseek.
467
FILE *fp = fopen(filename, FOPEN_RB);
469
error("can't open `%1': %2", filename, strerror(errno));
472
// Length of `filenames' before this file's name is stored; used as the
// filename index of every reference found in this file.
int filename_index = filenames.length();
473
store_filename(filename);
476
START, // at the start of the file; also in between references
477
BOL, // in the middle of a reference, at the beginning of the line
478
PERCENT, // seen a percent at the beginning of the line
479
IGNORE, // ignoring a field
480
IGNORE_BOL, // at the beginning of a line ignoring a field
481
KEY, // in the middle of a key
482
DISCARD, // after truncate_len bytes of a key
483
MIDDLE // in between keys
486
// In states START, BOL, IGNORE_BOL, space_count is how many spaces at
487
// the beginning have been seen. In states PERCENT, IGNORE, KEY,
488
// MIDDLE space_count must be 0.
490
int byte_count = 0; // bytes read
492
int ref_start = -1; // position of start of current reference
497
// We opened the file in binary mode, so we need to skip
498
// every CR character before a Newline.
508
#if defined(__MSDOS__) || defined(_MSC_VER)
509
else if (c == 0x1a) // ^Z means EOF in text files
515
if (c == ' ' || c == '\t') {
523
// byte_count already includes the current byte and the space_count
// blanks before it, so back up over them to find the reference start.
ref_start = byte_count - space_count - 1;
527
else if (csalnum(c)) {
538
if (space_count > 0) {
550
// Length runs from ref_start up to, but not including, the current byte
// and the space_count blanks before it.
store_reference(filename_index, ref_start,
551
byte_count - 1 - space_count - ref_start);
567
// Is this field's letter one of those listed in ignore_fields?
if (strchr(ignore_fields, c) != 0)
581
if (space_count > 0) {
593
store_reference(filename_index, ref_start,
594
byte_count - 1 - space_count - ref_start);
605
// Bytes beyond truncate_len are not stored in the key.
if (key_len < truncate_len)
606
key_buffer[key_len++] = c;
611
possibly_store_key(key_buffer, key_len);
621
possibly_store_key(key_buffer, key_len);
647
possibly_store_key(key_buffer, key_len);
654
// Here no current byte is excluded from the length, unlike the calls
// above.
store_reference(filename_index, ref_start,
655
byte_count - ref_start - space_count);
664
// Write one reference record `t' to indxfp via fwrite_or_die().  The
// record carries filename_index.
// NOTE(review): presumably pos and len are stored in t as well -- those
// assignments are not visible here.
static void store_reference(int filename_index, int pos, int len)
667
t.filename_index = filename_index;
670
fwrite_or_die(&t, sizeof(t), 1, indxfp);
674
// NOTE(review): body not visible here.  Callers read filenames.length()
// just before calling this and use it as the name's index, so presumably
// fn is appended to `filenames' -- confirm.
static void store_filename(const char *fn)
680
// Allocate hash_table with hash_table_size entries and clear the ptr
// member of every entry, i.e. start with all buckets empty.
static void init_hash_table()
682
hash_table = new table_entry[hash_table_size];
683
for (int i = 0; i < hash_table_size; i++)
684
hash_table[i].ptr = 0;
687
// Offer key s (len bytes) to store_key(), but only while fewer than
// max_keys_per_item keys have been counted in key_count.  The function
// keeps state between calls in two function-local statics.
static void possibly_store_key(char *s, int len)
689
static int last_tagno = -1;
690
static int key_count;
691
// ntags differs from the value seen on the previous call.
// NOTE(review): presumably this resets key_count and updates
// last_tagno -- the body of this branch is not visible here.
if (last_tagno != ntags) {
695
if (key_count < max_keys_per_item) {
696
if (store_key(s, len))
701
// Lowercase the len-byte key s in place and record the current tag number
// (ntags) in the hash bucket selected by hash(s, len) % hash_table_size.
// The visible tests single out keys shorter than shortest_len, all-digit
// keys other than four-digit ones starting with "19", and keys present in
// common_words_table.
// NOTE(review): presumably those keys are rejected with a zero return and
// a stored key yields nonzero -- the return statements are not visible.
static int store_key(char *s, int len)
703
if (len < shortest_len)
706
for (int i = 0; i < len; i++)
707
if (!csdigit(s[i])) {
709
s[i] = cmlower(s[i]);
711
// A purely numeric key is special-cased unless it is four digits long
// and begins with "19".
if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
713
int h = hash(s, len) % hash_table_size;
714
// common_words_table stays null if no common words were loaded.
if (common_words_table) {
715
for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
716
if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
719
table_entry *pp = hash_table + h;
722
// ntags is already the most recent entry of this bucket.
else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
724
// The newest block is full: chain a fresh block in front of it.
else if (pp->ptr->used >= BLOCK_SIZE)
725
pp->ptr = new block(pp->ptr);
726
pp->ptr->v[(pp->ptr->used)++] = ntags;
730
// Write the index data to indxfp: the contents of the blocks hanging off
// the hash buckets, with an int -1 written after them, then the count
// member of every table entry, then the bytes of `filenames', and finally
// -- after seeking back to offset 0 -- the header h.  Every write goes
// through fwrite_or_die().
static void write_hash_table()
732
const int minus_one = -1;
734
for (int i = 0; i < hash_table_size; i++) {
735
block *ptr = hash_table[i].ptr;
737
// The entry's count member is overwritten here: either with -1 or with
// li.  NOTE(review): presumably -1 marks an empty bucket and li is where
// the bucket's list starts in the output -- confirm.
hash_table[i].count = -1;
739
hash_table[i].count = li;
748
// Only the `used' leading ints of a block are written.
fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
754
fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
758
// When a table_entry is exactly as large as an int, the whole table can
// be written with a single call.
if (sizeof(table_entry) == sizeof(int))
759
fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
761
// write it out word by word
762
for (int i = 0; i < hash_table_size; i++)
763
fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
765
fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
766
// Go back to the start of the file, where main() left room for the
// header.
if (fseek(indxfp, 0, 0) < 0)
767
fatal("error seeking on index file: %1", strerror(errno));
769
h.magic = INDEX_MAGIC;
770
h.version = INDEX_VERSION;
773
h.table_size = hash_table_size;
774
h.strings_size = filenames.length();
775
h.truncate = truncate_len;
776
h.shortest = shortest_len;
777
h.common = n_ignore_words;
778
fwrite_or_die(&h, sizeof(h), 1, indxfp);
781
// Write nitems objects of `size' bytes each from ptr to fp with fwrite().
// If fwrite() reports any other number of items written, fatal() is
// called with the strerror(errno) text.
static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
783
if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
784
fatal("fwrite failed: %1", strerror(errno));
787
// The visible part of this function removes the temporary index file.
// NOTE(review): presumably this is the exit path taken after a fatal
// error; the rest of the body is not visible here.
void fatal_error_exit()
798
unlink(temp_index_file);