2
/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
3
Written by James Clark (jjc@jclark.com)
5
This file is part of groff.
7
groff is free software; you can redistribute it and/or modify it under
8
the terms of the GNU General Public License as published by the Free
9
Software Foundation; either version 2, or (at your option) any later
12
groff is distributed in the hope that it will be useful, but WITHOUT ANY
13
WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17
You should have received a copy of the GNU General Public License along
18
with groff; see the file COPYING. If not, write to the Free Software
19
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
26
static const char *find_day(const char *, const char *, const char **);
27
static int find_month(const char *start, const char *end);
28
static void abbreviate_names(string &);
30
#define DEFAULT_ARTICLES "the\000a\000an"
32
string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));
34
// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
35
const char FIELD_SEPARATOR = '\0';
37
// Letters of the fields that may hold several values in one string.
// get_author() and get_author_last_name() use nth_field() to pick a single
// value out of such a field.
const char MULTI_FIELD_NAMES[] = "AE";
38
// Letters of the fields that are searched, in this order, when looking for
// the authors of a reference (see sortify_authors() and get_author()).
const char *AUTHOR_FIELDS = "AQ";
40
enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };
42
const char *reference_types[] = {
51
static string temp_fields[256];
53
reference::reference(const char *start, int len, reference_id *ridp)
54
: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
55
computed_authors(0), last_needed_author(-1), nauthors(-1)
58
for (i = 0; i < 256; i++)
59
field_index[i] = NULL_FIELD_INDEX;
66
const char *end = start + len;
67
const char *ptr = start;
70
if (ptr + 1 < end && ptr[1] != '\0'
71
&& ((ptr[1] != '%' && ptr[1] == annotation_field)
72
|| (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
73
&& discard_fields.search(ptr[2]) < 0))) {
76
string &f = temp_fields[(unsigned char)ptr[1]];
78
while (ptr < end && csspace(*ptr))
90
if (ptr >= end || *ptr == '%')
94
else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
95
&& discard_fields.search(ptr[1]) < 0) {
96
string &f = temp_fields[(unsigned char)ptr[1]];
98
if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
109
while (ptr < end && *ptr != '\n')
111
// strip trailing white space
113
while (q > p && q[-1] != '\n' && csspace(q[-1]))
131
while (ptr < end && *ptr++ != '\n')
133
if (ptr >= end || *ptr == '%')
138
for (i = 0; i < 256; i++)
139
if (temp_fields[i].length() > 0)
141
field = new string[nfields];
143
for (i = 0; i < 256; i++)
144
if (temp_fields[i].length() > 0) {
145
field[j].move(temp_fields[i]);
146
if (abbreviate_fields.search(i) >= 0)
147
abbreviate_names(field[j]);
153
reference::~reference()
156
ad_delete(nfields) field;
159
// ref is the inline reference; *this is the database reference
161
void reference::merge(reference &ref)
164
for (i = 0; i < 256; i++)
165
if (field_index[i] != NULL_FIELD_INDEX)
166
temp_fields[i].move(field[field_index[i]]);
167
for (i = 0; i < 256; i++)
168
if (ref.field_index[i] != NULL_FIELD_INDEX)
169
temp_fields[i].move(ref.field[ref.field_index[i]]);
170
for (i = 0; i < 256; i++)
171
field_index[i] = NULL_FIELD_INDEX;
172
int old_nfields = nfields;
174
for (i = 0; i < 256; i++)
175
if (temp_fields[i].length() > 0)
177
if (nfields != old_nfields) {
179
ad_delete(old_nfields) field;
180
field = new string[nfields];
183
for (i = 0; i < 256; i++)
184
if (temp_fields[i].length() > 0) {
185
field[j].move(temp_fields[i]);
192
void reference::insert_field(unsigned char c, string &s)
194
assert(s.length() > 0);
195
if (field_index[c] != NULL_FIELD_INDEX) {
196
field[field_index[c]].move(s);
199
assert(field_index[c] == NULL_FIELD_INDEX);
200
string *old_field = field;
201
field = new string[nfields + 1];
204
for (i = 0; i < int(c); i++)
205
if (field_index[i] != NULL_FIELD_INDEX)
207
for (i = 0; i < pos; i++)
208
field[i].move(old_field[i]);
210
for (i = pos; i < nfields; i++)
211
field[i + 1].move(old_field[i]);
213
ad_delete(nfields) old_field;
215
field_index[c] = pos;
216
for (i = c + 1; i < 256; i++)
217
if (field_index[i] != NULL_FIELD_INDEX)
221
void reference::delete_field(unsigned char c)
223
if (field_index[c] == NULL_FIELD_INDEX)
225
string *old_field = field;
226
field = new string[nfields - 1];
228
for (i = 0; i < int(field_index[c]); i++)
229
field[i].move(old_field[i]);
230
for (i = field_index[c]; i < nfields - 1; i++)
231
field[i].move(old_field[i + 1]);
233
ad_delete(nfields) old_field;
235
field_index[c] = NULL_FIELD_INDEX;
236
for (i = c + 1; i < 256; i++)
237
if (field_index[i] != NULL_FIELD_INDEX)
241
void reference::compute_hash_code()
247
for (int i = 0; i < nfields; i++)
248
if (field[i].length() > 0) {
250
h ^= hash_string(field[i].contents(), field[i].length());
255
void reference::set_number(int n)
260
// Separator bytes used inside a reference's sort key.  get_sort_field()
// searches for them with memchr() to locate a part of the key.

// Appended to sort_key by compute_sort_key() between successive sort fields.
const char SORT_SEP = '\001';
261
// Appended by sortify_field() while it walks the values of one field, and by
// canonicalize_authors() after a non-empty author key.
const char SORT_SUB_SEP = '\002';
262
// Appended by sortify_name() between the parts of a single name.
const char SORT_SUB_SUB_SEP = '\003';
264
// sep specifies additional word separators
266
void sortify_words(const char *s, const char *end, const char *sep,
270
int need_separator = 0;
272
const char *token_start = s;
273
if (!get_token(&s, end))
275
if ((s - token_start == 1
276
&& (*token_start == ' '
277
|| *token_start == '\n'
278
|| (sep && *token_start != '\0'
279
&& strchr(sep, *token_start) != 0)))
280
|| (s - token_start == 2
281
&& token_start[0] == '\\' && token_start[1] == ' ')) {
286
const token_info *ti = lookup_token(token_start, s);
287
if (ti->sortify_non_empty(token_start, s)) {
288
if (need_separator) {
292
ti->sortify(token_start, s, result);
299
void sortify_word(const char *s, const char *end, string &result)
302
const char *token_start = s;
303
if (!get_token(&s, end))
305
const token_info *ti = lookup_token(token_start, s);
306
ti->sortify(token_start, s, result);
310
void sortify_other(const char *s, int len, string &key)
312
sortify_words(s, s + len, 0, key);
315
void sortify_title(const char *s, int len, string &key)
317
const char *end = s + len;
318
for (; s < end && (*s == ' ' || *s == '\n'); s++)
322
const char *token_start = ptr;
323
if (!get_token(&ptr, end))
325
if (ptr - token_start == 1
326
&& (*token_start == ' ' || *token_start == '\n'))
330
unsigned int first_word_len = ptr - s - 1;
331
const char *ae = articles.contents() + articles.length();
332
for (const char *a = articles.contents();
334
a = strchr(a, '\0') + 1)
335
if (first_word_len == strlen(a)) {
337
for (j = 0; j < first_word_len; j++)
338
if (a[j] != cmlower(s[j]))
340
if (j >= first_word_len) {
342
for (; s < end && (*s == ' ' || *s == '\n'); s++)
348
sortify_words(s, end, 0, key);
351
// Append to key the sort form of the name in [s, s + len).
// The key is built from three parts separated by SORT_SUB_SUB_SEP:
// the last name, the text before the last name, and the text after it.
void sortify_name(const char *s, int len, string &key)
353
const char *last_name_end;
354
// find_last_name() returns the start of the last name and stores its end.
const char *last_name = find_last_name(s, s + len, &last_name_end);
355
sortify_word(last_name, last_name_end, key);
356
key += SORT_SUB_SUB_SEP;
358
// Text preceding the last name; "." is passed as an extra word separator.
sortify_words(s, last_name, ".", key);
359
key += SORT_SUB_SUB_SEP;
360
// Text following the last name, if any; "." and "," are passed as extra
// word separators.
if (last_name_end < s + len)
361
sortify_words(last_name_end, s + len, ".,", key);
364
void sortify_date(const char *s, int len, string &key)
366
const char *year_end;
367
const char *year_start = find_year(s, s + len, &year_end);
369
// Things without years are often `forthcoming', so it makes sense
370
// that they sort after things with explicit years.
372
sortify_words(s, s + len, 0, key);
375
int n = year_end - year_start;
380
while (year_start < year_end)
381
key += *year_start++;
382
int m = find_month(s, s + len);
387
const char *day_start = find_day(s, s + len, &day_end);
390
if (day_end - day_start == 1)
392
while (day_start < day_end)
396
// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
398
void sortify_label(const char *s, int len, string &key)
400
const char *end = s + len;
404
ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
408
sortify_words(s, ptr, 0, key);
416
void reference::compute_sort_key()
418
if (sort_fields.length() == 0)
421
const char *sf = sort_fields.contents();
422
while (*sf != '\0') {
423
// Emit a separator before every sort field except the first.  sf walks the
// buffer returned by sort_fields.contents(), so compare against that pointer
// explicitly rather than against the string object itself.
if (sf > sort_fields.contents())
424
sort_key += SORT_SEP;
431
else if (csdigit(*sf)) {
433
long l = strtol(sf, &ptr, 10);
434
if (l == 0 && ptr == sf)
447
sortify_label(label.contents(), label.length(), sort_key);
448
else if (f == AUTHOR_FIELDS[0])
449
sortify_authors(n, sort_key);
451
sortify_field(f, n, sort_key);
453
sort_fields.set_length(sort_fields.length() - 1);
456
void reference::sortify_authors(int n, string &result) const
458
for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
459
if (contains_field(*p)) {
460
sortify_field(*p, n, result);
463
sortify_field(AUTHOR_FIELDS[0], n, result);
466
// Append the sort form of the authors to result, passing INT_MAX as the
// author limit to sortify_authors().  If that appended anything, terminate
// it with SORT_SUB_SEP.
void reference::canonicalize_authors(string &result) const
468
// Remember the length on entry so we can tell whether anything was added.
int len = result.length();
469
sortify_authors(INT_MAX, result);
470
if (result.length() > len)
471
result += SORT_SUB_SEP;
474
void reference::sortify_field(unsigned char f, int n, string &result) const
476
typedef void (*sortify_t)(const char *, int, string &);
477
sortify_t sortifier = sortify_other;
481
sortifier = sortify_name;
484
sortifier = sortify_date;
489
sortifier = sortify_title;
492
int fi = field_index[(unsigned char)f];
493
if (fi != NULL_FIELD_INDEX) {
494
string &str = field[fi];
495
const char *start = str.contents();
496
const char *end = start + str.length();
497
for (int i = 0; i < n && start < end; i++) {
498
const char *p = start;
499
while (start < end && *start != FIELD_SEPARATOR)
502
result += SORT_SUB_SEP;
503
(*sortifier)(p, start - p, result);
510
int compare_reference(const reference &r1, const reference &r2)
514
const char *s1 = r1.sort_key.contents();
515
int n1 = r1.sort_key.length();
516
const char *s2 = r2.sort_key.contents();
517
int n2 = r2.sort_key.length();
518
for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
520
return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
525
return r1.no - r2.no;
528
int same_reference(const reference &r1, const reference &r2)
530
if (!r1.rid.is_null() && r1.rid == r2.rid)
534
if (r1.nfields != r2.nfields)
537
for (i = 0; i < 256; i++)
538
// Compare the field index tables entry by entry.  Comparing
// r1.field_index with r2.field_index directly only compares two addresses
// and ignores the loop variable.
if (r1.field_index[i] != r2.field_index[i])
540
for (i = 0; i < r1.nfields; i++)
541
if (r1.field[i] != r2.field[i])
546
const char *find_last_name(const char *start, const char *end,
549
const char *ptr = start;
550
const char *last_word = start;
552
const char *token_start = ptr;
553
if (!get_token(&ptr, end))
555
if (ptr - token_start == 1) {
556
if (*token_start == ',') {
560
else if (*token_start == ' ' || *token_start == '\n') {
561
if (ptr < end && *ptr != ' ' && *ptr != '\n')
570
void abbreviate_name(const char *ptr, const char *end, string &result)
572
const char *last_name_end;
573
const char *last_name_start = find_last_name(ptr, end, &last_name_end);
576
const char *token_start = ptr;
577
if (!get_token(&ptr, last_name_start))
579
const token_info *ti = lookup_token(token_start, ptr);
581
if ((ptr - token_start == 1 && *token_start == ' ')
582
|| (ptr - token_start == 2 && token_start[0] == '\\'
583
&& token_start[1] == ' '))
586
result += period_before_initial;
588
result += period_before_other;
591
result.append(token_start, ptr - token_start);
592
if (ti->is_upper()) {
593
const char *lower_ptr = ptr;
597
if (!get_token(&ptr, last_name_start))
599
if ((ptr - token_start == 1 && *token_start == ' ')
600
|| (ptr - token_start == 2 && token_start[0] == '\\'
601
&& token_start[1] == ' '))
603
ti = lookup_token(token_start, ptr);
604
if (ti->is_hyphen()) {
605
const char *ptr1 = ptr;
606
if (get_token(&ptr1, last_name_start)) {
607
ti = lookup_token(ptr, ptr1);
608
if (ti->is_upper()) {
609
result += period_before_hyphen;
610
result.append(token_start, ptr1 - token_start);
615
else if (ti->is_upper()) {
616
// MacDougal -> MacD.
617
result.append(lower_ptr, ptr - lower_ptr);
621
else if (first_token && ti->is_accent()) {
622
result.append(token_start, ptr - token_start);
631
result += period_before_last_name;
632
result.append(last_name_start, end - last_name_start);
635
static void abbreviate_names(string &result)
639
const char *ptr = str.contents();
640
const char *end = ptr + str.length();
642
const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
645
abbreviate_name(ptr, name_end, result);
649
result += FIELD_SEPARATOR;
653
void reverse_name(const char *ptr, const char *name_end, string &result)
655
const char *last_name_end;
656
const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
657
result.append(last_name_start, last_name_end - last_name_start);
658
while (last_name_start > ptr
659
&& (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
661
if (last_name_start > ptr) {
663
result.append(ptr, last_name_start - ptr);
665
if (last_name_end < name_end)
666
result.append(last_name_end, name_end - last_name_end);
669
void reverse_names(string &result, int n)
675
const char *ptr = str.contents();
676
const char *end = ptr + str.length();
679
result.append(ptr, end - ptr);
682
const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
685
reverse_name(ptr, name_end, result);
689
result += FIELD_SEPARATOR;
693
// Return number of field separators.
695
int join_fields(string &f)
697
const char *ptr = f.contents();
698
int len = f.length();
701
for (j = 0; j < len; j++)
702
if (ptr[j] == FIELD_SEPARATOR)
704
if (nfield_seps == 0)
707
int field_seps_left = nfield_seps;
708
for (j = 0; j < len; j++) {
709
if (ptr[j] == FIELD_SEPARATOR) {
710
if (nfield_seps == 1)
711
temp += join_authors_exactly_two;
712
else if (--field_seps_left == 0)
713
temp += join_authors_last_two;
715
temp += join_authors_default;
724
void uppercase(const char *start, const char *end, string &result)
727
const char *token_start = start;
728
if (!get_token(&start, end))
730
const token_info *ti = lookup_token(token_start, start);
731
ti->upper_case(token_start, start, result);
735
void lowercase(const char *start, const char *end, string &result)
738
const char *token_start = start;
739
if (!get_token(&start, end))
741
const token_info *ti = lookup_token(token_start, start);
742
ti->lower_case(token_start, start, result);
746
void capitalize(const char *ptr, const char *end, string &result)
748
int in_small_point_size = 0;
750
const char *start = ptr;
751
if (!get_token(&ptr, end))
753
const token_info *ti = lookup_token(start, ptr);
754
const char *char_end = ptr;
755
int is_lower = ti->is_lower();
756
if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
757
const token_info *ti2 = lookup_token(char_end, ptr);
758
if (!ti2->is_accent())
762
if (!in_small_point_size) {
764
in_small_point_size = 1;
766
ti->upper_case(start, char_end, result);
767
result.append(char_end, ptr - char_end);
770
if (in_small_point_size) {
772
in_small_point_size = 0;
774
result.append(start, ptr - start);
777
if (in_small_point_size)
781
void capitalize_field(string &str)
784
capitalize(str.contents(), str.contents() + str.length(), temp);
788
int is_terminated(const char *ptr, const char *end)
790
const char *last_token = end;
793
if (!get_token(&ptr, end))
797
return end - last_token == 1
798
&& (*last_token == '.' || *last_token == '!' || *last_token == '?');
801
void reference::output(FILE *fp)
804
for (int i = 0; i < 256; i++)
805
if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
806
string &f = field[field_index[i]];
808
int j = reverse_fields.search(i);
811
int len = reverse_fields.length();
812
if (++j < len && csdigit(reverse_fields[j])) {
813
n = reverse_fields[j] - '0';
814
for (++j; j < len && csdigit(reverse_fields[j]); j++)
815
// should check for overflow
816
n = n*10 + reverse_fields[j] - '0';
823
int is_multiple = join_fields(f) > 0;
824
if (capitalize_fields.search(i) >= 0)
826
if (memchr(f.contents(), '\n', f.length()) == 0) {
827
fprintf(fp, ".ds [%c ", i);
828
if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
834
fprintf(fp, ".de [%c\n", i);
839
int multiple_pages = 0;
840
const char *s = f.contents();
841
const char *end = f.contents() + f.length();
843
const char *token_start = s;
844
if (!get_token(&s, end))
846
const token_info *ti = lookup_token(token_start, s);
847
if (ti->is_hyphen() || ti->is_range_sep()) {
852
fprintf(fp, ".nr [P %d\n", multiple_pages);
855
fprintf(fp, ".nr [E %d\n", is_multiple);
857
for (const char *p = "TAO"; *p; p++) {
858
int fi = field_index[(unsigned char)*p];
859
if (fi != NULL_FIELD_INDEX) {
860
string &f = field[fi];
861
fprintf(fp, ".nr [%c %d\n", *p,
862
is_terminated(f.contents(), f.contents() + f.length()));
866
fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
867
if (annotation_macro.length() > 0 && annotation_field >= 0
868
&& field_index[annotation_field] != NULL_FIELD_INDEX) {
870
put_string(annotation_macro, fp);
872
put_string(field[field_index[annotation_field]], fp);
876
void reference::print_sort_key_comment(FILE *fp)
879
put_string(sort_key, fp);
883
const char *find_year(const char *start, const char *end, const char **endp)
886
while (start < end && !csdigit(*start))
888
const char *ptr = start;
891
while (ptr < end && csdigit(*ptr))
893
if (ptr - start == 4 || ptr - start == 3
895
&& (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
904
static const char *find_day(const char *start, const char *end,
908
while (start < end && !csdigit(*start))
910
const char *ptr = start;
913
while (ptr < end && csdigit(*ptr))
915
if ((ptr - start == 1 && start[0] != '0')
916
|| (ptr - start == 2 &&
919
|| (start[0] == '3' && start[1] <= '1')
920
|| (start[0] == '0' && start[1] != '0')))) {
929
static int find_month(const char *start, const char *end)
931
static const char *months[] = {
946
while (start < end && !csalpha(*start))
948
const char *ptr = start;
951
while (ptr < end && csalpha(*ptr))
953
if (ptr - start >= 3) {
954
for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
955
const char *q = months[i];
956
const char *p = start;
957
for (; p < ptr; p++, q++)
958
if (cmlower(*p) != *q)
969
// Return non-zero if this reference has an entry for field letter c, i.e.
// its slot in field_index is not NULL_FIELD_INDEX.
int reference::contains_field(char c) const
971
// Cast so that a negative char cannot produce a negative array index.
return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
974
int reference::classify()
976
if (contains_field('J'))
977
return JOURNAL_ARTICLE;
978
if (contains_field('B'))
979
return ARTICLE_IN_BOOK;
980
if (contains_field('G'))
982
if (contains_field('R'))
984
if (contains_field('I'))
986
if (contains_field('M'))
991
const char *reference::get_year(const char **endp) const
993
if (field_index['D'] != NULL_FIELD_INDEX) {
994
string &date = field[field_index['D']];
995
const char *start = date.contents();
996
const char *end = start + date.length();
997
return find_year(start, end, endp);
1003
const char *reference::get_field(unsigned char c, const char **endp) const
1005
if (field_index[c] != NULL_FIELD_INDEX) {
1006
string &f = field[field_index[c]];
1007
const char *start = f.contents();
1008
*endp = start + f.length();
1015
// Return the 'D' field of this reference by delegating to get_field();
// endp is passed through unchanged.
// NOTE(review): the result when the field is absent is decided by
// get_field() and is not visible here -- confirm before relying on it.
const char *reference::get_date(const char **endp) const
1017
return get_field('D', endp);
1020
const char *nth_field(int i, const char *start, const char **endp)
1023
start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1028
const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1034
const char *reference::get_author(int i, const char **endp) const
1036
for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037
const char *start = get_field(*f, endp);
1039
if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040
return nth_field(i, start, endp);
1050
const char *reference::get_author_last_name(int i, const char **endp) const
1052
for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053
const char *start = get_field(*f, endp);
1055
if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056
start = nth_field(i, start, endp);
1061
return find_last_name(start, *endp, endp);
1069
void reference::set_date(string &d)
1071
if (d.length() == 0)
1074
insert_field('D', d);
1077
int same_year(const reference &r1, const reference &r2)
1080
const char *ys1 = r1.get_year(&ye1);
1082
const char *ys2 = r2.get_year(&ye2);
1085
return same_date(r1, r2);
1091
else if (ye1 - ys1 != ye2 - ys2)
1094
return memcmp(ys1, ys2, ye1 - ys1) == 0;
1097
int same_date(const reference &r1, const reference &r2)
1100
const char *s1 = r1.get_date(&e1);
1102
const char *s2 = r2.get_date(&e2);
1107
else if (e1 - s1 != e2 - s2)
1110
return memcmp(s1, s2, e1 - s1) == 0;
1113
const char *reference::get_sort_field(int i, int si, int ssi,
1114
const char **endp) const
1116
const char *start = sort_key.contents();
1117
const char *end = start + sort_key.length();
1123
start = (char *)memchr(start, SORT_SEP, end - start);
1128
const char *e = (char *)memchr(start, SORT_SEP, end - start);
1136
start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1141
e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1148
while (--ssi >= 0) {
1149
start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1154
e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);