2
Copyright (C) 1989, 1990, 1991, 1992, 2000 Free Software Foundation, Inc.
3
Written by James Clark (jjc@jclark.com)
5
This file is part of groff.
7
groff is free software; you can redistribute it and/or modify it under
8
the terms of the GNU General Public License as published by the Free
9
Software Foundation; either version 2, or (at your option) any later
12
groff is distributed in the hope that it will be useful, but WITHOUT ANY
13
WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17
You should have received a copy of the GNU General Public License along
18
with groff; see the file COPYING. If not, write to the Free Software
19
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
29
void yyerror(const char *);
32
static const char *format_serial(char c, int n);
39
label_info(const string &);
42
label_info *lookup_label(const string &label);
46
// Does the tentative label depend on the reference?
47
CONTAINS_VARIABLE = 01,
52
// Virtual so that nodes owning sub-expressions can delete them through an
// expression pointer.
virtual ~expression() { }
53
// Evaluate this node for a reference.  The implementations in this file
// name the int argument `tentative' and append their output to the string
// argument; the substring_position is where separator_expr records the
// location of its operand's output.
virtual void evaluate(int, const reference &, string &,
54
substring_position &) = 0;
55
// Return a bit mask of CONTAINS_* flags describing the expression; the
// default is no flags.
virtual unsigned analyze() { return 0; }
58
// Expression node whose value is taken from the reference's authors
// (see at_expr::evaluate).
class at_expr : public expression {
61
void evaluate(int, const reference &, string &, substring_position &);
62
// Flags: the value depends on the reference and uses the author list;
// compute_labels tests CONTAINS_AT before calling consider_authors.
unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
65
// Expression node that appends a number for the reference, rendered
// according to `type' (see format_expr::evaluate).
class format_expr : public expression {
70
// c is the format type character; w and f initialize `width' and
// `first_number' and default to 0 and 1.
format_expr(char c, int w = 0, int f = 1)
71
: type(c), width(w), first_number(f) { }
72
void evaluate(int, const reference &, string &, substring_position &);
73
unsigned analyze() { return CONTAINS_FORMAT; }
76
// Expression node that yields a field of the reference.  `name' is the
// field letter passed to reference::get_field and `number' is passed to
// nth_field (see field_expr::evaluate).
class field_expr : public expression {
80
field_expr(char nm, int num) : number(num), name(nm) { }
81
void evaluate(int, const reference &, string &, substring_position &);
82
// A field's value depends on the reference.
unsigned analyze() { return CONTAINS_VARIABLE; }
85
// Expression node holding literal text from the label specification.
class literal_expr : public expression {
88
// Initializes s from the len characters starting at ptr.
literal_expr(const char *ptr, int len) : s(ptr, len) { }
89
void evaluate(int, const reference &, string &, substring_position &);
92
// Base class for expression nodes with a single operand, `expr'.
class unary_expr : public expression {
96
unary_expr(expression *e) : expr(e) { }
97
// The node owns its operand.
~unary_expr() { delete expr; }
98
// Still pure virtual: each concrete subclass supplies its own evaluation.
void evaluate(int, const reference &, string &, substring_position &) = 0;
99
// Forward to the operand; a null operand contributes no flags.
unsigned analyze() { return expr ? expr->analyze() : 0; }
102
// This caches the analysis of an expression.
104
// The flags are computed once by the constructor, so analyze() does not
// have to walk the operand's tree on every call.
class analyzed_expr : public unary_expr {
107
analyzed_expr(expression *);
108
void evaluate(int, const reference &, string &, substring_position &);
109
// Return the flags cached at construction.
unsigned analyze() { return flags; }
112
// Unary node whose operand is evaluated only under a condition on the
// reference's label_info (see star_expr::evaluate).
class star_expr : public unary_expr {
114
star_expr(expression *e) : unary_expr(e) { }
115
void evaluate(int, const reference &, string &, substring_position &);
117
// The operand's flags with CONTAINS_VARIABLE cleared (0 for a null
// operand); the expression continues on a line not shown here.
return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
122
// A map_func is called by map_expr::evaluate with the start and end of the
// operand's text and with the result string as its third argument.
typedef void map_func(const char *, const char *, string &);
124
// Unary node that passes its operand's value through a map_func.
class map_expr : public unary_expr {
127
map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
128
void evaluate(int, const reference &, string &, substring_position &);
131
// An extractor_func is given the start and end of the text to search and
// returns a pointer used by extractor_expr::evaluate as the start of a
// range.  The third argument is an output pointer -- presumably the end of
// that range; confirm against the extractor implementations.
typedef const char *extractor_func(const char *, const char *, const char **);
133
// Unary node that applies an extractor_func to its operand's value and
// appends one portion of that value to the result.
class extractor_expr : public unary_expr {
135
extractor_func *func;
137
// Presumably the values taken by `part', naming the portion to keep
// relative to the extracted range.
enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
138
extractor_expr(expression *e, extractor_func *f, int pt)
139
: unary_expr(e), part(pt), func(f) { }
140
void evaluate(int, const reference &, string &, substring_position &);
143
// Unary node that keeps only part of its operand's value.  n is passed to
// first_part or, negated, to last_part (see truncate_expr::evaluate).
class truncate_expr : public unary_expr {
146
truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
147
void evaluate(int, const reference &, string &, substring_position &);
150
// Unary node that records where its operand's output lies within the
// result string (see separator_expr::evaluate).
class separator_expr : public unary_expr {
152
separator_expr(expression *e) : unary_expr(e) { }
153
void evaluate(int, const reference &, string &, substring_position &);
156
// Base class for expression nodes with two operands, expr1 and expr2.
class binary_expr : public expression {
161
binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
162
// The node owns both operands.
~binary_expr() { delete expr1; delete expr2; }
163
// Still pure virtual: each concrete subclass supplies its own evaluation.
void evaluate(int, const reference &, string &, substring_position &) = 0;
165
// Union of the operands' flags; a null operand contributes none.
return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
169
// Binary node built for `|' in the grammar: expr2 is used when expr1
// produces no text (see alternative_expr::evaluate).
class alternative_expr : public binary_expr {
171
alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
172
void evaluate(int, const reference &, string &, substring_position &);
175
// Binary node for two expressions written one after the other; both are
// evaluated in order (see list_expr::evaluate).
class list_expr : public binary_expr {
177
list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
178
void evaluate(int, const reference &, string &, substring_position &);
181
// Binary node built for `~' in the grammar; it deals with a trailing '-'
// in expr1's output (see substitute_expr::evaluate).
class substitute_expr : public binary_expr {
183
substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
184
void evaluate(int, const reference &, string &, substring_position &);
187
// Base class for expression nodes with three operands.
class ternary_expr : public expression {
193
ternary_expr(expression *e1, expression *e2, expression *e3)
194
: expr1(e1), expr2(e2), expr3(e3) { }
195
// The node owns all three operands.
~ternary_expr() { delete expr1; delete expr2; delete expr3; }
196
// Still pure virtual: each concrete subclass supplies its own evaluation.
void evaluate(int, const reference &, string &, substring_position &) = 0;
198
// Union of the operands' flags; a null operand contributes none.
return ((expr1 ? expr1->analyze() : 0)
199
| (expr2 ? expr2->analyze() : 0)
200
| (expr3 ? expr3->analyze() : 0));
204
// Ternary node built for `?:' and `&' in the grammar: expr1 is the test,
// and expr2 and expr3 are the alternatives (see conditional_expr::evaluate).
class conditional_expr : public ternary_expr {
206
conditional_expr(expression *e1, expression *e2, expression *e3)
207
: ternary_expr(e1, e2, e3) { }
208
void evaluate(int, const reference &, string &, substring_position &);
211
// Parsed forms of the label specifications, assigned by set_label_spec,
// set_date_label_spec and set_short_label_spec respectively; 0 until then.
static expression *parsed_label = 0;
212
static expression *parsed_date_label = 0;
213
static expression *parsed_short_label = 0;
215
// Assigned by the grammar's top-level action; the set_*_spec functions
// copy it into one of the pointers above.
static expression *parse_result;
224
/* a digit sequence: how many digits it has and its numeric value */
struct { int ndigits; int val; } dig;
225
/* a literal's text, as a start offset and length within `literals' */
struct { int start; int len; } str;
228
/* uppercase or lowercase letter */
229
%token <num> TOKEN_LETTER
230
/* literal characters */
231
%token <str> TOKEN_LITERAL
233
/* the lexer stores the digit's numeric value (c - '0') in num */
%token <num> TOKEN_DIGIT
235
%type <expr> conditional
236
%type <expr> alternative
239
%type <expr> substitute
240
%type <expr> optional_conditional
243
%type <num> optional_number
250
/* Top-level action: store the parsed expression, wrapped in an
   analyzed_expr so that its analysis is cached; a null expression
   is stored as 0. */
{ parse_result = ($1 ? new analyzed_expr($1) : 0); }
256
| alternative '?' optional_conditional ':' conditional
257
{ $$ = new conditional_expr($1, $3, $5); }
260
optional_conditional:
270
| alternative '|' list
271
{ $$ = new alternative_expr($1, $3); }
272
| alternative '&' list
273
/* `&' builds a conditional with no third operand */
{ $$ = new conditional_expr($1, $3, 0); }
280
{ $$ = new list_expr($1, $2); }
286
| substitute '~' string
287
{ $$ = new substitute_expr($1, $3); }
292
{ $$ = new at_expr; }
295
$$ = new literal_expr(literals.contents() + $1.start,
299
{ $$ = new field_expr($1, 0); }
300
| TOKEN_LETTER number
301
/* the number written in the specification is stored minus one */
{ $$ = new field_expr($1, $2 - 1); }
309
$$ = new format_expr($2);
312
command_error("unrecognized format `%1'", char($2));
313
/* after reporting the error, fall back to format 'a' */
$$ = new format_expr('a');
320
/* digits: the digit count becomes the width and the value becomes
   first_number */
$$ = new format_expr('0', $2.ndigits, $2.val);
322
| string '.' flag TOKEN_LETTER optional_number
/* the letter ($4) selects the function applied to the string; an
   unrecognized letter is reported as an unknown function */
326
$$ = new map_expr($1, lowercase);
329
$$ = new map_expr($1, uppercase);
332
$$ = new map_expr($1, capitalize);
335
$$ = new map_expr($1, reverse_name);
338
$$ = new map_expr($1, abbreviate_name);
341
$$ = new extractor_expr($1, find_year, $3);
344
$$ = new extractor_expr($1, find_last_name, $3);
348
command_error("unknown function `%1'", char($4));
354
{ $$ = new truncate_expr($1, $3); }
356
/* the count is negated here; truncate_expr::evaluate passes -n to
   last_part */
{ $$ = new truncate_expr($1, -$3); }
358
{ $$ = new star_expr($1); }
359
| '(' optional_conditional ')'
361
| '<' optional_conditional '>'
362
{ $$ = new separator_expr($2); }
381
/* accumulate a decimal number, counting its digits */
{ $$.ndigits = 1; $$.val = $1; }
383
{ $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
398
/* bison defines const to be empty unless __STDC__ is defined, which it
399
isn't under cfront */
405
// Lexer input state, initialized by the set_*_spec functions: spec_ptr is
// the read position, spec_end points at the terminating '\0' of the
// specification, and spec_cur is the text that yyerror quotes when it
// reports an error.
const char *spec_ptr;
406
const char *spec_end;
407
const char *spec_cur;
411
// Lexer body (the function header is not shown in this excerpt).
// Skip leading white space.
while (spec_ptr < spec_end && csspace(*spec_ptr))
414
// Nothing left to read.
if (spec_ptr >= spec_end)
416
unsigned char c = *spec_ptr++;
422
// Numeric value of a digit character.
yylval.num = c - '0';
426
// Literal: its text is accumulated in `literals'; remember where it starts.
yylval.str.start = literals.length();
427
for (; spec_ptr < spec_end; spec_ptr++) {
428
if (*spec_ptr == '\'') {
429
// A quote followed by another quote is tested for here; presumably it
// stands for one literal quote, while a lone quote ends the literal.
if (++spec_ptr < spec_end && *spec_ptr == '\'')
432
yylval.str.len = literals.length() - yylval.str.start;
433
return TOKEN_LITERAL;
437
literals += *spec_ptr;
439
// The input ended inside the literal: return what was collected.
yylval.str.len = literals.length() - yylval.str.start;
440
return TOKEN_LITERAL;
445
// Install label_spec as the main label specification.  The lexer pointers
// are set to span the string up to its terminating '\0', and the parser's
// parse_result is stored in parsed_label.  The parse itself and the return
// statements are on lines not shown in this excerpt.
int set_label_spec(const char *label_spec)
447
spec_cur = spec_ptr = label_spec;
448
spec_end = strchr(label_spec, '\0');
453
parsed_label = parse_result;
457
// Install label_spec as the date label specification.  The lexer pointers
// are set to span the string up to its terminating '\0'; the previous
// parsed_date_label is deleted and replaced by parse_result.  The parse
// itself and the return statements are on lines not shown in this excerpt.
int set_date_label_spec(const char *label_spec)
459
spec_cur = spec_ptr = label_spec;
460
spec_end = strchr(label_spec, '\0');
464
delete parsed_date_label;
465
parsed_date_label = parse_result;
469
// Install label_spec as the short label specification.  The lexer pointers
// are set to span the string up to its terminating '\0'; the previous
// parsed_short_label is deleted and replaced by parse_result.  The parse
// itself and the return statements are on lines not shown in this excerpt.
int set_short_label_spec(const char *label_spec)
471
spec_cur = spec_ptr = label_spec;
472
spec_end = strchr(label_spec, '\0');
476
delete parsed_short_label;
477
parsed_short_label = parse_result;
481
// Parser error hook: report `message' through command_error.  When
// spec_cur is before spec_end the report quotes the text at spec_cur;
// otherwise it says the error is at the end of the string.
void yyerror(const char *message)
483
if (spec_cur < spec_end)
484
command_error("label specification %1 before `%2'", message, spec_cur);
486
command_error("label specification %1 at end of string",
490
// Append the reference's authors to result, either through
// ref.canonicalize_authors or by copying the text returned by
// ref.get_authors.  The test choosing between the two is on a line not
// shown; presumably it is `tentative'.
void at_expr::evaluate(int tentative, const reference &ref,
491
string &result, substring_position &)
494
ref.canonicalize_authors(result);
496
const char *end, *start = ref.get_authors(&end);
498
result.append(start, end - start);
502
// Append a number for the reference to result.  The number is the count
// of the reference's label_info, or ref.get_number() when it has none.
// It is rendered either by format_serial (as num + 1) or as decimal
// num + first_number; `pad' is how far the decimal text falls short of
// `width'.  The tests choosing the rendering and the padding loop are on
// lines not shown.
void format_expr::evaluate(int tentative, const reference &ref,
503
string &result, substring_position &)
507
const label_info *lp = ref.get_label_ptr();
508
int num = lp == 0 ? ref.get_number() : lp->count;
510
result += format_serial(type, num + 1);
512
const char *ptr = i_to_a(num + first_number);
513
int pad = width - strlen(ptr);
520
// Render n according to the format character c.  The text is built in a
// function-local static buffer; assuming that buffer is what is returned,
// the result is only valid until the next call.
static const char *format_serial(char c, int n)
523
static char buf[128]; // more than enough.
529
// troff uses z and w to represent 10000 and 5000 in Roman
530
// numerals; I can find no historical basis for this usage
531
// Lowercase numeral letters for 'i', uppercase otherwise.
const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
538
// One decimal place per iteration, from thousands down to units; s
// advances two letters per place.
for (int i = 1000; i > 0; i /= 10, s += 2) {
585
// this is derived from troff/reg.c
592
// Offset d (counted from 1) from the base letter c.
*p++ = c + d - 1; // ASCII dependent
612
// Append to result the text of field `name' of the reference, narrowed by
// nth_field(number, ...) to one sub-field.  Any checks on the pointers
// returned are on lines not shown.
void field_expr::evaluate(int, const reference &ref,
613
string &result, substring_position &)
616
const char *start = ref.get_field(name, &end);
618
start = nth_field(number, start, &end);
620
result.append(start, end - start);
624
void literal_expr::evaluate(int, const reference &,
625
string &result, substring_position &)
630
// Take ownership of e (through unary_expr) and analyze it once, caching
// the flags; a null e gives flags of 0.
analyzed_expr::analyzed_expr(expression *e)
631
: unary_expr(e), flags(e ? e->analyze() : 0)
635
// Evaluation is simply forwarded to the wrapped expression with the same
// arguments.
void analyzed_expr::evaluate(int tentative, const reference &ref,
636
string &result, substring_position &pos)
639
expr->evaluate(tentative, ref, result, pos);
642
// Evaluate the operand only when a condition holds.  The visible term of
// that condition requires the reference to have no label_info or a
// label_info whose total exceeds 1; the other terms are on lines not shown.
void star_expr::evaluate(int tentative, const reference &ref,
643
string &result, substring_position &pos)
645
const label_info *lp = ref.get_label_ptr();
647
&& (lp == 0 || lp->total > 1)
649
expr->evaluate(tentative, ref, result, pos);
652
// Evaluate the operand and record in pos where its output lies in result:
// pos.start is the length of result before the operand ran, and
// pos.length is the number of characters it added.  is_first notes whether
// pos.start was still negative on entry; the test that uses it is on a
// line not shown.
void separator_expr::evaluate(int tentative, const reference &ref,
653
string &result, substring_position &pos)
655
int start_length = result.length();
656
int is_first = pos.start < 0;
658
expr->evaluate(tentative, ref, result, pos);
660
pos.start = start_length;
661
pos.length = result.length() - start_length;
665
// Evaluate the operand into a temporary string and call func with the
// bounds of that text and with result.  The operand is given a scratch
// substring_position rather than the caller's.
void map_expr::evaluate(int tentative, const reference &ref,
666
string &result, substring_position &)
670
substring_position temp_pos;
671
expr->evaluate(tentative, ref, temp, temp_pos);
672
(*func)(temp.contents(), temp.contents() + temp.length(), result);
676
// Evaluate the operand into a temporary string and run func over it to
// obtain `start' (and, presumably through the argument not shown, `end').
// The three appends below copy the text before start, the text from start
// to end, and the text after end.  Which one runs is selected on lines not
// shown; presumably by `part'.
void extractor_expr::evaluate(int tentative, const reference &ref,
677
string &result, substring_position &)
681
substring_position temp_pos;
682
expr->evaluate(tentative, ref, temp, temp_pos);
683
const char *end, *start = (*func)(temp.contents(),
684
temp.contents() + temp.length(),
689
result.append(temp.contents(), start - temp.contents());
695
result.append(start, end - start);
699
result.append(end, temp.contents() + temp.length() - end);
707
// Append leading tokens of the text between ptr and end to result.
// Tokens for which sortify_non_empty is nonzero are counted against len.
// Once len of them have been seen, the next one makes `--len < 0' true;
// the action taken then is on a line not shown, presumably stopping.
// Counting tokens and accent tokens are appended; other tokens are not.
static void first_part(int len, const char *ptr, const char *end,
711
const char *token_start = ptr;
712
if (!get_token(&ptr, end))
714
const token_info *ti = lookup_token(token_start, ptr);
715
int counts = ti->sortify_non_empty(token_start, ptr);
716
if (counts && --len < 0)
718
if (counts || ti->is_accent())
719
result.append(token_start, ptr - token_start);
723
// Append the trailing part of the text between ptr and end to result, in
// two passes.  The first pass scans tokens and tests sortify_non_empty for
// each; presumably it accumulates `count'.  The second pass skips
// count - len such tokens, and first_part then handles the remainder.
static void last_part(int len, const char *ptr, const char *end,
726
const char *start = ptr;
729
const char *token_start = ptr;
730
if (!get_token(&ptr, end))
732
const token_info *ti = lookup_token(token_start, ptr);
733
if (ti->sortify_non_empty(token_start, ptr))
737
// Number of counting tokens to pass over before the kept part begins.
int skip = count - len;
740
const char *token_start = ptr;
741
if (!get_token(&ptr, end))
743
const token_info *ti = lookup_token(token_start, ptr);
744
if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
750
first_part(len, ptr, end, result);
753
// Evaluate the operand into a temporary string and append part of it to
// result, through first_part(n, ...) or last_part(-n, ...).  The test
// choosing between them is on a line not shown; presumably the sign of n.
void truncate_expr::evaluate(int tentative, const reference &ref,
754
string &result, substring_position &)
758
substring_position temp_pos;
759
expr->evaluate(tentative, ref, temp, temp_pos);
760
const char *start = temp.contents();
761
const char *end = start + temp.length();
763
first_part(n, start, end, result);
765
last_part(-n, start, end, result);
769
// Evaluate expr1; if that left result no longer than it was, evaluate
// expr2 (when there is one) instead.
void alternative_expr::evaluate(int tentative, const reference &ref,
770
string &result, substring_position &pos)
772
int start_length = result.length();
774
expr1->evaluate(tentative, ref, result, pos);
775
if (result.length() == start_length && expr2)
776
expr2->evaluate(tentative, ref, result, pos);
779
// Evaluate expr1 and then expr2 with the same result string and position.
// Any null checks are on lines not shown.
void list_expr::evaluate(int tentative, const reference &ref,
780
string &result, substring_position &pos)
783
expr1->evaluate(tentative, ref, result, pos);
785
expr2->evaluate(tentative, ref, result, pos);
788
// Evaluate expr1; if it appended text ending in '-', remove that
// character.  expr2 is evaluated afterwards.  Whether that happens only
// in the '-' case is not visible here; refer(1) describes `~' as replacing
// a trailing - with expr2.
void substitute_expr::evaluate(int tentative, const reference &ref,
789
string &result, substring_position &pos)
791
int start_length = result.length();
793
expr1->evaluate(tentative, ref, result, pos);
794
if (result.length() > start_length && result[result.length() - 1] == '-') {
795
// TODO: check whether the range recorded in pos covers the '-' being
// removed; this code does not adjust pos.
796
result.set_length(result.length() - 1);
798
expr2->evaluate(tentative, ref, result, pos);
802
// Evaluate the test expr1 into a scratch string, so its text never reaches
// result.  If it produced any text, evaluate expr2 into result; expr3 is
// evaluated on the other path (the `else' and any null checks are on lines
// not shown).
void conditional_expr::evaluate(int tentative, const reference &ref,
803
string &result, substring_position &pos)
806
substring_position temp_pos;
808
expr1->evaluate(tentative, ref, temp, temp_pos);
809
if (temp.length() > 0) {
811
expr2->evaluate(tentative, ref, result, pos);
815
expr3->evaluate(tentative, ref, result, pos);
819
// If there is a label specification and it depends on the reference,
// evaluate it tentatively (tentative = 1) into `label' and look up the
// label_info for that text.  lookup_label returns the existing entry when
// the same text has been seen before.
void reference::pre_compute_label()
821
if (parsed_label != 0
822
&& (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
824
substring_position temp_pos;
825
parsed_label->evaluate(1, *this, label, temp_pos);
826
label_ptr = lookup_label(label);
830
// Compute the final (tentative = 0) labels for this reference: the main
// label with its separator position, the short label when short labels
// are enabled and a short specification exists, and, when a date label
// specification exists, its value into new_date.  Finally the label_info
// count is incremented; format_expr::evaluate reads that count.  Guards
// around these statements are on lines not shown.
void reference::compute_label()
834
parsed_label->evaluate(0, *this, label, separator_pos);
835
if (short_label_flag && parsed_short_label)
836
parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
839
if (parsed_date_label) {
840
substring_position temp_pos;
841
parsed_date_label->evaluate(0, *this, new_date, temp_pos);
846
label_ptr->count += 1;
849
// Only the statement below is visible in this excerpt.  Setting total to 2
// makes the `total > 1' test in star_expr::evaluate succeed.
void reference::immediate_compute_label()
852
label_ptr->total = 2; // force use of disambiguator
856
// Merge this reference's label with those of the references in v,
// writing the merged text to result.  Dispatches to
// merge_labels_by_number when abbreviate_label_ranges is set and to
// merge_labels_by_parts otherwise, returning whatever the helper returns.
int reference::merge_labels(reference **v, int n, label_type type,
859
if (abbreviate_label_ranges)
860
return merge_labels_by_number(v, n, type, result);
862
return merge_labels_by_parts(v, n, type, result);
865
// Try to merge this reference's label with the labels of consecutively
// numbered references in v into a range.  v[0] and v[1] must be numbered
// num + 1 and num + 2.  The scan then continues while v[i] is numbered
// num + i + 1, and result becomes this label, label_range_indicator, and
// the label of the last reference in the run.  The return statements are
// on lines not shown.
int reference::merge_labels_by_number(reference **v, int n, label_type type,
870
int num = get_number();
871
// Only merge three or more labels.
872
if (v[0]->get_number() != num + 1
873
|| v[1]->get_number() != num + 2)
876
// Extend the run past the first two; i ends up one past its last member.
for (i = 2; i < n; i++)
877
if (v[i]->get_number() != num + i + 1)
879
result = get_label(type);
880
result += label_range_indicator;
881
result += v[i - 1]->get_label(type);
885
// Return the separator position matching the requested label type: the
// short label's when type is SHORT_LABEL and short labels are enabled,
// otherwise the main label's.
const substring_position &reference::get_separator_pos(label_type type) const
887
if (type == SHORT_LABEL && short_label_flag)
888
return short_separator_pos;
890
return separator_pos;
893
// Return the label of the requested type.  The same test as in
// get_separator_pos is applied; the return statements are on lines not
// shown.
const string &reference::get_label(label_type type) const
895
if (type == SHORT_LABEL && short_label_flag)
901
// Try to merge labels that share a leading part.  This label and v[0]'s
// are compared with memcmp, and their separator start positions must be
// equal.  For each reference merged, separate_label_second_parts and the
// portion of its label after its separator are appended to result.  The
// loop continues while v[i] has the same separator start and passes the
// memcmp test.  Several conditions and the return statements are on lines
// not shown.
int reference::merge_labels_by_parts(reference **v, int n, label_type type,
906
const string &lb = get_label(type);
907
const substring_position &sp = get_separator_pos(type);
909
|| sp.start != v[0]->get_separator_pos(type).start
910
|| memcmp(lb.contents(), v[0]->get_label(type).contents(),
916
result += separate_label_second_parts;
917
const substring_position &s = v[i]->get_separator_pos(type);
918
// Index just past the separator in v[i]'s label.
int sep_end_pos = s.start + s.length;
919
result.append(v[i]->get_label(type).contents() + sep_end_pos,
920
v[i]->get_label(type).length() - sep_end_pos);
922
&& sp.start == v[i]->get_separator_pos(type).start
923
&& memcmp(lb.contents(), v[i]->get_label(type).contents(),
930
// Describe label s as a region of label_pool starting at the pool's
// current length and as long as s; count starts at 0 and total at 1.
// The constructor body is not shown; presumably it appends s to
// label_pool.
label_info::label_info(const string &s)
931
: start(label_pool.length()), length(s.length()), count(0), total(1)
936
// Hash table of label_info pointers used by lookup_label, which allocates
// it on first use.  label_table_size is the number of slots and
// label_table_used the number of entries stored.
static label_info **label_table = 0;
937
static int label_table_size = 0;
938
static int label_table_used = 0;
940
// Find the label_info whose text equals `label', creating and inserting a
// new one when there is none.  The table is allocated with 17 slots on
// first use and is rebuilt at a larger size when it becomes more than half
// full.  Statements between the visible lines (loop bodies, the hit-path
// return, the final return) are not shown in this excerpt.
label_info *lookup_label(const string &label)
942
if (label_table == 0) {
943
label_table = new label_info *[17];
944
label_table_size = 17;
945
for (int i = 0; i < 17; i++)
948
unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
950
// Probe from slot h; the step expression wraps to the last slot.
for (ptr = label_table + h;
953
? (ptr = label_table + label_table_size - 1)
955
// An entry matches when its length and its bytes in label_pool equal
// those of label.
if ((*ptr)->length == label.length()
956
&& memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
957
label.length()) == 0) {
961
// No match: store a new entry in the slot where probing stopped.
label_info *result = *ptr = new label_info(label);
962
// Grow the table once it is more than half full.
if (++label_table_used * 2 > label_table_size) {
964
label_info **old_table = label_table;
965
int old_size = label_table_size;
966
label_table_size = next_size(label_table_size);
967
label_table = new label_info *[label_table_size];
969
for (i = 0; i < label_table_size; i++)
971
// Rehash each old entry from its text in label_pool.
for (i = 0; i < old_size; i++)
973
unsigned h = hash_string(label_pool.contents() + old_table[i]->start,
974
old_table[i]->length);
976
for (p = label_table + (h % label_table_size);
979
? (p = label_table + label_table_size - 1)
991
for (int i = 0; i < label_table_size; i++) {
992
delete label_table[i];
995
label_table_used = 0;
999
static void consider_authors(reference **start, reference **end, int i);
1001
// Compute the labels of the n references in v.  When the label
// specification uses the author list (CONTAINS_AT) and sort_fields begins
// with "A+", consider_authors is first run over the whole array; the first
// term of that condition is on a line not shown.  compute_label is then
// called on every reference in order.
void compute_labels(reference **v, int n)
1004
&& (parsed_label->analyze() & expression::CONTAINS_AT)
1005
&& sort_fields.length() >= 2
1006
&& sort_fields[0] == 'A'
1007
&& sort_fields[1] == '+')
1008
consider_authors(v, v + n, 0);
1009
for (int i = 0; i < n; i++)
1010
v[i]->compute_label();
1014
/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1015
where 0 <= i <= N if there exists a reference with a list of authors
1016
<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1017
and Aj = Bj for 0 <= j < i. In this case we can't say ``A0,
1018
A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1019
<B0,B1,...,BM>. If a reference needs author i we only have to call
1020
need_author(j) for some j >= i such that the reference also needs
1023
/* This function handles 2 tasks:
1024
determine which authors are needed (cannot be elided with et al.);
1025
determine which authors can have only last names in the labels.
1027
References >= start and < end have the same first i author names.
1028
Also they're sorted by A+. */
1030
// Recursive worker over the references in [start, end), which share their
// first i author names.  It calls need_author and
// set_last_name_unambiguous on them as described in the comments below.
static void consider_authors(reference **start, reference **end, int i)
1034
reference **p = start;
1035
// Step past the leading references that have no author i.
if (i >= (*p)->get_nauthors()) {
1036
for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1038
if (p < end && i > 0) {
1039
// If we have an author list <A B C> and an author list <A B C D>,
1040
// then both lists need C.
1041
for (reference **q = start; q < end; q++)
1042
(*q)->need_author(i - 1);
1047
// last_name_start begins a run of references whose author i has the same
// last name; name_start begins a sub-run with the same full name.
reference **last_name_start = p;
1048
reference **name_start = p;
1050
p < end && i < (*p)->get_nauthors()
1051
&& same_author_last_name(**last_name_start, **p, i);
1053
// A different full name ends the sub-run; recurse on it for author i + 1.
if (!same_author_name(**name_start, **p, i)) {
1054
consider_authors(name_start, p, i + 1);
1058
consider_authors(name_start, p, i + 1);
1059
// Only one full name occurred in this last-name run, so the last name
// alone is marked unambiguous for these references.
if (last_name_start == name_start) {
1060
for (reference **q = last_name_start; q < p; q++)
1061
(*q)->set_last_name_unambiguous(i);
1063
// If we have an author list <A B C D> and <A B C E>, then the lists
1064
// need author D and E respectively.
1065
if (name_start > start || p < end) {
1066
for (reference **q = last_name_start; q < p; q++)
1067
(*q)->need_author(i);
1072
// Return nonzero when the text that get_sort_field(0, n, 0, ...) yields
// for r1 and r2 has the same length and the same bytes.  Going by the
// function name this is author n's last name -- confirm against
// reference::get_sort_field.
int same_author_last_name(const reference &r1, const reference &r2, int n)
1075
const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1078
const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1080
return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1083
// Return nonzero when the text that get_sort_field(0, n, -1, ...) yields
// for r1 and r2 has the same length and the same bytes.  Going by the
// function name this is author n's whole name -- confirm against
// reference::get_sort_field.
int same_author_name(const reference &r1, const reference &r2, int n)
1086
const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1089
const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1091
return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1095
// Add i to the set.  Membership is stored as bit (i & 7) of element bytei
// of v.  bytei is computed on a line not shown; presumably it is i / 8.
void int_set::set(int i)
1099
// Grow v so that element bytei exists.  The loop runs over the newly
// added elements; its body is not shown (presumably it clears them).
if (bytei >= v.length()) {
1100
int old_length = v.length();
1101
v.set_length(bytei + 1);
1102
for (int j = old_length; j <= bytei; j++)
1105
v[bytei] |= 1 << (i & 7);
1108
// Return 1 if bit (i & 7) of element bytei of v is set, and 0 otherwise.
// An index beyond the end of v also returns 0.
int int_set::get(int i) const
1112
return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1115
// Record that author i can be identified by last name alone; get_authors
// tests this set when building the author list.
void reference::set_last_name_unambiguous(int i)
1117
last_name_unambiguous.set(i);
1120
// Raise last_needed_author to n if n is larger, so it keeps the highest
// author index requested so far.
void reference::need_author(int n)
1122
if (n > last_needed_author)
1123
last_needed_author = n;
1126
// Store in *end the end of the formatted author list, whose text is held
// in `authors'.  The return statement is not shown; presumably it returns
// `start'.  The list is built on the first call and then reused; the
// function is const, so the cache members are updated through a cast.
const char *reference::get_authors(const char **end) const
1128
if (!computed_authors) {
1129
((reference *)this)->computed_authors = 1;
1130
string &result = ((reference *)this)->authors;
1131
int na = get_nauthors();
1133
for (int i = 0; i < na; i++) {
1134
// Use only the last name when it has been marked unambiguous;
// otherwise get_author supplies the text.
if (last_name_unambiguous.get(i)) {
1135
const char *e, *start = get_author_last_name(i, &e);
1137
result.append(start, e - start);
1140
const char *e, *start = get_author(i, &e);
1142
result.append(start, e - start);
1144
// Conditions tested once the last needed author has been written: et_al
// is non-empty, at least et_al_min_elide authors would be dropped, and
// there are at least et_al_min_total authors.  The action taken is on
// lines not shown; presumably et_al is appended and the loop ends.
if (i == last_needed_author
1145
&& et_al.length() > 0
1146
&& et_al_min_elide > 0
1147
&& last_needed_author + et_al_min_elide < na
1148
&& na >= et_al_min_total) {
1154
// Joining text between authors; the tests selecting the first and last
// of these three forms are on lines not shown.
result += join_authors_exactly_two;
1155
else if (i < na - 2)
1156
result += join_authors_default;
1158
result += join_authors_last_two;
1162
const char *start = authors.contents();
1163
*end = start + authors.length();
1167
int reference::get_nauthors() const
1172
for (na = 0; get_author(na, &dummy) != 0; na++)
1174
((reference *)this)->nauthors = na;