2 Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING. If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
30 void yyerror(const char *);
33 static const char *format_serial(char c, int n);
40 label_info(const string &);
43 label_info *lookup_label(const string &label);
47 // Does the tentative label depend on the reference?
48 CONTAINS_VARIABLE = 01,
53 virtual ~expression() { }
54 virtual void evaluate(int, const reference &, string &,
55 substring_position &) = 0;
56 virtual unsigned analyze() { return 0; }
59 class at_expr : public expression {
62 void evaluate(int, const reference &, string &, substring_position &);
63 unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
66 class format_expr : public expression {
71 format_expr(char c, int w = 0, int f = 1)
72 : type(c), width(w), first_number(f) { }
73 void evaluate(int, const reference &, string &, substring_position &);
74 unsigned analyze() { return CONTAINS_FORMAT; }
77 class field_expr : public expression {
81 field_expr(char nm, int num) : number(num), name(nm) { }
82 void evaluate(int, const reference &, string &, substring_position &);
83 unsigned analyze() { return CONTAINS_VARIABLE; }
86 class literal_expr : public expression {
89 literal_expr(const char *ptr, int len) : s(ptr, len) { }
90 void evaluate(int, const reference &, string &, substring_position &);
93 class unary_expr : public expression {
97 unary_expr(expression *e) : expr(e) { }
98 ~unary_expr() { delete expr; }
99 void evaluate(int, const reference &, string &, substring_position &) = 0;
100 unsigned analyze() { return expr ? expr->analyze() : 0; }
103 // This caches the analysis of an expression.
105 class analyzed_expr : public unary_expr {
108 analyzed_expr(expression *);
109 void evaluate(int, const reference &, string &, substring_position &);
110 unsigned analyze() { return flags; }
113 class star_expr : public unary_expr {
115 star_expr(expression *e) : unary_expr(e) { }
116 void evaluate(int, const reference &, string &, substring_position &);
118 return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
123 typedef void map_func(const char *, const char *, string &);
125 class map_expr : public unary_expr {
128 map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
129 void evaluate(int, const reference &, string &, substring_position &);
132 typedef const char *extractor_func(const char *, const char *, const char **);
134 class extractor_expr : public unary_expr {
136 extractor_func *func;
138 enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
139 extractor_expr(expression *e, extractor_func *f, int pt)
140 : unary_expr(e), part(pt), func(f) { }
141 void evaluate(int, const reference &, string &, substring_position &);
144 class truncate_expr : public unary_expr {
147 truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
148 void evaluate(int, const reference &, string &, substring_position &);
151 class separator_expr : public unary_expr {
153 separator_expr(expression *e) : unary_expr(e) { }
154 void evaluate(int, const reference &, string &, substring_position &);
157 class binary_expr : public expression {
162 binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
163 ~binary_expr() { delete expr1; delete expr2; }
164 void evaluate(int, const reference &, string &, substring_position &) = 0;
166 return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
170 class alternative_expr : public binary_expr {
172 alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
173 void evaluate(int, const reference &, string &, substring_position &);
176 class list_expr : public binary_expr {
178 list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
179 void evaluate(int, const reference &, string &, substring_position &);
182 class substitute_expr : public binary_expr {
184 substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
185 void evaluate(int, const reference &, string &, substring_position &);
188 class ternary_expr : public expression {
194 ternary_expr(expression *e1, expression *e2, expression *e3)
195 : expr1(e1), expr2(e2), expr3(e3) { }
196 ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
197 void evaluate(int, const reference &, string &, substring_position &) = 0;
199 return ((expr1 ? expr1->analyze() : 0)
200 | (expr2 ? expr2->analyze() : 0)
201 | (expr3 ? expr3->analyze() : 0));
205 class conditional_expr : public ternary_expr {
207 conditional_expr(expression *e1, expression *e2, expression *e3)
208 : ternary_expr(e1, e2, e3) { }
209 void evaluate(int, const reference &, string &, substring_position &);
212 static expression *parsed_label = 0;
213 static expression *parsed_date_label = 0;
214 static expression *parsed_short_label = 0;
216 static expression *parse_result;
225 struct { int ndigits; int val; } dig;
226 struct { int start; int len; } str;
229 /* uppercase or lowercase letter */
230 %token <num> TOKEN_LETTER
231 /* literal characters */
232 %token <str> TOKEN_LITERAL
234 %token <num> TOKEN_DIGIT
236 %type <expr> conditional
237 %type <expr> alternative
240 %type <expr> substitute
241 %type <expr> optional_conditional
244 %type <num> optional_number
251 { parse_result = ($1 ? new analyzed_expr($1) : 0); }
257 | alternative '?' optional_conditional ':' conditional
258 { $$ = new conditional_expr($1, $3, $5); }
261 optional_conditional:
271 | alternative '|' list
272 { $$ = new alternative_expr($1, $3); }
273 | alternative '&' list
274 { $$ = new conditional_expr($1, $3, 0); }
281 { $$ = new list_expr($1, $2); }
287 | substitute '~' string
288 { $$ = new substitute_expr($1, $3); }
293 { $$ = new at_expr; }
296 $$ = new literal_expr(literals.contents() + $1.start,
300 { $$ = new field_expr($1, 0); }
301 | TOKEN_LETTER number
302 { $$ = new field_expr($1, $2 - 1); }
310 $$ = new format_expr($2);
313 command_error("unrecognized format `%1'", char($2));
314 $$ = new format_expr('a');
321 $$ = new format_expr('0', $2.ndigits, $2.val);
323 | string '.' flag TOKEN_LETTER optional_number
327 $$ = new map_expr($1, lowercase);
330 $$ = new map_expr($1, uppercase);
333 $$ = new map_expr($1, capitalize);
336 $$ = new map_expr($1, reverse_name);
339 $$ = new map_expr($1, abbreviate_name);
342 $$ = new extractor_expr($1, find_year, $3);
345 $$ = new extractor_expr($1, find_last_name, $3);
349 command_error("unknown function `%1'", char($4));
355 { $$ = new truncate_expr($1, $3); }
357 { $$ = new truncate_expr($1, -$3); }
359 { $$ = new star_expr($1); }
360 | '(' optional_conditional ')'
362 | '<' optional_conditional '>'
363 { $$ = new separator_expr($2); }
382 { $$.ndigits = 1; $$.val = $1; }
384 { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
399 /* bison defines const to be empty unless __STDC__ is defined, which it
400 isn't under cfront */
406 const char *spec_ptr;
407 const char *spec_end;
408 const char *spec_cur;
410 static char uppercase_array[] = {
411 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
412 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
413 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
417 static char lowercase_array[] = {
418 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
419 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
420 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
426 while (spec_ptr < spec_end && csspace(*spec_ptr))
429 if (spec_ptr >= spec_end)
431 unsigned char c = *spec_ptr++;
437 yylval.num = c - '0';
441 yylval.str.start = literals.length();
442 for (; spec_ptr < spec_end; spec_ptr++) {
443 if (*spec_ptr == '\'') {
444 if (++spec_ptr < spec_end && *spec_ptr == '\'')
447 yylval.str.len = literals.length() - yylval.str.start;
448 return TOKEN_LITERAL;
452 literals += *spec_ptr;
454 yylval.str.len = literals.length() - yylval.str.start;
455 return TOKEN_LITERAL;
460 int set_label_spec(const char *label_spec)
462 spec_cur = spec_ptr = label_spec;
463 spec_end = strchr(label_spec, '\0');
468 parsed_label = parse_result;
472 int set_date_label_spec(const char *label_spec)
474 spec_cur = spec_ptr = label_spec;
475 spec_end = strchr(label_spec, '\0');
479 delete parsed_date_label;
480 parsed_date_label = parse_result;
484 int set_short_label_spec(const char *label_spec)
486 spec_cur = spec_ptr = label_spec;
487 spec_end = strchr(label_spec, '\0');
491 delete parsed_short_label;
492 parsed_short_label = parse_result;
496 void yyerror(const char *message)
498 if (spec_cur < spec_end)
499 command_error("label specification %1 before `%2'", message, spec_cur);
501 command_error("label specification %1 at end of string",
505 void at_expr::evaluate(int tentative, const reference &ref,
506 string &result, substring_position &)
509 ref.canonicalize_authors(result);
511 const char *end, *start = ref.get_authors(&end);
513 result.append(start, end - start);
517 void format_expr::evaluate(int tentative, const reference &ref,
518 string &result, substring_position &)
522 const label_info *lp = ref.get_label_ptr();
523 int num = lp == 0 ? ref.get_number() : lp->count;
525 result += format_serial(type, num + 1);
527 const char *ptr = i_to_a(num + first_number);
528 int pad = width - strlen(ptr);
535 static const char *format_serial(char c, int n)
538 static char buf[128]; // more than enough.
544 // troff uses z and w to represent 10000 and 5000 in Roman
545 // numerals; I can find no historical basis for this usage
546 const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
553 for (int i = 1000; i > 0; i /= 10, s += 2) {
600 // this is derived from troff/reg.c
607 *p++ = c == 'a' ? lowercase_array[d - 1] :
608 uppercase_array[d - 1];
628 void field_expr::evaluate(int, const reference &ref,
629 string &result, substring_position &)
632 const char *start = ref.get_field(name, &end);
634 start = nth_field(number, start, &end);
636 result.append(start, end - start);
640 void literal_expr::evaluate(int, const reference &,
641 string &result, substring_position &)
646 analyzed_expr::analyzed_expr(expression *e)
647 : unary_expr(e), flags(e ? e->analyze() : 0)
651 void analyzed_expr::evaluate(int tentative, const reference &ref,
652 string &result, substring_position &pos)
655 expr->evaluate(tentative, ref, result, pos);
658 void star_expr::evaluate(int tentative, const reference &ref,
659 string &result, substring_position &pos)
661 const label_info *lp = ref.get_label_ptr();
663 && (lp == 0 || lp->total > 1)
665 expr->evaluate(tentative, ref, result, pos);
668 void separator_expr::evaluate(int tentative, const reference &ref,
669 string &result, substring_position &pos)
671 int start_length = result.length();
672 int is_first = pos.start < 0;
674 expr->evaluate(tentative, ref, result, pos);
676 pos.start = start_length;
677 pos.length = result.length() - start_length;
681 void map_expr::evaluate(int tentative, const reference &ref,
682 string &result, substring_position &)
686 substring_position temp_pos;
687 expr->evaluate(tentative, ref, temp, temp_pos);
688 (*func)(temp.contents(), temp.contents() + temp.length(), result);
692 void extractor_expr::evaluate(int tentative, const reference &ref,
693 string &result, substring_position &)
697 substring_position temp_pos;
698 expr->evaluate(tentative, ref, temp, temp_pos);
699 const char *end, *start = (*func)(temp.contents(),
700 temp.contents() + temp.length(),
705 result.append(temp.contents(), start - temp.contents());
711 result.append(start, end - start);
715 result.append(end, temp.contents() + temp.length() - end);
723 static void first_part(int len, const char *ptr, const char *end,
727 const char *token_start = ptr;
728 if (!get_token(&ptr, end))
730 const token_info *ti = lookup_token(token_start, ptr);
731 int counts = ti->sortify_non_empty(token_start, ptr);
732 if (counts && --len < 0)
734 if (counts || ti->is_accent())
735 result.append(token_start, ptr - token_start);
739 static void last_part(int len, const char *ptr, const char *end,
742 const char *start = ptr;
745 const char *token_start = ptr;
746 if (!get_token(&ptr, end))
748 const token_info *ti = lookup_token(token_start, ptr);
749 if (ti->sortify_non_empty(token_start, ptr))
753 int skip = count - len;
756 const char *token_start = ptr;
757 if (!get_token(&ptr, end))
759 const token_info *ti = lookup_token(token_start, ptr);
760 if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
766 first_part(len, ptr, end, result);
769 void truncate_expr::evaluate(int tentative, const reference &ref,
770 string &result, substring_position &)
774 substring_position temp_pos;
775 expr->evaluate(tentative, ref, temp, temp_pos);
776 const char *start = temp.contents();
777 const char *end = start + temp.length();
779 first_part(n, start, end, result);
781 last_part(-n, start, end, result);
785 void alternative_expr::evaluate(int tentative, const reference &ref,
786 string &result, substring_position &pos)
788 int start_length = result.length();
790 expr1->evaluate(tentative, ref, result, pos);
791 if (result.length() == start_length && expr2)
792 expr2->evaluate(tentative, ref, result, pos);
795 void list_expr::evaluate(int tentative, const reference &ref,
796 string &result, substring_position &pos)
799 expr1->evaluate(tentative, ref, result, pos);
801 expr2->evaluate(tentative, ref, result, pos);
804 void substitute_expr::evaluate(int tentative, const reference &ref,
805 string &result, substring_position &pos)
807 int start_length = result.length();
809 expr1->evaluate(tentative, ref, result, pos);
810 if (result.length() > start_length && result[result.length() - 1] == '-') {
811 // ought to see if pos covers the -
812 result.set_length(result.length() - 1);
814 expr2->evaluate(tentative, ref, result, pos);
818 void conditional_expr::evaluate(int tentative, const reference &ref,
819 string &result, substring_position &pos)
822 substring_position temp_pos;
824 expr1->evaluate(tentative, ref, temp, temp_pos);
825 if (temp.length() > 0) {
827 expr2->evaluate(tentative, ref, result, pos);
831 expr3->evaluate(tentative, ref, result, pos);
835 void reference::pre_compute_label()
837 if (parsed_label != 0
838 && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
840 substring_position temp_pos;
841 parsed_label->evaluate(1, *this, label, temp_pos);
842 label_ptr = lookup_label(label);
846 void reference::compute_label()
850 parsed_label->evaluate(0, *this, label, separator_pos);
851 if (short_label_flag && parsed_short_label)
852 parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
855 if (parsed_date_label) {
856 substring_position temp_pos;
857 parsed_date_label->evaluate(0, *this, new_date, temp_pos);
862 label_ptr->count += 1;
865 void reference::immediate_compute_label()
868 label_ptr->total = 2; // force use of disambiguator
872 int reference::merge_labels(reference **v, int n, label_type type,
875 if (abbreviate_label_ranges)
876 return merge_labels_by_number(v, n, type, result);
878 return merge_labels_by_parts(v, n, type, result);
881 int reference::merge_labels_by_number(reference **v, int n, label_type type,
886 int num = get_number();
887 // Only merge three or more labels.
888 if (v[0]->get_number() != num + 1
889 || v[1]->get_number() != num + 2)
892 for (i = 2; i < n; i++)
893 if (v[i]->get_number() != num + i + 1)
895 result = get_label(type);
896 result += label_range_indicator;
897 result += v[i - 1]->get_label(type);
901 const substring_position &reference::get_separator_pos(label_type type) const
903 if (type == SHORT_LABEL && short_label_flag)
904 return short_separator_pos;
906 return separator_pos;
909 const string &reference::get_label(label_type type) const
911 if (type == SHORT_LABEL && short_label_flag)
917 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
922 const string &lb = get_label(type);
923 const substring_position &sp = get_separator_pos(type);
925 || sp.start != v[0]->get_separator_pos(type).start
926 || memcmp(lb.contents(), v[0]->get_label(type).contents(),
932 result += separate_label_second_parts;
933 const substring_position &s = v[i]->get_separator_pos(type);
934 int sep_end_pos = s.start + s.length;
935 result.append(v[i]->get_label(type).contents() + sep_end_pos,
936 v[i]->get_label(type).length() - sep_end_pos);
938 && sp.start == v[i]->get_separator_pos(type).start
939 && memcmp(lb.contents(), v[i]->get_label(type).contents(),
946 label_info::label_info(const string &s)
947 : start(label_pool.length()), length(s.length()), count(0), total(1)
952 static label_info **label_table = 0;
953 static int label_table_size = 0;
954 static int label_table_used = 0;
956 label_info *lookup_label(const string &label)
958 if (label_table == 0) {
959 label_table = new label_info *[17];
960 label_table_size = 17;
961 for (int i = 0; i < 17; i++)
964 unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
966 for (ptr = label_table + h;
969 ? (ptr = label_table + label_table_size - 1)
971 if ((*ptr)->length == label.length()
972 && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
973 label.length()) == 0) {
977 label_info *result = *ptr = new label_info(label);
978 if (++label_table_used * 2 > label_table_size) {
980 label_info **old_table = label_table;
981 int old_size = label_table_size;
982 label_table_size = next_size(label_table_size);
983 label_table = new label_info *[label_table_size];
985 for (i = 0; i < label_table_size; i++)
987 for (i = 0; i < old_size; i++)
989 h = hash_string(label_pool.contents() + old_table[i]->start,
990 old_table[i]->length);
992 for (p = label_table + (h % label_table_size);
995 ? (p = label_table + label_table_size - 1)
1007 for (int i = 0; i < label_table_size; i++) {
1008 delete label_table[i];
1011 label_table_used = 0;
1015 static void consider_authors(reference **start, reference **end, int i);
1017 void compute_labels(reference **v, int n)
1020 && (parsed_label->analyze() & expression::CONTAINS_AT)
1021 && sort_fields.length() >= 2
1022 && sort_fields[0] == 'A'
1023 && sort_fields[1] == '+')
1024 consider_authors(v, v + n, 0);
1025 for (int i = 0; i < n; i++)
1026 v[i]->compute_label();
1030 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1031 where 0 <= i <= N if there exists a reference with a list of authors
1032 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1033 and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1034 A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1035 <B0,B1,...,BM>. If a reference needs author i we only have to call
1036 need_author(j) for some j >= i such that the reference also needs
1039 /* This function handles 2 tasks:
1040 determine which authors are needed (cannot be elided with et al.);
1041 determine which authors can have only last names in the labels.
1043 References >= start and < end have the same first i author names.
1044 Also they're sorted by A+. */
1046 static void consider_authors(reference **start, reference **end, int i)
1050 reference **p = start;
1051 if (i >= (*p)->get_nauthors()) {
1052 for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1054 if (p < end && i > 0) {
1055 // If we have an author list <A B C> and an author list <A B C D>,
1056 // then both lists need C.
1057 for (reference **q = start; q < end; q++)
1058 (*q)->need_author(i - 1);
1063 reference **last_name_start = p;
1064 reference **name_start = p;
1066 p < end && i < (*p)->get_nauthors()
1067 && same_author_last_name(**last_name_start, **p, i);
1069 if (!same_author_name(**name_start, **p, i)) {
1070 consider_authors(name_start, p, i + 1);
1074 consider_authors(name_start, p, i + 1);
1075 if (last_name_start == name_start) {
1076 for (reference **q = last_name_start; q < p; q++)
1077 (*q)->set_last_name_unambiguous(i);
1079 // If we have an author list <A B C D> and <A B C E>, then the lists
1080 // need author D and E respectively.
1081 if (name_start > start || p < end) {
1082 for (reference **q = last_name_start; q < p; q++)
1083 (*q)->need_author(i);
1088 int same_author_last_name(const reference &r1, const reference &r2, int n)
1091 const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1093 const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1094 if (!as1 && !as2) return 1; // they are the same
1095 if (!as1 || !as2) return 0;
1096 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1099 int same_author_name(const reference &r1, const reference &r2, int n)
1102 const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1104 const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1105 if (!as1 && !as2) return 1; // they are the same
1106 if (!as1 || !as2) return 0;
1107 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1111 void int_set::set(int i)
1115 if (bytei >= v.length()) {
1116 int old_length = v.length();
1117 v.set_length(bytei + 1);
1118 for (int j = old_length; j <= bytei; j++)
1121 v[bytei] |= 1 << (i & 7);
1124 int int_set::get(int i) const
1128 return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1131 void reference::set_last_name_unambiguous(int i)
1133 last_name_unambiguous.set(i);
1136 void reference::need_author(int n)
1138 if (n > last_needed_author)
1139 last_needed_author = n;
1142 const char *reference::get_authors(const char **end) const
1144 if (!computed_authors) {
1145 ((reference *)this)->computed_authors = 1;
1146 string &result = ((reference *)this)->authors;
1147 int na = get_nauthors();
1149 for (int i = 0; i < na; i++) {
1150 if (last_name_unambiguous.get(i)) {
1151 const char *e, *start = get_author_last_name(i, &e);
1153 result.append(start, e - start);
1156 const char *e, *start = get_author(i, &e);
1158 result.append(start, e - start);
1160 if (i == last_needed_author
1161 && et_al.length() > 0
1162 && et_al_min_elide > 0
1163 && last_needed_author + et_al_min_elide < na
1164 && na >= et_al_min_total) {
1170 result += join_authors_exactly_two;
1171 else if (i < na - 2)
1172 result += join_authors_default;
1174 result += join_authors_last_two;
1178 const char *start = authors.contents();
1179 *end = start + authors.length();
1183 int reference::get_nauthors() const
1188 for (na = 0; get_author(na, &dummy) != 0; na++)
1190 ((reference *)this)->nauthors = na;