1 /* Permuted index for GNU, with keywords in their context.
2 Copyright (C) 1990, 1991, 1993 Free Software Foundation, Inc.
3 Francois Pinard <pinard@iro.umontreal.ca>, 1988.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 const char *version_string = "GNU ptx version 0.3";
26 char *const copyright = "\
27 This program is free software; you can redistribute it and/or modify\n\
28 it under the terms of the GNU General Public License as published by\n\
29 the Free Software Foundation; either version 2, or (at your option)\n\
32 This program is distributed in the hope that it will be useful,\n\
33 but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
34 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
35 GNU General Public License for more details.\n\
37 You should have received a copy of the GNU General Public License\n\
38 along with this program; if not, write to the Free Software\n\
39 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n";
41 /* Reallocation step when swallowing non regular files. The value is not
42 the actual reallocation step, but its base two logarithm. */
43 #define SWALLOW_REALLOC_LOG 12
45 /* Imported from "regex.c". */
53 #else /* not STDC_HEADERS */
55 /* These definitions work, for all 256 characters. */
56 #define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
58 (((unsigned char) (c) >= 'a' && (unsigned char) (c) <= 'f') \
59 || ((unsigned char) (c) >= 'A' && (unsigned char) (c) <= 'F') \
60 || ((unsigned char) (c) >= '0' && (unsigned char) (c) <= '9'))
61 #define islower(c) ((unsigned char) (c) >= 'a' && (unsigned char) (c) <= 'z')
62 #define isupper(c) ((unsigned char) (c) >= 'A' && (unsigned char) (c) <= 'Z')
63 #define isalpha(c) (islower (c) || isupper (c))
64 #define toupper(c) (islower (c) ? (c) - 'a' + 'A' : (c))
66 #endif /* not STDC_HEADERS */
68 #if !defined (isascii) || defined (STDC_HEADERS)
73 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
74 #define ISODIGIT(c) ((c) >= '0' && (c) <= '7')
75 #define HEXTOBIN(c) ((c)>='a'&&(c)<='f' ? (c)-'a'+10 : (c)>='A'&&(c)<='F' ? (c)-'A'+10 : (c)-'0')
76 #define OCTTOBIN(c) ((c) - '0')
80 #include <sys/types.h>
83 #if !defined(S_ISREG) && defined(S_IFREG)
84 #define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
89 #else /* not HAVE_STRING_H */
92 #define strrchr rindex
93 #endif /* not HAVE_STRING_H */
102 #include "bumpalloc.h"
104 #include "gnuregex.h"
111 void *xrealloc (void *, int);
115 /* Global definitions. */
117 const char *program_name; /* name of this program */
118 static int show_help = 0; /* display usage information and exit */
119 static int show_version = 0; /* print the version and exit */
121 /* Program options. */
125 DUMB_FORMAT, /* output for a dumb terminal */
126 ROFF_FORMAT, /* output for `troff' or `nroff' */
127 TEX_FORMAT, /* output for `TeX' or `LaTeX' */
128 UNKNOWN_FORMAT /* output format still unknown */
131 int gnu_extensions = 1; /* trigger all GNU extensions */
132 int auto_reference = 0; /* references are `file_name:line_number:' */
133 int input_reference = 0; /* references at beginning of input lines */
134 int right_reference = 0; /* output references after right context */
135 int line_width = 72; /* output line width in characters */
136 int gap_size = 3; /* number of spaces between output fields */
137 const char *truncation_string = "/";
138 /* string used to mark line truncations */
139 const char *macro_name = "xx"; /* macro name for roff or TeX output */
140 enum Format output_format = UNKNOWN_FORMAT;
143 int ignore_case = 0; /* fold lower to upper case for sorting */
144 const char *context_regex_string = NULL;
145 /* raw regex for end of context */
146 const char *word_regex_string = NULL;
147 /* raw regex for a keyword */
148 const char *break_file = NULL; /* name of the `Break characters' file */
149 const char *only_file = NULL; /* name of the `Only words' file */
150 const char *ignore_file = NULL; /* name of the `Ignore words' file */
152 /* A BLOCK delimits a region in memory of arbitrary size, like the copy of a
153 whole file. A WORD is something smaller, its length should fit in a
154 short integer. A WORD_TABLE may contain several WORDs. */
158 char *start; /* pointer to beginning of region */
159 char *end; /* pointer to end + 1 of region */
165 char *start; /* pointer to beginning of region */
166 short size; /* length of the region */
172 WORD *start; /* array of WORDs */
173 size_t length; /* number of entries */
177 /* Pattern description tables. */
179 /* For each character, provide its folded equivalent. */
180 unsigned char folded_chars[CHAR_SET_SIZE];
182 /* For each character, indicate if it is part of a word. */
183 char syntax_table[CHAR_SET_SIZE];
184 char *re_syntax_table = syntax_table;
186 /* Compiled regex for end of context. */
187 struct re_pattern_buffer *context_regex;
189 /* End of context pattern register indices. */
190 struct re_registers context_regs;
192 /* Compiled regex for a keyword. */
193 struct re_pattern_buffer *word_regex;
195 /* Keyword pattern register indices. */
196 struct re_registers word_regs;
198 /* A word characters fastmap is used only when no word regexp has been
199 provided. A word is then made up of a sequence of one or more characters
200 allowed by the fastmap. Contains !0 if character allowed in word. Not
201 only is this faster in most cases, but it simplifies the implementation
202 of the Break files. */
203 char word_fastmap[CHAR_SET_SIZE];
205 /* Maximum length of any word read. */
206 int maximum_word_length;
208 /* Maximum width of any reference used. */
209 int reference_max_width;
212 /* Ignore and Only word tables. */
214 WORD_TABLE ignore_table; /* table of words to ignore */
215 WORD_TABLE only_table; /* table of words to select */
217 #define ALLOC_NEW_WORD(table) \
218 BUMP_ALLOC ((table)->start, (table)->length, 8, WORD)
220 /* Source text table, and scanning macros. */
222 int number_input_files; /* number of text input files */
223 int total_line_count; /* total number of lines seen so far */
224 const char **input_file_name; /* array of text input file names */
225 int *file_line_count; /* array of `total_line_count' values at end */
227 BLOCK text_buffer; /* file to study */
228 char *text_buffer_maxend; /* allocated end of text_buffer */
230 /* SKIP_NON_WHITE used only for getting or skipping the reference. */
232 #define SKIP_NON_WHITE(cursor, limit) \
233 while (cursor < limit && !isspace(*cursor)) \
236 #define SKIP_WHITE(cursor, limit) \
237 while (cursor < limit && isspace(*cursor)) \
240 #define SKIP_WHITE_BACKWARDS(cursor, start) \
241 while (cursor > start && isspace(cursor[-1])) \
244 #define SKIP_SOMETHING(cursor, limit) \
246 if (word_regex_string) \
249 count = re_match (word_regex, cursor, limit - cursor, 0, NULL); \
250 cursor += count <= 0 ? 1 : count; \
252 else if (word_fastmap[(unsigned char) *cursor]) \
253 while (cursor < limit && word_fastmap[(unsigned char) *cursor]) \
259 /* Occurrences table.
261 The `keyword' pointer provides the central word, which is surrounded
262 by a left context and a right context. The `keyword' and `length'
263 field allow full 8-bit characters keys, even including NULs. At other
264 places in this program, the name `keyafter' refers to the keyword
265 followed by its right context.
267 The left context does not extend, towards the beginning of the file,
268 further than a distance given by the `left' value. This value is
269 relative to the keyword beginning, it is usually negative. This
270 ensures that, except for white space, we will never have to backward
271 scan the source text, when it is time to generate the final output
274 The right context, indirectly attainable through the keyword end, does
275 not extend, towards the end of the file, further than a distance given
276 by the `right' value. This value is relative to the keyword
277 beginning, it is usually positive.
279 When automatic references are used, the `reference' value is the
280 overall line number in all input files read so far, in this case, it
281 is of type (int). When input references are used, the `reference'
282 value indicates the distance between the keyword beginning and the
283 start of the reference field, it is of type (DELTA) and usually
286 typedef short DELTA; /* to hold displacement within one context */
290 WORD key; /* description of the keyword */
291 DELTA left; /* distance to left context start */
292 DELTA right; /* distance to right context end */
293 int reference; /* reference descriptor */
297 /* The various OCCURS tables are indexed by the language. For the time
298 being, there is no such multiple language support. */
300 OCCURS *occurs_table[1]; /* all words retained from the read text */
301 size_t number_of_occurs[1]; /* number of used slots in occurs_table */
303 #define ALLOC_NEW_OCCURS(language) \
304 BUMP_ALLOC (occurs_table[language], number_of_occurs[language], 9, OCCURS)
307 /* Communication among output routines. */
309 /* Indicate if special output processing is requested for each character. */
310 char edited_flag[CHAR_SET_SIZE];
312 int half_line_width; /* half of line width, reference excluded */
313 int before_max_width; /* maximum width of before field */
314 int keyafter_max_width; /* maximum width of keyword-and-after field */
315 int truncation_string_length; /* length of string used to flag truncation */
317 /* When context is limited by lines, wraparound may happen on final output:
318 the `head' pointer gives access to some supplementary left context which
319 will be seen at the end of the output line, the `tail' pointer gives
320 access to some supplementary right context which will be seen at the
321 beginning of the output line. */
323 BLOCK tail; /* tail field */
324 int tail_truncation; /* flag truncation after the tail field */
326 BLOCK before; /* before field */
327 int before_truncation; /* flag truncation before the before field */
329 BLOCK keyafter; /* keyword-and-after field */
330 int keyafter_truncation; /* flag truncation after the keyafter field */
332 BLOCK head; /* head field */
333 int head_truncation; /* flag truncation before the head field */
335 BLOCK reference; /* reference field for input reference mode */
338 /* Miscellaneous routines. */
340 /*------------------------------------------------------.
341 | Duplicate string STRING, while evaluating \-escapes. |
342 `------------------------------------------------------*/
344 /* Loosely adapted from GNU shellutils printf.c code. */
/* Return a newly allocated copy of STRING in which every recognized
   backslash escape has been replaced by the character it denotes.

   Recognized escapes are \xHHH (at most three hexadecimal digits), \0OOO
   (at most three octal digits), \a, \b, \f, \n, \r, \t and \v.  The \c
   escape discards the rest of STRING.  A backslash followed by anything
   else, or an \x with no hexadecimal digit after it, is copied verbatim.
   A lone backslash ending STRING is also copied verbatim; the scan stops
   on the terminating NUL instead of stepping over it.

   The result is obtained from xmalloc and belongs to the caller.  */

char *
copy_unescaped_string (const char *string)
{
  char *result;			/* allocated result */
  char *cursor;			/* cursor in result */
  int value;			/* value of \nnn escape */
  int length;			/* length of \nnn escape */

  /* Unescaping never produces more characters than it consumes, so a
     buffer the size of the input is always sufficient.  */

  result = xmalloc (strlen (string) + 1);
  cursor = result;

  while (*string)
    {
      if (*string != '\\')
	{
	  *cursor++ = *string++;
	  continue;
	}

      /* Step over the backslash and look at the escape letter.  */

      string++;
      switch (*string)
	{
	case '\0':		/* backslash is the last character */
	  /* Keep the backslash, but leave STRING on the NUL so that the
	     enclosing loop terminates without reading past the end.  */
	  *cursor++ = '\\';
	  break;

	case 'x':		/* \xhhh escape, 3 chars maximum */
	  value = 0;
	  for (length = 0, string++;
	       length < 3 && ISXDIGIT (*string);
	       length++, string++)
	    value = value * 16 + HEXTOBIN (*string);
	  if (length == 0)
	    {
	      /* No digit followed: this was not an escape after all.  */
	      *cursor++ = '\\';
	      *cursor++ = 'x';
	    }
	  else
	    *cursor++ = value;
	  break;

	case '0':		/* \0ooo escape, 3 chars maximum */
	  value = 0;
	  for (length = 0, string++;
	       length < 3 && ISODIGIT (*string);
	       length++, string++)
	    value = value * 8 + OCTTOBIN (*string);
	  *cursor++ = value;
	  break;

	case 'a':		/* alert */
	  *cursor++ = '\a';
	  string++;
	  break;

	case 'b':		/* backspace */
	  *cursor++ = '\b';
	  string++;
	  break;

	case 'c':		/* cancel the rest of the output */
	  while (*string)
	    string++;
	  break;

	case 'f':		/* form feed */
	  *cursor++ = '\f';
	  string++;
	  break;

	case 'n':		/* new line */
	  *cursor++ = '\n';
	  string++;
	  break;

	case 'r':		/* carriage return */
	  *cursor++ = '\r';
	  string++;
	  break;

	case 't':		/* horizontal tab */
	  *cursor++ = '\t';
	  string++;
	  break;

	case 'v':		/* vertical tab */
	  *cursor++ = '\v';
	  string++;
	  break;

	default:		/* unknown escape, copy it unchanged */
	  *cursor++ = '\\';
	  *cursor++ = *string++;
	  break;
	}
    }

  *cursor = '\0';
  return result;
}
448 /*-------------------------------------------------------------------.
449 | Compile the regex represented by STRING, diagnose and abort if any |
450 | error. Returns the compiled regex structure. |
451 `-------------------------------------------------------------------*/
453 struct re_pattern_buffer *
454 alloc_and_compile_regex (const char *string)
456 struct re_pattern_buffer *pattern; /* newly allocated structure */
457 const char *message; /* error message returned by regex.c */
459 pattern = (struct re_pattern_buffer *)
460 xmalloc (sizeof (struct re_pattern_buffer));
461 memset (pattern, 0, sizeof (struct re_pattern_buffer));
463 pattern->buffer = NULL;
464 pattern->allocated = 0;
465 pattern->translate = ignore_case ? (char *) folded_chars : NULL;
466 pattern->fastmap = (char *) xmalloc (CHAR_SET_SIZE);
468 message = re_compile_pattern (string, strlen (string), pattern);
470 error (1, 0, "%s (for regexp `%s')", message, string);
472 /* The fastmap should be compiled before `re_match'. The following
473 call is not mandatory, because `re_search' is always called sooner,
474 and it compiles the fastmap if this has not been done yet. */
476 re_compile_fastmap (pattern);
478 /* Do not waste extra allocated space. */
480 if (pattern->allocated > pattern->used)
483 = (unsigned char *) xrealloc (pattern->buffer, pattern->used);
484 pattern->allocated = pattern->used;
490 /*------------------------------------------------------------------------.
491 | This will initialize various tables for pattern match and compiles some |
493 `------------------------------------------------------------------------*/
496 initialize_regex (void)
498 int character; /* character value */
500 /* Initialize the regex syntax table. */
502 for (character = 0; character < CHAR_SET_SIZE; character++)
503 syntax_table[character] = isalpha (character) ? Sword : 0;
505 /* Initialize the case folding table. */
508 for (character = 0; character < CHAR_SET_SIZE; character++)
509 folded_chars[character] = toupper (character);
511 /* Unless the user already provided a description of the end of line or
512 end of sentence sequence, select an end of line sequence to compile.
513 If the user provided an empty definition, thus disabling end of line
514 or sentence feature, make it NULL to speed up tests. If GNU
515 extensions are enabled, use end of sentence like in GNU emacs. If
516 disabled, use end of lines. */
518 if (context_regex_string)
520 if (!*context_regex_string)
521 context_regex_string = NULL;
523 else if (gnu_extensions && !input_reference)
524 context_regex_string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
526 context_regex_string = "\n";
528 if (context_regex_string)
529 context_regex = alloc_and_compile_regex (context_regex_string);
531 /* If the user has already provided a non-empty regexp to describe
532 words, compile it. Else, unless this has already been done through
533 a user provided Break character file, construct a fastmap of
534 characters that may appear in a word. If GNU extensions enabled,
535 include only letters of the underlying character set. If disabled,
536 include almost everything, even punctuations; stop only on white
539 if (word_regex_string && *word_regex_string)
540 word_regex = alloc_and_compile_regex (word_regex_string);
541 else if (!break_file)
547 for (character = 0; character < CHAR_SET_SIZE; character++)
548 word_fastmap[character] = isalpha (character);
553 /* Simulate [^ \t\n]+. */
555 memset (word_fastmap, 1, CHAR_SET_SIZE);
556 word_fastmap[' '] = 0;
557 word_fastmap['\t'] = 0;
558 word_fastmap['\n'] = 0;
562 /*------------------------------------------------------------------------.
563 | This routine will attempt to swallow a whole file name FILE_NAME into a |
564 | contiguous region of memory and return a description of it into BLOCK. |
565 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
567 | Previously, in some cases, white space compression was attempted while |
568 | inputting text. This was defeating some regexps like default end of |
569 | sentence, which checks for two consecutive spaces. If white space |
570 | compression is ever reinstated, it should be in output routines. |
571 `------------------------------------------------------------------------*/
574 swallow_file_in_memory (const char *file_name, BLOCK *block)
576 int file_handle; /* file descriptor number */
577 struct stat stat_block; /* stat block for file */
578 int allocated_length; /* allocated length of memory buffer */
579 int used_length; /* used length in memory buffer */
580 int read_length; /* number of character gotten on last read */
582 /* As special cases, a file name which is NULL or "-" indicates standard
583 input, which is already opened. In all other cases, open the file from
586 if (!file_name || !*file_name || strcmp (file_name, "-") == 0)
587 file_handle = fileno (stdin);
589 if ((file_handle = open (file_name, O_RDONLY)) < 0)
590 error (1, errno, file_name);
592 /* If the file is a plain, regular file, allocate the memory buffer all at
593 once and swallow the file in one blow. In other cases, read the file
594 repeatedly in smaller chunks until we have it all, reallocating memory
595 once in a while, as we go. */
597 if (fstat (file_handle, &stat_block) < 0)
598 error (1, errno, file_name);
600 if (S_ISREG (stat_block.st_mode))
602 block->start = (char *) xmalloc ((int) stat_block.st_size);
604 if (read (file_handle, block->start, (int) stat_block.st_size)
605 != stat_block.st_size)
606 error (1, errno, file_name);
608 block->end = block->start + stat_block.st_size;
612 block->start = (char *) xmalloc (1 << SWALLOW_REALLOC_LOG);
614 allocated_length = (1 << SWALLOW_REALLOC_LOG);
616 while ((read_length = read (file_handle,
617 block->start + used_length,
618 allocated_length - used_length)) > 0)
620 used_length += read_length;
621 if (used_length == allocated_length)
623 allocated_length += (1 << SWALLOW_REALLOC_LOG);
625 = (char *) xrealloc (block->start, allocated_length);
630 error (1, errno, file_name);
632 block->end = block->start + used_length;
635 /* Close the file, but only if it was not the standard input. */
637 if (file_handle != fileno (stdin))
641 /* Sort and search routines. */
643 /*--------------------------------------------------------------------------.
644 | Compare two words, FIRST and SECOND, and return 0 if they are identical. |
645 | Return less than 0 if the first word goes before the second; return |
646 | greater than 0 if the first word goes after the second. |
648 | If a word is indeed a prefix of the other, the shorter should go first. |
649 `--------------------------------------------------------------------------*/
652 compare_words (const void *void_first, const void *void_second)
654 #define first ((WORD *) void_first)
655 #define second ((WORD *) void_second)
656 int length; /* minimum of two lengths */
657 int counter; /* cursor in words */
658 int value; /* value of comparison */
660 length = first->size < second->size ? first->size : second->size;
664 for (counter = 0; counter < length; counter++)
666 value = (folded_chars [(unsigned char) (first->start[counter])]
667 - folded_chars [(unsigned char) (second->start[counter])]);
674 for (counter = 0; counter < length; counter++)
676 value = ((unsigned char) first->start[counter]
677 - (unsigned char) second->start[counter]);
683 return first->size - second->size;
688 /*-----------------------------------------------------------------------.
689 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
690 | go first. In case of a tie, preserve the original order through a |
691 | pointer comparison. |
692 `-----------------------------------------------------------------------*/
695 compare_occurs (const void *void_first, const void *void_second)
697 #define first ((OCCURS *) void_first)
698 #define second ((OCCURS *) void_second)
701 value = compare_words (&first->key, &second->key);
702 return value == 0 ? first->key.start - second->key.start : value;
707 /*------------------------------------------------------------.
708 | Return !0 if WORD appears in TABLE. Uses a binary search. |
709 `------------------------------------------------------------*/
712 search_table (WORD *word, WORD_TABLE *table)
714 int lowest; /* current lowest possible index */
715 int highest; /* current highest possible index */
716 int middle; /* current middle index */
717 int value; /* value from last comparison */
720 highest = table->length - 1;
721 while (lowest <= highest)
723 middle = (lowest + highest) / 2;
724 value = compare_words (word, table->start + middle);
726 highest = middle - 1;
735 /*---------------------------------------------------------------------.
736 | Sort the whole occurs table in memory. Presumably, `qsort' does not |
737 | take intermediate copies of table elements, so the sort will be |
738 | stabilized throughout the comparison routine. |
739 `---------------------------------------------------------------------*/
742 sort_found_occurs (void)
745 /* Only one language for the time being. */
747 qsort (occurs_table[0], number_of_occurs[0], sizeof (OCCURS),
751 /* Parameter files reading routines. */
753 /*----------------------------------------------------------------------.
754 | Read a file named FILE_NAME, containing a set of break characters. |
755 | Build a content to the array word_fastmap in which all characters are |
756 | allowed except those found in the file. Characters may be repeated. |
757 `----------------------------------------------------------------------*/
760 digest_break_file (const char *file_name)
762 BLOCK file_contents; /* to receive a copy of the file */
763 char *cursor; /* cursor in file copy */
765 swallow_file_in_memory (file_name, &file_contents);
767 /* Make the fastmap and record the file contents in it. */
769 memset (word_fastmap, 1, CHAR_SET_SIZE);
770 for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
771 word_fastmap[(unsigned char) *cursor] = 0;
776 /* If GNU extensions are enabled, the only way to avoid newline as
777 a break character is to write all the break characters in the
778 file with no newline at all, not even at the end of the file.
779 If disabled, spaces, tabs and newlines are always considered as
780 break characters even if not included in the break file. */
782 word_fastmap[' '] = 0;
783 word_fastmap['\t'] = 0;
784 word_fastmap['\n'] = 0;
787 /* Return the space of the file, which is no more required. */
789 free (file_contents.start);
792 /*-----------------------------------------------------------------------.
793 | Read a file named FILE_NAME, containing one word per line, then |
794 | construct in TABLE a table of WORD descriptors for them. The routine |
795 | swallows the whole file in memory; this is at the expense of space |
796 | needed for newlines, which are useless; however, the reading is fast. |
797 `-----------------------------------------------------------------------*/
800 digest_word_file (const char *file_name, WORD_TABLE *table)
802 BLOCK file_contents; /* to receive a copy of the file */
803 char *cursor; /* cursor in file copy */
804 char *word_start; /* start of the current word */
806 swallow_file_in_memory (file_name, &file_contents);
811 /* Read the whole file. */
813 cursor = file_contents.start;
814 while (cursor < file_contents.end)
817 /* Read one line, and save the word in contains. */
820 while (cursor < file_contents.end && *cursor != '\n')
823 /* Record the word in table if it is not empty. */
825 if (cursor > word_start)
827 ALLOC_NEW_WORD (table);
828 table->start[table->length].start = word_start;
829 table->start[table->length].size = cursor - word_start;
833 /* This test allows for an incomplete line at end of file. */
835 if (cursor < file_contents.end)
839 /* Finally, sort all the words read. */
841 qsort (table->start, table->length, (size_t) sizeof (WORD), compare_words);
845 /* Keyword recognition and selection. */
847 /*----------------------------------------------------------------------.
848 | For each keyword in the source text, constructs an OCCURS structure. |
849 `----------------------------------------------------------------------*/
/* Scan `text_buffer' from start to end, one context (line or sentence) at
   a time, and append one OCCURS entry to occurs_table[0] for each keyword
   retained, incrementing number_of_occurs[0] accordingly.

   Contexts are delimited with `re_search' on `context_regex' when
   `context_regex_string' is set; otherwise the rest of the buffer is a
   single context.  Words are located either with `re_search' on
   `word_regex' or with `word_fastmap'.  Every non-empty word updates
   `maximum_word_length'; a word is then dropped if it lies within an
   input reference, is found in `ignore_table', or is absent from
   `only_table', when the matching option is in use.  `reference_max_width'
   is raised as needed in input reference mode.  */
852 find_occurs_in_text (void)
854 char *cursor; /* for scanning the source text */
855 char *scan; /* for scanning the source text also */
856 char *line_start; /* start of the current input line */
857 char *line_scan; /* newlines scanned until this point */
858 int reference_length; /* length of reference in input mode */
859 WORD possible_key; /* possible key, to ease searches */
860 OCCURS *occurs_cursor; /* current OCCURS under construction */
862 char *context_start; /* start of left context */
863 char *context_end; /* end of right context */
864 char *word_start; /* start of word */
865 char *word_end; /* end of word */
866 char *next_context_start; /* next start of left context */
868 /* reference_length is always used within `if (input_reference)'.
869 However, GNU C diagnoses that it may be used uninitialized. The
870 following assignment is merely to shut it up. */
872 reference_length = 0;
874 /* Tracking where lines start is helpful for reference processing. In
875 auto reference mode, this allows counting lines. In input reference
876 mode, this permits finding the beginning of the references.
878 The first line begins with the file, skip immediately this very first
879 reference in input reference mode, to help further rejection of any word
880 found inside it. Also, unconditionally assigning these variables has
881 the happy effect of shutting up lint. */
883 line_start = text_buffer.start;
884 line_scan = line_start;
887 SKIP_NON_WHITE (line_scan, text_buffer.end);
888 reference_length = line_scan - line_start;
889 SKIP_WHITE (line_scan, text_buffer.end);
892 /* Process the whole buffer, one line or one sentence at a time. */
894 for (cursor = text_buffer.start;
895 cursor < text_buffer.end;
896 cursor = next_context_start)
899 /* `context_start' gets initialized before the processing of each
900 line, or once for the whole buffer if no end of line or sentence
901 sequence separator. */
903 context_start = cursor;
905 /* If an end of line or end of sentence sequence is defined and
906 non-empty, `next_context_start' will be recomputed to be the end of
907 each line or sentence, before each one is processed. If no such
908 sequence, then `next_context_start' is set at the end of the whole
909 buffer, which is then considered to be a single line or sentence.
910 This test also accounts for the case of an incomplete line or
911 sentence at the end of the buffer. */
913 if (context_regex_string
914 && (re_search (context_regex, cursor, text_buffer.end - cursor,
915 0, text_buffer.end - cursor, &context_regs)
917 next_context_start = cursor + context_regs.end[0];
920 next_context_start = text_buffer.end;
922 /* Include the separator into the right context, but not any suffix
923 white space in this separator; this ensures it will be seen in
924 output and will not take more space than necessary. */
926 context_end = next_context_start;
927 SKIP_WHITE_BACKWARDS (context_end, context_start);
929 /* Read and process a single input line or sentence, one word at a
936 /* If a word regexp has been compiled, use it to skip at the
937 beginning of the next word. If there is no such word, exit
941 if (re_search (word_regex, cursor, context_end - cursor,
942 0, context_end - cursor, &word_regs)
945 word_start = cursor + word_regs.start[0];
946 word_end = cursor + word_regs.end[0];
950 /* Avoid re_search and use the fastmap to skip to the
951 beginning of the next word. If there is no more word in
952 the buffer, exit the loop. */
956 while (scan < context_end
957 && !word_fastmap[(unsigned char) *scan])
960 if (scan == context_end)
965 while (scan < context_end
966 && word_fastmap[(unsigned char) *scan])
972 /* Skip right to the beginning of the found word. */
976 /* Skip any zero length word. Just advance a single position,
977 then go fetch the next word. */
979 if (word_end == word_start)
985 /* This is a genuine, non empty word, so save it as a possible
986 key. Then skip over it. Also, maintain the maximum length of
987 all words read so far. It is mandatory to take the maximum
988 length of all words in the file, without considering if they
989 are actually kept or rejected, because backward jumps at output
990 generation time may fall in *any* word. */
/* NOTE(review): the `size' member of WORD is a short, so the word length
   stored below is narrowed; a word longer than a short can hold would be
   recorded with a wrong length -- confirm whether such input can occur. */
992 possible_key.start = cursor;
993 possible_key.size = word_end - word_start;
994 cursor += possible_key.size;
996 if (possible_key.size > maximum_word_length)
997 maximum_word_length = possible_key.size;
999 /* In input reference mode, update `line_start' from its previous
1000 value. Count the lines just in case auto reference mode is
1001 also selected. If it happens that the word just matched is
1002 indeed part of a reference; just ignore it. */
1004 if (input_reference)
1006 while (line_scan < possible_key.start)
1007 if (*line_scan == '\n')
1011 line_start = line_scan;
1012 SKIP_NON_WHITE (line_scan, text_buffer.end);
1013 reference_length = line_scan - line_start;
1017 if (line_scan > possible_key.start)
1021 /* Ignore the word if an `Ignore words' table exists and if it is
1022 part of it. Also ignore the word if an `Only words' table and
1023 if it is *not* part of it.
1025 It is allowed that both tables be used at once, even if this
1026 may look strange for now. Just ignore a word that would appear
1027 in both. If regexps are eventually implemented for these
1028 tables, the Ignore table could then reject words that would
1029 have been previously accepted by the Only table. */
1031 if (ignore_file && search_table (&possible_key, &ignore_table))
1033 if (only_file && !search_table (&possible_key, &only_table))
1036 /* A non-empty word has been found. First of all, ensure
1037 proper allocation of the next OCCURS, and make a pointer to
1038 where it will be constructed. */
1040 ALLOC_NEW_OCCURS (0);
1041 occurs_cursor = occurs_table[0] + number_of_occurs[0];
1043 /* Define the reference field, if any. */
1048 /* While auto referencing, update `line_start' from its
1049 previous value, counting lines as we go. If input
1050 referencing at the same time, `line_start' has been
1051 advanced earlier, and the following loop is never really
1054 while (line_scan < possible_key.start)
1055 if (*line_scan == '\n')
1059 line_start = line_scan;
1060 SKIP_NON_WHITE (line_scan, text_buffer.end);
1065 occurs_cursor->reference = total_line_count;
1067 else if (input_reference)
1070 /* If only input referencing, `line_start' has been computed
1071 earlier to detect the case the word matched would be part
1072 of the reference. The reference position is simply the
1073 value of `line_start'. */
1075 occurs_cursor->reference
1076 = (DELTA) (line_start - possible_key.start);
1077 if (reference_length > reference_max_width)
1078 reference_max_width = reference_length;
1081 /* Exclude the reference from the context in simple cases. */
1083 if (input_reference && line_start == context_start)
1085 SKIP_NON_WHITE (context_start, context_end);
1086 SKIP_WHITE (context_start, context_end);
1089 /* Completes the OCCURS structure. */
/* NOTE(review): `left' and `right' are DELTA, a short.  When no context
   regexp is active the context is the whole buffer, so these distances
   may presumably exceed what a short can hold -- confirm how large
   inputs are meant to be handled. */
1091 occurs_cursor->key = possible_key;
1092 occurs_cursor->left = context_start - possible_key.start;
1093 occurs_cursor->right = context_end - possible_key.start;
1095 number_of_occurs[0]++;
1100 /* Formatting and actual output - service routines. */
1102 /*-----------------------------------------.
1103 | Prints some NUMBER of spaces on stdout. |
1104 `-----------------------------------------*/
void
print_spaces (int number)
{
  /* Emit one blank per unit of NUMBER; zero or less prints nothing.  */

  while (number-- > 0)
    putchar (' ');
}
1115 /*-------------------------------------.
1116 | Prints the field provided by FIELD. |
1117 `-------------------------------------*/
/* Write on stdout the characters of FIELD, from `field.start' up to but
   excluding `field.end'.  Each character is looked up in `edited_flag';
   a flagged character gets special treatment.  When `todiac' reports a
   diacritic and the output format is TEX_FORMAT, the base letter given
   by `tobase' is printed inside the TeX sequence for that diacritic.
   Other flagged characters get the roff or TeX specific treatment
   described by the comments below; the remaining ones produce a single
   space.
   NOTE(review): unflagged characters are presumably copied unchanged;
   the statement doing so is not visible here -- confirm. */
1120 print_field (BLOCK field)
1122 char *cursor; /* Cursor in field to print */
1123 int character; /* Current character */
1124 int base; /* Base character, without diacritic */
1125 int diacritic; /* Diacritic code for the character */
1127 /* Whitespace is not really compressed. Instead, each white space
1128 character (tab, vt, ht etc.) is printed as one single space. */
1130 for (cursor = field.start; cursor < field.end; cursor++)
1132 character = (unsigned char) *cursor;
1133 if (edited_flag[character])
1136 /* First check if this is a diacriticized character.
1138 This works only for TeX. I do not know how diacriticized
1139 letters work with `roff'. Please someone explain it to me! */
1141 diacritic = todiac (character);
1142 if (diacritic != 0 && output_format == TEX_FORMAT)
1144 base = tobase (character);
1148 case 1: /* Latin diphthongs */
1172 case 2: /* Acute accent */
1173 printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1176 case 3: /* Grave accent */
1177 printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
1180 case 4: /* Circumflex accent */
1181 printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1184 case 5: /* Diaeresis */
1185 printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1188 case 6: /* Tilde accent */
1189 printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1192 case 7: /* Cedilla */
1193 printf ("\\c{%c}", base);
1196 case 8: /* Small circle beneath */
1212 case 9: /* Strike through */
1231 /* This is not a diacritic character, so handle cases which are
1232 really specific to `roff' or TeX. All white space processing
1233 is done as the default case of this switch. */
1238 /* In roff output format, double any quote. */
1248 /* In TeX output format, precede these with a backslash. */
1250 putchar (character);
1255 /* In TeX output format, precede these with a backslash and
1256 force mathematical mode. */
1257 printf ("$\\%c$", character);
1261 /* In TeX output mode, request production of a backslash. */
1262 printf ("\\backslash{}");
1266 /* Any other flagged character produces a single space. */
1276 /* Formatting and actual output - planning routines. */
1278 /*--------------------------------------------------------------------.
1279 | From information collected from command line options and input file |
1280 | readings, compute and fix some output parameter values. |
1281 `--------------------------------------------------------------------*/
/* fix_output_parameters: derive the layout values used by the formatting
   routines from the options and the per-file line counts gathered so far.
   As visible here, it:
     - computes reference_max_width from the widest combination of input
       file name and printed line count, adds one separator column, and
       allocates reference.start with reference_max_width + 1 bytes;
     - takes reference_max_width + gap_size off line_width when a
       reference is in use and right_reference is not set;
     - derives half_line_width, before_max_width and keyafter_max_width
       from line_width, then reserves room for truncation marks;
     - fills edited_flag[] with the characters needing special output
       processing (white space, form feed, format-specific characters and
       bytes from 0200 up, the latter flagged only when todiac () yields
       a nonzero code).
   ordinal_string holds 12 bytes, which is enough for a 32-bit int
   printed with %d plus the terminating NUL.
   NOTE(review): this listing omits lines (the embedded numbers skip), so
   the conditions guarding several statements -- for instance whatever
   protects the file_line_count[file_index - 1] access when file_index is
   zero -- and the case labels of the format switch are not visible here;
   confirm against the complete source.  */
1284 fix_output_parameters (void)
1286 int file_index; /* index in text input file arrays */
1287 int line_ordinal; /* line ordinal value for reference */
1288 char ordinal_string[12]; /* edited line ordinal for reference */
1289 int reference_width; /* width for the whole reference */
1290 int character; /* character ordinal */
1291 const char *cursor; /* cursor in some constant strings */
1293 /* In auto reference mode, the maximum width of this field is
1294 precomputed and subtracted from the overall line width. Add one for
1295 the column which separates the file name from the line number. */
1299 reference_max_width = 0;
1300 for (file_index = 0; file_index < number_input_files; file_index++)
1302 line_ordinal = file_line_count[file_index] + 1;
1304 line_ordinal -= file_line_count[file_index - 1];
1305 sprintf (ordinal_string, "%d", line_ordinal);
1306 reference_width = strlen (ordinal_string);
1307 if (input_file_name[file_index])
1308 reference_width += strlen (input_file_name[file_index]);
1309 if (reference_width > reference_max_width)
1310 reference_max_width = reference_width;
1312 reference_max_width++;
1313 reference.start = (char *) xmalloc (reference_max_width + 1);
1316 /* If the reference appears to the left of the output line, reserve some
1317 space for it right away, including one gap size. */
1319 if ((auto_reference || input_reference) && !right_reference)
1320 line_width -= reference_max_width + gap_size;
1322 /* The output lines, minimally, will contain from left to right a left
1323 context, a gap, and a keyword followed by the right context with no
1324 special intervening gap. Half of the line width is dedicated to the
1325 left context and the gap, the other half is dedicated to the keyword
1326 and the right context; these values are computed once and for all here.
1327 There also are tail and head wrap around fields, used when the keyword
1328 is near the beginning or the end of the line, or when some long word
1329 cannot fit in, but leave place from wrapped around shorter words. The
1330 maximum width of these fields are recomputed separately for each line,
1331 on a case by case basis. It is worth noting that it cannot happen that
1332 both the tail and head fields are used at once. */
1334 half_line_width = line_width / 2;
1335 before_max_width = half_line_width - gap_size;
1336 keyafter_max_width = half_line_width;
1338 /* If truncation_string is the empty string, make it NULL to speed up
1339 tests. In this case, truncation_string_length will never get used, so
1340 there is no need to set it. */
1342 if (truncation_string && *truncation_string)
1343 truncation_string_length = strlen (truncation_string);
1345 truncation_string = NULL;
1350 /* When flagging truncation at the left of the keyword, the
1351 truncation mark goes at the beginning of the before field,
1352 unless there is a head field, in which case the mark goes at the
1353 left of the head field. When flagging truncation at the right
1354 of the keyword, the mark goes at the end of the keyafter field,
1355 unless there is a tail field, in which case the mark goes at the
1356 end of the tail field. Only eight combination cases could arise
1357 for truncation marks:
1360 . One beginning the before field.
1361 . One beginning the head field.
1362 . One ending the keyafter field.
1363 . One ending the tail field.
1364 . One beginning the before field, another ending the keyafter field.
1365 . One ending the tail field, another beginning the before field.
1366 . One ending the keyafter field, another beginning the head field.
1368 So, there are at most two truncation marks, which could appear both
1369 on the left side of the center of the output line, both on the
1370 right side, or one on either side. */
1372 before_max_width -= 2 * truncation_string_length;
1373 keyafter_max_width -= 2 * truncation_string_length;
1378 /* I never figured out exactly how UNIX' ptx plans the output width
1379 of its various fields. If GNU extensions are disabled, do not
1380 try computing the field widths correctly; instead, use the
1381 following formula, which does not completely imitate UNIX' ptx,
1384 keyafter_max_width -= 2 * truncation_string_length + 1;
1387 /* Compute which characters need special output processing. Initialize
1388 by flagging any white space character. Some systems do not consider
1389 form feed as a space character, but we do. */
1391 for (character = 0; character < CHAR_SET_SIZE; character++)
1392 edited_flag[character] = isspace (character);
1393 edited_flag['\f'] = 1;
1395 /* Complete the special character flagging according to selected output
1398 switch (output_format)
1400 case UNKNOWN_FORMAT:
1401 /* Should never happen. */
1408 /* `Quote' characters should be doubled. */
1410 edited_flag['"'] = 1;
1415 /* Various characters need special processing. */
1417 for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1418 edited_flag[*cursor] = 1;
1420 /* Any character with 8th bit set will print to a single space, unless
1421 it is diacriticized. */
1423 for (character = 0200; character < CHAR_SET_SIZE; character++)
1424 edited_flag[character] = todiac (character) != 0;
1429 /*------------------------------------------------------------------.
1430 | Compute the position and length of all the output fields, given a |
1431 | pointer to some OCCURS. |
1432 `------------------------------------------------------------------*/
/* define_all_fields (OCCURS): for one keyword occurrence, compute the
   text spans and truncation flags that the output routines later print:
   keyafter, before, tail, head and reference, together with
   keyafter_truncation, before_truncation, tail_truncation and
   head_truncation.  The spans are located from occurs->key.start, the
   key size and the occurs->left / occurs->right offsets, and their
   widths are limited by keyafter_max_width, before_max_width, gap_size
   and half_line_width.  In the first reference branch the reference is
   formatted with sprintf as FILE:LINE into the buffer at
   reference.start; in the input_reference branch it instead points into
   the text at keyafter.start + occurs->reference and runs up to the next
   white space.
   NOTE(review): that sprintf is unbounded; presumably the buffer sized
   in fix_output_parameters is always large enough -- confirm.  This
   listing also omits lines (the embedded numbers skip), so several
   assignment targets, else branches and guards (such as the one around
   file_line_count[file_index - 1]) are not visible here.  */
1435 define_all_fields (OCCURS *occurs)
1437 int tail_max_width; /* allowable width of tail field */
1438 int head_max_width; /* allowable width of head field */
1439 char *cursor; /* running cursor in source text */
1440 char *left_context_start; /* start of left context */
1441 char *right_context_end; /* end of right context */
1442 char *left_field_start; /* conservative start for `head'/`before' */
1443 int file_index; /* index in text input file arrays */
1444 const char *file_name; /* file name for reference */
1445 int line_ordinal; /* line ordinal for reference */
1447 /* Define `keyafter', start of left context and end of right context.
1448 `keyafter' starts at the saved position for keyword and extend to the
1449 right from the end of the keyword, eating separators or full words, but
1450 not beyond maximum allowed width for `keyafter' field or limit for the
1451 right context. Suffix spaces will be removed afterwards. */
1453 keyafter.start = occurs->key.start;
1454 keyafter.end = keyafter.start + occurs->key.size;
1455 left_context_start = keyafter.start + occurs->left;
1456 right_context_end = keyafter.start + occurs->right;
1458 cursor = keyafter.end;
1459 while (cursor < right_context_end
1460 && cursor <= keyafter.start + keyafter_max_width)
1462 keyafter.end = cursor;
1463 SKIP_SOMETHING (cursor, right_context_end);
1465 if (cursor <= keyafter.start + keyafter_max_width)
1466 keyafter.end = cursor;
1468 keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1470 SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1472 /* When the left context is wide, it might take some time to catch up from
1473 the left context boundary to the beginning of the `head' or `before'
1474 fields. So, in this case, to speed the catchup, we jump back from the
1475 keyword, using some secure distance, possibly falling in the middle of
1476 a word. A secure backward jump would be at least half the maximum
1477 width of a line, plus the size of the longest word met in the whole
1478 input. We conclude this backward jump by a skip forward of at least
1479 one word. In this manner, we should not inadvertently accept only part
1480 of a word. From the reached point, when it will be time to fix the
1481 beginning of `head' or `before' fields, we will skip forward words or
1482 delimiters until we get sufficiently near. */
1484 if (-occurs->left > half_line_width + maximum_word_length)
1487 = keyafter.start - (half_line_width + maximum_word_length);
1488 SKIP_SOMETHING (left_field_start, keyafter.start);
1491 left_field_start = keyafter.start + occurs->left;
1493 /* `before' certainly ends at the keyword, but not including separating
1494 spaces. It starts at or after the saved value for the left context, by
1495 advancing it until it falls inside the maximum allowed width for the
1496 before field. There will be no prefix spaces either. `before' only
1497 advances by skipping single separators or whole words. */
1499 before.start = left_field_start;
1500 before.end = keyafter.start;
1501 SKIP_WHITE_BACKWARDS (before.end, before.start);
1503 while (before.start + before_max_width < before.end)
1504 SKIP_SOMETHING (before.start, before.end);
1506 if (truncation_string)
1508 cursor = before.start;
1509 SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
1510 before_truncation = cursor > left_context_start;
1513 before_truncation = 0;
1515 SKIP_WHITE (before.start, text_buffer.end);
1517 /* The tail could not take more columns than what has been left in the
1518 left context field, and a gap is mandatory. It starts after the
1519 right context, and does not contain prefixed spaces. It ends at
1520 the end of line, the end of buffer or when the tail field is full,
1521 whichever comes first. It cannot contain only part of a word, and
1522 has no suffixed spaces. */
1525 = before_max_width - (before.end - before.start) - gap_size;
1527 if (tail_max_width > 0)
1529 tail.start = keyafter.end;
1530 SKIP_WHITE (tail.start, text_buffer.end);
1532 tail.end = tail.start;
1534 while (cursor < right_context_end
1535 && cursor < tail.start + tail_max_width)
1538 SKIP_SOMETHING (cursor, right_context_end);
1541 if (cursor < tail.start + tail_max_width)
1544 if (tail.end > tail.start)
1546 keyafter_truncation = 0;
1547 tail_truncation = truncation_string && tail.end < right_context_end;
1550 tail_truncation = 0;
1552 SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1557 /* No place left for a tail field. */
1561 tail_truncation = 0;
1564 /* `head' could not take more columns than what has been left in the right
1565 context field, and a gap is mandatory. It ends before the left
1566 context, and does not contain suffixed spaces. Its pointer is advanced
1567 until the head field has shrunk to its allowed width. It cannot
1568 contain only part of a word, and has no suffixed spaces. */
1571 = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1573 if (head_max_width > 0)
1575 head.end = before.start;
1576 SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
1578 head.start = left_field_start;
1579 while (head.start + head_max_width < head.end)
1580 SKIP_SOMETHING (head.start, head.end);
1582 if (head.end > head.start)
1584 before_truncation = 0;
1585 head_truncation = (truncation_string
1586 && head.start > left_context_start);
1589 head_truncation = 0;
1591 SKIP_WHITE (head.start, head.end);
1596 /* No place left for a head field. */
1600 head_truncation = 0;
1606 /* Construct the reference text in preallocated space from the file
1607 name and the line number. Find out in which file the reference
1608 occurred. Standard input yields an empty file name. Ensure line
1609 numbers are one based, even if they are computed zero based. */
1612 while (file_line_count[file_index] < occurs->reference)
1615 file_name = input_file_name[file_index];
1619 line_ordinal = occurs->reference + 1;
1621 line_ordinal -= file_line_count[file_index - 1];
1623 sprintf (reference.start, "%s:%d", file_name, line_ordinal);
1624 reference.end = reference.start + strlen (reference.start);
1626 else if (input_reference)
1629 /* Reference starts at saved position for reference and extends right
1630 until some white space is met. */
1632 reference.start = keyafter.start + (DELTA) occurs->reference;
1633 reference.end = reference.start;
1634 SKIP_NON_WHITE (reference.end, right_context_end);
1639 /* Formatting and actual output - control routines. */
1641 /*----------------------------------------------------------------------.
1642 | Output the current output fields as one line for `troff' or `nroff'. |
1643 `----------------------------------------------------------------------*/
/* output_one_roff_line: print the current output fields as one request
   line for troff or nroff.  The line begins with a dot, macro_name, a
   space and an opening double quote; the fields then follow in the
   order tail, before, keyafter, head and, when auto_reference or
   input_reference is set, reference.  truncation_string is printed next
   to a field whenever the matching *_truncation flag is nonzero.
   Reads only file-level state set up by define_all_fields; takes no
   arguments.
   NOTE(review): this listing omits lines (the embedded numbers skip), so
   the print_field calls for tail and head, the separators between
   fields and the end-of-line output are not visible here.  */
1646 output_one_roff_line (void)
1648 /* Output the `tail' field. */
1650 printf (".%s \"", macro_name);
1652 if (tail_truncation)
1653 printf ("%s", truncation_string);
1656 /* Output the `before' field. */
1659 if (before_truncation)
1660 printf ("%s", truncation_string);
1661 print_field (before);
1664 /* Output the `keyafter' field. */
1667 print_field (keyafter);
1668 if (keyafter_truncation)
1669 printf ("%s", truncation_string);
1672 /* Output the `head' field. */
1675 if (head_truncation)
1676 printf ("%s", truncation_string);
1680 /* Conditionally output the `reference' field. */
1682 if (auto_reference || input_reference)
1685 print_field (reference);
1692 /*---------------------------------------------------------.
1693 | Output the current output fields as one line for `TeX'. |
1694 `---------------------------------------------------------*/
/* output_one_tex_line: print the current output fields as one TeX macro
   call.  The line begins with a backslash, macro_name and a space.  The
   keyafter span is split in two local blocks: key starts at
   keyafter.start, and after runs from the point reached by one
   SKIP_SOMETHING step over keyafter up to keyafter.end.  The before and
   after fields are printed with print_field, as is reference when
   auto_reference or input_reference is set.
   NOTE(review): this listing omits lines (the embedded numbers skip), so
   the assignment of key.end, the printing of tail, key and head, and
   the grouping characters placed around each field are not visible
   here.  */
1697 output_one_tex_line (void)
1699 BLOCK key; /* key field, isolated */
1700 BLOCK after; /* after field, isolated */
1701 char *cursor; /* running cursor in source text */
1703 printf ("\\%s ", macro_name);
1707 print_field (before);
1709 key.start = keyafter.start;
1710 after.end = keyafter.end;
1711 cursor = keyafter.start;
1712 SKIP_SOMETHING (cursor, keyafter.end);
1714 after.start = cursor;
1717 print_field (after);
1721 if (auto_reference || input_reference)
1724 print_field (reference);
1730 /*-------------------------------------------------------------------.
1731 | Output the current output fields as one line for a dumb terminal. |
1732 `-------------------------------------------------------------------*/
/* output_one_dumb_line: print the current output fields as one line for
   a dumb terminal, padding with print_spaces so that columns line up.
   Visible layout, left to right: the reference padded to
   reference_max_width when right_reference is not set; the tail field
   (or blank padding) sharing the left half with the before field; a gap
   of gap_size columns; the keyafter field; the head field right-aligned
   within the right half; and, when right_reference is set and a
   reference is in use, padding, a gap and the reference.  Every padding
   amount subtracts the printed field lengths and, where the matching
   *_truncation flag is set, truncation_string_length.
   print_spaces emits nothing for a zero or negative count, so a negative
   padding amount here simply yields no padding.
   NOTE(review): this listing omits lines (the embedded numbers skip), so
   some conditions, else branches, the tail and head print_field calls,
   the tail of the expression started at 1747 and the final newline are
   not visible here.  */
1735 output_one_dumb_line (void)
1737 if (!right_reference)
1741 /* Output the `reference' field, in such a way that GNU emacs
1742 next-error will handle it. The ending colon is taken from the
1743 gap which follows. */
1745 print_field (reference);
1747 print_spaces (reference_max_width
1749 - (reference.end - reference.start)
1755 /* Output the `reference' field and its following gap. */
1757 print_field (reference);
1758 print_spaces (reference_max_width
1760 - (reference.end - reference.start));
1763 if (tail.start < tail.end)
1765 /* Output the `tail' field. */
1768 if (tail_truncation)
1769 printf ("%s", truncation_string);
1771 print_spaces (half_line_width - gap_size
1772 - (before.end - before.start)
1773 - (before_truncation ? truncation_string_length : 0)
1774 - (tail.end - tail.start)
1775 - (tail_truncation ? truncation_string_length : 0));
1778 print_spaces (half_line_width - gap_size
1779 - (before.end - before.start)
1780 - (before_truncation ? truncation_string_length : 0));
1782 /* Output the `before' field. */
1784 if (before_truncation)
1785 printf ("%s", truncation_string);
1786 print_field (before);
1788 print_spaces (gap_size);
1790 /* Output the `keyafter' field. */
1792 print_field (keyafter);
1793 if (keyafter_truncation)
1794 printf ("%s", truncation_string);
1796 if (head.start < head.end)
1798 /* Output the `head' field. */
1800 print_spaces (half_line_width
1801 - (keyafter.end - keyafter.start)
1802 - (keyafter_truncation ? truncation_string_length : 0)
1803 - (head.end - head.start)
1804 - (head_truncation ? truncation_string_length : 0));
1805 if (head_truncation)
1806 printf ("%s", truncation_string);
1811 if ((auto_reference || input_reference) && right_reference)
1812 print_spaces (half_line_width
1813 - (keyafter.end - keyafter.start)
1814 - (keyafter_truncation ? truncation_string_length : 0));
1816 if ((auto_reference || input_reference) && right_reference)
1818 /* Output the `reference' field. */
1820 print_spaces (gap_size);
1821 print_field (reference);
1827 /*------------------------------------------------------------------------.
1828 | Scan the whole occurs table and, for each entry, output one line in the |
1829 | appropriate format. |
1830 `------------------------------------------------------------------------*/
/* generate_all_output: walk the occurrence table occurs_table[0], which
   holds number_of_occurs[0] entries, and for each entry call
   define_all_fields and then emit one line through the routine matching
   output_format: output_one_dumb_line, output_one_roff_line or
   output_one_tex_line.  Before the loop, truncation flags are cleared so
   that they have a defined value even if no later code computes them.
   NOTE(review): this listing omits lines (the embedded numbers skip), so
   the other default assignments, the case labels and breaks of the
   switch, and the statement advancing occurs_cursor are not visible
   here.  */
1833 generate_all_output (void)
1835 int occurs_index; /* index of keyword entry being processed */
1836 OCCURS *occurs_cursor; /* current keyword entry being processed */
1839 /* The following assignments are useful to provide default values in case
1840 line contexts or references are not used, in which case these variables
1841 would never be computed. */
1845 tail_truncation = 0;
1849 head_truncation = 0;
1852 /* Loop over all keyword occurrences. */
1854 occurs_cursor = occurs_table[0];
1856 for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1858 /* Compute the exact size of every field and whether truncation flags
1859 are present or not. */
1861 define_all_fields (occurs_cursor);
1863 /* Produce one output line according to selected format. */
1865 switch (output_format)
1867 case UNKNOWN_FORMAT:
1868 /* Should never happen. */
1871 output_one_dumb_line ();
1875 output_one_roff_line ();
1879 output_one_tex_line ();
1883 /* Advance the cursor into the occurs table. */
1889 /* Option decoding and main program. */
1891 /*------------------------------------------------------.
1892 | Print program identification and options, then exit. |
1893 `------------------------------------------------------*/
/* Body of the help routine announced by the banner comment preceding it
   (print program identification and options, then exit).  Two outputs
   are visible: a one-line hint on stderr pointing at --help, and the
   full synopsis plus option list, both parameterized by program_name.
   The option text is runtime output and must stay in step with the
   option string and long option table used by main.
   NOTE(review): the function header, the condition choosing between the
   two outputs, the opening lines of the printf calls and the final exit
   are not visible in this listing (the embedded numbers skip) --
   presumably this is the usage routine; confirm against the complete
   source.  */
1899 fprintf (stderr, "Try `%s --help' for more information.\n", program_name);
1903 Usage: %s [OPTION]... [INPUT]... (without -G)\n\
1904 or: %s -G [OPTION]... [INPUT [OUTPUT]]\n", program_name, program_name);
1907 -A, --auto-reference output automatically generated references\n\
1908 -C, --copyright display Copyright and copying conditions\n\
1909 -G, --traditional behave more like System V `ptx'\n\
1910 -F, --flag-truncation=STRING use STRING for flagging line truncations\n\
1911 -M, --macro-name=STRING macro name to use instead of `xx'\n\
1912 -O, --format=roff generate output as roff directives\n\
1913 -R, --right-side-refs put references at right, not counted in -w\n\
1914 -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
1915 -T, --format=tex generate output as TeX directives\n\
1916 -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
1917 -b, --break-file=FILE word break characters in this FILE\n\
1918 -f, --ignore-case fold lower case to upper case for sorting\n\
1919 -g, --gap-size=NUMBER gap size in columns between output fields\n\
1920 -i, --ignore-file=FILE read ignore word list from FILE\n\
1921 -o, --only-file=FILE read only word list from this FILE\n\
1922 -r, --references first field of each line is a reference\n\
1923 -t, --typeset-mode - not implemented -\n\
1924 -w, --width=NUMBER output width in columns, reference excluded\n\
1925 --help display this help and exit\n\
1926 --version output version information and exit\n\
1928 With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n");
1933 /*----------------------------------------------------------------------.
1934 | Main program. Decode ARGC arguments passed through the ARGV array of |
1935 | strings, then launch execution. |
1936 `----------------------------------------------------------------------*/
1938 /* Long option equivalents, passed to getopt_long by main.  Each entry
   is { name, argument requirement, flag pointer, value }.  With a NULL
   flag pointer getopt_long returns the value, which here is the letter
   of the matching short option, or 10 for --format, which has no short
   letter of its own.  The --help and --version entries instead store 1
   into show_help and show_version.
   NOTE(review): the opening brace and the terminating all-zero entry
   are not visible in this listing (the embedded numbers skip).  */
1939 const struct option long_options[] =
1941 {"auto-reference", no_argument, NULL, 'A'},
1942 {"break-file", required_argument, NULL, 'b'},
1943 {"copyright", no_argument, NULL, 'C'},
1944 {"flag-truncation", required_argument, NULL, 'F'},
1945 {"ignore-case", no_argument, NULL, 'f'},
1946 {"gap-size", required_argument, NULL, 'g'},
1947 {"help", no_argument, &show_help, 1},
1948 {"ignore-file", required_argument, NULL, 'i'},
1949 {"macro-name", required_argument, NULL, 'M'},
1950 {"only-file", required_argument, NULL, 'o'},
1951 {"references", no_argument, NULL, 'r'},
1952 {"right-side-refs", no_argument, NULL, 'R'},
1953 {"format", required_argument, NULL, 10},
1954 {"sentence-regexp", required_argument, NULL, 'S'},
1955 {"traditional", no_argument, NULL, 'G'},
1956 {"typeset-mode", no_argument, NULL, 't'},
1957 {"version", no_argument, &show_version, 1},
1958 {"width", required_argument, NULL, 'w'},
1959 {"word-regexp", required_argument, NULL, 'W'},
/* Table of format names handed to argmatch by main when decoding the
   argument of --format.
   NOTE(review): the initializer is not visible in this listing (the
   embedded numbers skip from 1963 to 1969); judging from the help text
   the names are presumably roff and tex -- confirm.  */
1963 static char const* const format_args[] =
/* main (ARGC, ARGV): program entry point.  As visible here, it:
     - decodes options with getopt_long, using long_options and the
       short option string, storing the results in file-level settings;
     - handles the trivial options (copyright and version output);
     - builds input_file_name[] and file_line_count[] from the remaining
       arguments: a single NULL entry meaning standard input when there
       are none, every argument when gnu_extensions is set, otherwise one
       input file and an optional output file;
     - defaults output_format to DUMB_FORMAT or ROFF_FORMAT depending on
       gnu_extensions;
     - digests the break, ignore and only files, then for each input
       calls swallow_file_in_memory and find_occurs_in_text, recording
       the running total_line_count in file_line_count[];
     - finally runs sort_found_occurs, fix_output_parameters and
       generate_all_output.
   NOTE(review): at listing lines 2136 and 2138, inside the loop over
   file_index, slot 0 of input_file_name is written rather than slot
   file_index, so a lone dash or empty argument after the first is not
   mapped to NULL and slot 0 ends up tracking the last argument -- this
   looks like a slip; confirm and index by file_index.
   NOTE(review): at 2161 the stream returned by fopen is only compared
   with NULL and not kept; presumably a preceding line closes stdout so
   that the new stream takes its place -- confirm; freopen on stdout
   would make the redirection explicit.
   NOTE(review): at 2162 the file name is passed where error presumably
   expects a format string (assuming the GNU error signature), so a
   percent sign in the name would be interpreted; passing it through a
   %s format would be safer.
   NOTE(review): gap_size and line_width come from atoi, which reports
   no conversion errors, so malformed numbers are not diagnosed.
   NOTE(review): this listing omits lines (the embedded numbers skip), so
   the switch over optchar with its case labels, the loop termination
   test, several conditions and the final exit are not visible here.  */
1969 main (int argc, char *const argv[])
1971 int optchar; /* argument character */
1972 extern int optind; /* index of argument */
1973 extern char *optarg; /* value or argument */
1974 int file_index; /* index in text input file arrays */
1977 /* Use GNU malloc checking. It has proven to be useful! */
1979 #endif /* HAVE_MCHECK */
1982 #ifdef HAVE_SETCHRCLASS
1987 /* Decode program options. */
1989 program_name = argv[0];
1991 while ((optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:",
1992 long_options, NULL)),
2004 printf ("%s", copyright);
2012 break_file = optarg;
2020 gap_size = atoi (optarg);
2024 ignore_file = optarg;
2032 input_reference = 1;
2036 /* To be worked out... */
2040 line_width = atoi (optarg);
2048 truncation_string = copy_unescaped_string (optarg);
2052 macro_name = optarg;
2056 output_format = ROFF_FORMAT;
2060 right_reference = 1;
2064 context_regex_string = copy_unescaped_string (optarg);
2068 output_format = TEX_FORMAT;
2072 word_regex_string = copy_unescaped_string (optarg);
2076 switch (argmatch (optarg, format_args))
2082 output_format = ROFF_FORMAT;
2086 output_format = TEX_FORMAT;
2092 /* Process trivial options. */
2099 printf ("%s\n", version_string);
2103 /* Change the default Ignore file if one is defined. */
2105 #ifdef DEFAULT_IGNORE_FILE
2107 ignore_file = DEFAULT_IGNORE_FILE;
2110 /* Process remaining arguments. If GNU extensions are enabled, process
2111 all arguments as input parameters. If disabled, accept at most two
2112 arguments, the second of which is an output parameter. */
2117 /* No more argument simply means: read standard input. */
2119 input_file_name = (const char **) xmalloc (sizeof (const char *));
2120 file_line_count = (int *) xmalloc (sizeof (int));
2121 number_input_files = 1;
2122 input_file_name[0] = NULL;
2124 else if (gnu_extensions)
2126 number_input_files = argc - optind;
2128 = (const char **) xmalloc (number_input_files * sizeof (const char *));
2130 = (int *) xmalloc (number_input_files * sizeof (int));
2132 for (file_index = 0; file_index < number_input_files; file_index++)
2134 input_file_name[file_index] = argv[optind];
2135 if (!*argv[optind] || strcmp (argv[optind], "-") == 0)
2136 input_file_name[0] = NULL;
2138 input_file_name[0] = argv[optind];
2145 /* There is one necessary input file. */
2147 number_input_files = 1;
2148 input_file_name = (const char **) xmalloc (sizeof (const char *));
2149 file_line_count = (int *) xmalloc (sizeof (int));
2150 if (!*argv[optind] || strcmp (argv[optind], "-") == 0)
2151 input_file_name[0] = NULL;
2153 input_file_name[0] = argv[optind];
2156 /* Redirect standard output, only if requested. */
2161 if (fopen (argv[optind], "w") == NULL)
2162 error (1, errno, argv[optind]);
2166 /* Diagnose any other argument as an error. */
2172 /* If the output format has not been explicitly selected, choose dumb
2173 terminal format if GNU extensions are enabled, else `roff' format. */
2175 if (output_format == UNKNOWN_FORMAT)
2176 output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2178 /* Initialize the main tables. */
2180 initialize_regex ();
2182 /* Read `Break character' file, if any. */
2185 digest_break_file (break_file);
2187 /* Read `Ignore words' file and `Only words' files, if any. If any of
2188 these files is empty, reset the name of the file to NULL, to avoid
2189 unnecessary calls to search_table. */
2193 digest_word_file (ignore_file, &ignore_table);
2194 if (ignore_table.length == 0)
2200 digest_word_file (only_file, &only_table);
2201 if (only_table.length == 0)
2205 /* Prepare to study all the input files. */
2207 number_of_occurs[0] = 0;
2208 total_line_count = 0;
2209 maximum_word_length = 0;
2210 reference_max_width = 0;
2212 for (file_index = 0; file_index < number_input_files; file_index++)
2215 /* Read the file in core, then study it. */
2217 swallow_file_in_memory (input_file_name[file_index], &text_buffer);
2218 find_occurs_in_text ();
2220 /* Maintain for each file how many lines have been read so far when its
2221 end is reached. Incrementing the count first is a simple kludge to
2222 handle a possible incomplete line at end of file. */
2225 file_line_count[file_index] = total_line_count;
2228 /* Do the output process phase. */
2230 sort_found_occurs ();
2231 fix_output_parameters ();
2232 generate_all_output ();