1 /* texindex -- sort TeX index dribble output into an actual index.
2 $Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp $
4 Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
5 2002, 2003, 2004 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
24 static char *program_name = "texindex";
27 # include "../src/config.h"
28 /* Some s/os.h files redefine these. */
35 #if !defined (HAVE_MEMSET)
37 #define memset(ptr, ignore, count) bzero (ptr, count)
40 char *mktemp (char *);
42 #if !defined (SEEK_SET)
46 #endif /* !SEEK_SET */
50 /* When sorting in core, this structure describes one line
51 and the position and length of its first keyfield. */
54 char *text; /* The actual text of the line. */
56 char *text; /* The start of the key (for textual comparison). */
57 long number; /* The numeric value (for numeric comparison). */
59 long keylen; /* Length of KEY field. */
62 /* This structure describes a field to use as a sort key. */
65 int startwords; /* Number of words to skip. */
66 int startchars; /* Number of additional chars to skip. */
67 int endwords; /* Number of words to ignore at end. */
68 int endchars; /* Ditto for characters of last word. */
69 char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
70 char fold_case; /* Non-zero means case doesn't matter. */
71 char reverse; /* Non-zero means compare in reverse order. */
72 char numeric; /* Non-zeros means field is ASCII numeric. */
73 char positional; /* Sort according to file position. */
74 char braced; /* Count balanced-braced groupings as fields. */
77 /* Vector of keyfields to use. */
78 struct keyfield keyfields[3];
80 /* Number of keyfields stored in that vector. */
81 int num_keyfields = 3;
83 /* Vector of input file names, terminated with a null pointer. */
86 /* Vector of corresponding output file names, or NULL, meaning default it
87 (add an `s' to the end). */
90 /* Length of `infiles'. */
93 /* Pointer to the array of pointers to lines being sorted. */
96 /* The allocated length of `linearray'. */
99 /* Directory to use for temporary files. On Unix, it ends with a slash. */
102 /* Number of last temporary file. */
105 /* Number of last temporary file already deleted.
106 Temporary files are deleted by `flush_tempfiles' in order of creation. */
107 int last_deleted_tempcount;
109 /* During in-core sort, this points to the base of the data block
110 which contains all the lines of data. */
113 /* Initially 0; changed to 1 if we want initials in this index. */
116 /* Remembers the first initial letter seen in this index, so we can
117 determine whether we need initials in the sorted form. */
120 /* Additional command switches .*/
122 /* Nonzero means do not delete tempfiles -- for debugging. */
125 /* Forward declarations of functions in this file. */
126 void decode_command (int argc, char **argv);
127 void sort_in_core (char *infile, int total, char *outfile);
128 void sort_offline (char *infile, off_t total, char *outfile);
129 char **parsefile (char *filename, char **nextline, char *data, long int size);
130 char *find_field (struct keyfield *keyfield, char *str, long int *lengthptr);
131 char *find_pos (char *str, int words, int chars, int ignore_blanks);
132 long find_value (char *start, long int length);
133 char *find_braced_pos (char *str, int words, int chars, int ignore_blanks);
134 char *find_braced_end (char *str);
135 void writelines (char **linearray, int nlines, FILE *ostream);
136 int compare_field (struct keyfield *keyfield, char *start1,
137 long int length1, long int pos1, char *start2,
138 long int length2, long int pos2);
139 int compare_full (const void *, const void *);
140 long readline (struct linebuffer *linebuffer, FILE *stream);
141 int merge_files (char **infiles, int nfiles, char *outfile);
142 int merge_direct (char **infiles, int nfiles, char *outfile);
143 void pfatal_with_name (const char *name);
144 void fatal (const char *format, const char *arg);
145 void error (const char *format, const char *arg);
146 void *xmalloc (), *xrealloc ();
147 char *concat (char *s1, char *s2);
148 void flush_tempfiles (int to_count);
150 #define MAX_IN_CORE_SORT 500000
153 main (int argc, char **argv)
158 last_deleted_tempcount = 0;
160 #ifdef HAVE_SETLOCALE
161 /* Set locale via LC_ALL. */
162 setlocale (LC_ALL, "");
165 /* Set the text message domain. */
166 bindtextdomain (PACKAGE, LOCALEDIR);
167 textdomain (PACKAGE);
169 /* In case we write to a redirected stdout that fails. */
170 /* not ready atexit (close_stdout); */
172 /* Describe the kind of sorting to do. */
173 /* The first keyfield uses the first braced field and folds case. */
174 keyfields[0].braced = 1;
175 keyfields[0].fold_case = 1;
176 keyfields[0].endwords = -1;
177 keyfields[0].endchars = -1;
179 /* The second keyfield uses the second braced field, numerically. */
180 keyfields[1].braced = 1;
181 keyfields[1].numeric = 1;
182 keyfields[1].startwords = 1;
183 keyfields[1].endwords = -1;
184 keyfields[1].endchars = -1;
186 /* The third keyfield (which is ignored while discarding duplicates)
187 compares the whole line. */
188 keyfields[2].endwords = -1;
189 keyfields[2].endchars = -1;
191 decode_command (argc, argv);
193 /* Process input files completely, one by one. */
195 for (i = 0; i < num_infiles; i++)
202 desc = open (infiles[i], O_RDONLY, 0);
204 pfatal_with_name (infiles[i]);
206 if (stat (infiles[i], &instat))
207 pfatal_with_name (infiles[i]);
208 if (S_ISDIR (instat.st_mode))
213 pfatal_with_name (infiles[i]);
216 lseek (desc, (off_t) 0, SEEK_END);
217 ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
221 outfile = outfiles[i];
223 outfile = concat (infiles[i], "s");
226 first_initial = '\0';
228 if (ptr < MAX_IN_CORE_SORT)
229 /* Sort a small amount of data. */
230 sort_in_core (infiles[i], (int)ptr, outfile);
232 sort_offline (infiles[i], ptr, outfile);
235 flush_tempfiles (tempcount);
237 return 0; /* Avoid bogus warnings. */
250 TEXINDEX_OPTION texindex_options[] = {
251 { "--help", "-h", (int *)NULL, 0, (char *)NULL,
252 N_("display this help and exit") },
253 { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
254 N_("keep temporary files around after processing") },
255 { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
256 N_("do not keep temporary files around after processing (default)") },
257 { "--output", "-o", (int *)NULL, 0, "FILE",
258 N_("send output to FILE") },
259 { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
260 N_("display version information and exit") },
261 { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
265 usage (int result_value)
268 FILE *f = result_value ? stderr : stdout;
270 fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
271 fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
272 /* Avoid trigraph nonsense. */
274 _("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
275 '?', '?'); /* avoid trigraph in cat-id-tbl.c */
276 fprintf (f, _("\nOptions:\n"));
278 for (i = 0; texindex_options[i].long_name; i++)
282 if (texindex_options[i].short_name)
283 fprintf (f, "%s, ", texindex_options[i].short_name);
286 texindex_options[i].long_name,
287 texindex_options[i].arg_name
288 ? texindex_options[i].arg_name : "");
290 fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
293 Email bug reports to bug-texinfo@gnu.org,\n\
294 general questions and discussion to help-texinfo@gnu.org.\n\
295 Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
298 xexit (result_value);
301 /* Decode the command line arguments to set the parameter variables
302 and set up the vector of keyfields and the vector of input files. */
305 decode_command (int argc, char **argv)
311 /* Store default values into parameter variables. */
313 tempdir = getenv ("TMPDIR");
315 tempdir = getenv ("TEMP");
317 tempdir = getenv ("TMP");
319 tempdir = DEFAULT_TMPDIR;
321 tempdir = concat (tempdir, "/");
325 /* Allocate ARGC input files, which must be enough. */
327 infiles = (char **) xmalloc (argc * sizeof (char *));
328 outfiles = (char **) xmalloc (argc * sizeof (char *));
332 while (arg_index < argc)
334 char *arg = argv[arg_index++];
338 if (strcmp (arg, "--version") == 0)
340 printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
342 puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
343 printf (_("There is NO warranty. You may redistribute this software\n\
344 under the terms of the GNU General Public License.\n\
345 For more information about these matters, see the files named COPYING.\n"));
348 else if ((strcmp (arg, "--keep") == 0) ||
349 (strcmp (arg, "-k") == 0))
353 else if ((strcmp (arg, "--help") == 0) ||
354 (strcmp (arg, "-h") == 0))
358 else if ((strcmp (arg, "--output") == 0) ||
359 (strcmp (arg, "-o") == 0))
361 if (argv[arg_index] != (char *)NULL)
365 *(op - 1) = argv[arg_index];
376 *op++ = (char *)NULL;
380 /* Record number of keyfields and terminate list of filenames. */
381 num_infiles = ip - infiles;
383 if (num_infiles == 0)
392 findtempname (char *tempname)
396 for (i = 0; i < tv_used; i++)
397 if (strcmp (tv[i], tempname) == 0)
402 /* Return a name for temporary file COUNT. */
405 maketempname (int count)
407 static char *tempbase = NULL;
414 tempbase = concat (tempdir, "txidxXXXXXX");
416 fd = mkstemp (tempbase);
418 pfatal_with_name (tempbase);
421 sprintf (tempsuffix, ".%d", count);
422 tempname = concat (tempbase, tempsuffix);
424 * The open logic becomes a bit convoluted. If open(2) fails due to EEXIST,
425 * it's likely because somebody attempted to race us, or because we have
426 * already created this file.
428 fd = open (tempname, O_CREAT|O_EXCL|O_WRONLY, 0600);
432 * If errno is not EEXIST, then open failed for some other reason, so
433 * we should terminate. If errno == EEXIST AND we didn't create this
434 * file, terminate. Otherwise, it's safe to say that errno == EEXIST
435 * because we already created it, in this event, we can just return.
437 if (errno != EEXIST ||
438 (errno == EEXIST && findtempname (tempname) == 0))
439 pfatal_with_name (tempname);
449 tv = calloc (tv_alloc, sizeof (char *));
452 fprintf (stderr, "calloc failed\n");
456 else if (tv_used == tv_alloc)
459 tv = realloc (tv, tv_alloc * sizeof (char *));
462 fprintf (stderr, "realloc failed");
466 tv[tv_used++] = strdup (tempname);
471 /* Delete all temporary files up to TO_COUNT. */
474 flush_tempfiles (int to_count)
478 while (last_deleted_tempcount < to_count)
479 unlink (maketempname (++last_deleted_tempcount));
483 /* Compare LINE1 and LINE2 according to the specified set of keyfields. */
486 compare_full (const void *p1, const void *p2)
488 char **line1 = (char **) p1;
489 char **line2 = (char **) p2;
492 /* Compare using the first keyfield;
493 if that does not distinguish the lines, try the second keyfield;
496 for (i = 0; i < num_keyfields; i++)
498 long length1, length2;
499 char *start1 = find_field (&keyfields[i], *line1, &length1);
500 char *start2 = find_field (&keyfields[i], *line2, &length2);
501 int tem = compare_field (&keyfields[i], start1, length1,
503 start2, length2, *line2 - text_base);
506 if (keyfields[i].reverse)
512 return 0; /* Lines match exactly. */
515 /* Compare LINE1 and LINE2, described by structures
516 in which the first keyfield is identified in advance.
517 For positional sorting, assumes that the order of the lines in core
518 reflects their nominal order. */
520 compare_prepared (const void *p1, const void *p2)
522 struct lineinfo *line1 = (struct lineinfo *) p1;
523 struct lineinfo *line2 = (struct lineinfo *) p2;
528 /* Compare using the first keyfield, which has been found for us already. */
529 if (keyfields->positional)
531 if (line1->text - text_base > line2->text - text_base)
536 else if (keyfields->numeric)
537 tem = line1->key.number - line2->key.number;
539 tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
540 line2->key.text, line2->keylen, 0);
543 if (keyfields->reverse)
551 /* Compare using the second keyfield;
552 if that does not distinguish the lines, try the third keyfield;
555 for (i = 1; i < num_keyfields; i++)
557 long length1, length2;
558 char *start1 = find_field (&keyfields[i], text1, &length1);
559 char *start2 = find_field (&keyfields[i], text2, &length2);
560 int tem = compare_field (&keyfields[i], start1, length1,
562 start2, length2, text2 - text_base);
565 if (keyfields[i].reverse)
571 return 0; /* Lines match exactly. */
574 /* Like compare_full but more general.
575 You can pass any strings, and you can say how many keyfields to use.
576 POS1 and POS2 should indicate the nominal positional ordering of
577 the two lines in the input. */
580 compare_general (char *str1, char *str2, long int pos1, long int pos2, int use_keyfields)
584 /* Compare using the first keyfield;
585 if that does not distinguish the lines, try the second keyfield;
588 for (i = 0; i < use_keyfields; i++)
590 long length1, length2;
591 char *start1 = find_field (&keyfields[i], str1, &length1);
592 char *start2 = find_field (&keyfields[i], str2, &length2);
593 int tem = compare_field (&keyfields[i], start1, length1, pos1,
594 start2, length2, pos2);
597 if (keyfields[i].reverse)
603 return 0; /* Lines match exactly. */
606 /* Find the start and length of a field in STR according to KEYFIELD.
607 A pointer to the starting character is returned, and the length
608 is stored into the int that LENGTHPTR points to. */
611 find_field (struct keyfield *keyfield, char *str, long int *lengthptr)
617 if (keyfield->braced)
618 fun = find_braced_pos;
622 start = (*fun) (str, keyfield->startwords, keyfield->startchars,
623 keyfield->ignore_blanks);
624 if (keyfield->endwords < 0)
626 if (keyfield->braced)
627 end = find_braced_end (start);
631 while (*end && *end != '\n')
637 end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
638 if (end - str < start - str)
641 *lengthptr = end - start;
645 /* Return a pointer to a specified place within STR,
646 skipping (from the beginning) WORDS words and then CHARS chars.
647 If IGNORE_BLANKS is nonzero, we skip all blanks
648 after finding the specified word. */
651 find_pos (char *str, int words, int chars, int ignore_blanks)
656 for (i = 0; i < words; i++)
659 /* Find next bunch of nonblanks and skip them. */
660 while ((c = *p) == ' ' || c == '\t')
662 while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
664 if (!*p || *p == '\n')
668 while (*p == ' ' || *p == '\t')
671 for (i = 0; i < chars; i++)
673 if (!*p || *p == '\n')
680 /* Like find_pos but assumes that each field is surrounded by braces
681 and that braces within fields are balanced. */
684 find_braced_pos (char *str, int words, int chars, int ignore_blanks)
691 for (i = 0; i < words; i++)
694 while ((c = *p++) != '{' && c != '\n' && c)
705 if (c == 0 || c == '\n')
710 while ((c = *p++) != '{' && c != '\n' && c)
717 while ((c = *p) == ' ' || c == '\t')
720 for (i = 0; i < chars; i++)
722 if (!*p || *p == '\n')
729 /* Find the end of the balanced-brace field which starts at STR.
730 The position returned is just before the closing brace. */
733 find_braced_end (char *str)
747 if (c == 0 || c == '\n')
754 find_value (char *start, long int length)
758 if (isdigit (*start))
766 /* Vector used to translate characters for comparison.
767 This is how we make all alphanumerics follow all else,
768 and ignore case in the first sorting. */
772 init_char_order (void)
775 for (i = 1; i < 256; i++)
778 for (i = '0'; i <= '9'; i++)
779 char_order[i] += 512;
781 for (i = 'a'; i <= 'z'; i++)
783 char_order[i] = 512 + i;
784 char_order[i + 'A' - 'a'] = 512 + i;
788 /* Compare two fields (each specified as a start pointer and a character count)
789 according to KEYFIELD.
790 The sign of the value reports the relation between the fields. */
793 compare_field (struct keyfield *keyfield, char *start1, long int length1,
794 long int pos1, char *start2, long int length2, long int pos2)
796 if (keyfields->positional)
803 if (keyfield->numeric)
805 long value = find_value (start1, length1) - find_value (start2, length2);
816 char *e1 = start1 + length1;
817 char *e2 = start2 + length2;
832 if (char_order[c1] != char_order[c2])
833 return char_order[c1] - char_order[c2];
838 /* Strings are equal except possibly for case. */
855 /* Reverse sign here so upper case comes out last. */
865 /* A `struct linebuffer' is a structure which holds a line of text.
866 `readline' reads a line from a stream into a linebuffer
867 and works regardless of the length of the line. */
875 /* Initialize LINEBUFFER for use. */
878 initbuffer (struct linebuffer *linebuffer)
880 linebuffer->size = 200;
881 linebuffer->buffer = (char *) xmalloc (200);
884 /* Read a line of text from STREAM into LINEBUFFER.
885 Return the length of the line. */
888 readline (struct linebuffer *linebuffer, FILE *stream)
890 char *buffer = linebuffer->buffer;
891 char *p = linebuffer->buffer;
892 char *end = p + linebuffer->size;
896 int c = getc (stream);
899 buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
900 p += buffer - linebuffer->buffer;
901 end += buffer - linebuffer->buffer;
902 linebuffer->buffer = buffer;
904 if (c < 0 || c == '\n')
915 /* Sort an input file too big to sort in core. */
918 sort_offline (char *infile, off_t total, char *outfile)
920 /* More than enough. */
921 int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
922 char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
923 FILE *istream = fopen (infile, "r");
925 struct linebuffer lb;
931 /* Read in one line of input data. */
933 linelength = readline (&lb, istream);
935 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
937 error (_("%s: not a texinfo index file"), infile);
941 /* Split up the input into `ntemps' temporary files, or maybe fewer,
942 and put the new files' names into `tempfiles' */
944 for (i = 0; i < ntemps; i++)
946 char *outname = maketempname (++tempcount);
947 FILE *ostream = fopen (outname, "w");
951 pfatal_with_name (outname);
952 tempfiles[i] = outname;
954 /* Copy lines into this temp file as long as it does not make file
955 "too big" or until there are no more lines. */
957 while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
959 tempsize += linelength + 1;
960 fputs (lb.buffer, ostream);
961 putc ('\n', ostream);
963 /* Read another line of input data. */
965 linelength = readline (&lb, istream);
966 if (!linelength && feof (istream))
969 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
971 error (_("%s: not a texinfo index file"), infile);
984 /* Record number of temp files we actually needed. */
988 /* Sort each tempfile into another tempfile.
989 Delete the first set of tempfiles and put the names of the second
992 for (i = 0; i < ntemps; i++)
994 char *newtemp = maketempname (++tempcount);
995 sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
997 unlink (tempfiles[i]);
998 tempfiles[i] = newtemp;
1004 /* Merge the tempfiles together and indexify. */
1006 merge_files (tempfiles, ntemps, outfile);
1009 /* Sort INFILE, whose size is TOTAL,
1010 assuming that is small enough to be done in-core,
1011 then indexify it and send the output to OUTFILE (or to stdout). */
1014 sort_in_core (char *infile, int total, char *outfile)
1017 char *data = (char *) xmalloc (total + 1);
1021 FILE *ostream = stdout;
1022 struct lineinfo *lineinfo;
1024 /* Read the contents of the file into the moby array `data'. */
1026 int desc = open (infile, O_RDONLY, 0);
1029 fatal (_("failure reopening %s"), infile);
1030 for (file_size = 0;;)
1032 i = read (desc, data + file_size, total - file_size);
1038 data[file_size] = 0;
1042 if (file_size > 0 && data[0] != '\\' && data[0] != '@')
1044 error (_("%s: not a texinfo index file"), infile);
1050 /* Sort routines want to know this address. */
1054 /* Create the array of pointers to lines, with a default size
1055 frequently enough. */
1057 nlines = total / 50;
1060 linearray = (char **) xmalloc (nlines * sizeof (char *));
1062 /* `nextline' points to the next free slot in this array.
1063 `nlines' is the allocated size. */
1065 nextline = linearray;
1067 /* Parse the input file's data, and make entries for the lines. */
1069 nextline = parsefile (infile, nextline, file_data, file_size);
1072 error (_("%s: not a texinfo index file"), infile);
1076 /* Sort the lines. */
1078 /* If we have enough space, find the first keyfield of each line in advance.
1079 Make a `struct lineinfo' for each line, which records the keyfield
1080 as well as the line, and sort them. */
1082 lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
1086 struct lineinfo *lp;
1089 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1092 lp->key.text = find_field (keyfields, *p, &lp->keylen);
1093 if (keyfields->numeric)
1094 lp->key.number = find_value (lp->key.text, lp->keylen);
1097 qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
1100 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1106 qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
1108 /* Open the output file. */
1112 ostream = fopen (outfile, "w");
1114 pfatal_with_name (outfile);
1117 writelines (linearray, nextline - linearray, ostream);
1125 /* Parse an input string in core into lines.
1126 DATA is the input string, and SIZE is its length.
1127 Data goes in LINEARRAY starting at NEXTLINE.
1128 The value returned is the first entry in LINEARRAY still unused.
1129 Value 0 means input file contents are invalid. */
1132 parsefile (char *filename, char **nextline, char *data, long int size)
1135 char **line = nextline;
1143 if (p[0] != '\\' && p[0] != '@')
1148 /* Find the first letter of the first field of this line. If it
1149 is different from the first letter of the first field of the
1150 first line, we need initial headers in the output index. */
1151 while (*p && *p != '{')
1158 if (first_initial != toupper (*p))
1162 first_initial = toupper (*p);
1164 while (*p && *p != '\n')
1170 if (line == linearray + nlines)
1172 char **old = linearray;
1173 linearray = xrealloc (linearray, sizeof (char *) * (nlines *= 4));
1174 line += linearray - old;
1181 /* Indexification is a filter applied to the sorted lines
1182 as they are being written to the output file.
1183 Multiple entries for the same name, with different page numbers,
1184 get combined into a single entry with multiple page numbers.
1185 The first braced field, which is used for sorting, is discarded.
1186 However, its first character is examined, folded to lower case,
1187 and if it is different from that in the previous line fed to us
1188 a \initial line is written with one argument, the new initial.
1190 If an entry has four braced fields, then the second and third
1191 constitute primary and secondary names.
1192 In this case, each change of primary name
1193 generates a \primary line which contains only the primary name,
1194 and in between these are \secondary lines which contain
1195 just a secondary name and page numbers. */
1197 /* The last primary name we wrote a \primary entry for.
1198 If only one level of indexing is being done, this is the last name seen. */
1200 /* Length of storage allocated for lastprimary. */
1201 int lastprimarylength;
1203 /* Similar, for the secondary name. */
1204 char *lastsecondary;
1205 int lastsecondarylength;
1207 /* Zero if we are not in the middle of writing an entry.
1208 One if we have written the beginning of an entry but have not
1209 yet written any page numbers into it.
1210 Greater than one if we have written the beginning of an entry
1211 plus at least one page number. */
1214 /* The initial (for sorting purposes) of the last primary entry written.
1215 When this changes, a \initial {c} line is written */
1219 int lastinitiallength;
1221 /* When we need a string of length 1 for the value of lastinitial,
1224 char lastinitial1[2];
1226 /* Initialize static storage for writing an index. */
1232 lastinitial = lastinitial1;
1233 lastinitial1[0] = 0;
1234 lastinitial1[1] = 0;
1235 lastinitiallength = 0;
1236 lastprimarylength = 100;
1237 lastprimary = (char *) xmalloc (lastprimarylength + 1);
1238 memset (lastprimary, '\0', lastprimarylength + 1);
1239 lastsecondarylength = 100;
1240 lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1241 memset (lastsecondary, '\0', lastsecondarylength + 1);
1244 /* Indexify. Merge entries for the same name,
1245 insert headers for each initial character, etc. */
1248 indexify (char *line, FILE *ostream)
1250 char *primary, *secondary, *pagenumber;
1251 int primarylength, secondarylength = 0, pagelength;
1258 /* First, analyze the parts of the entry fed to us this time. */
1260 p = find_braced_pos (line, 0, 0, 0);
1264 /* Get length of inner pair of braces starting at `p',
1265 including that inner pair of braces. */
1266 initiallength = find_braced_end (p + 1) + 1 - p;
1271 initial1[0] = toupper (*p);
1276 pagenumber = find_braced_pos (line, 1, 0, 0);
1277 pagelength = find_braced_end (pagenumber) - pagenumber;
1278 if (pagelength == 0)
1279 fatal (_("No page number in %s"), line);
1281 primary = find_braced_pos (line, 2, 0, 0);
1282 primarylength = find_braced_end (primary) - primary;
1284 secondary = find_braced_pos (line, 3, 0, 0);
1285 nosecondary = !*secondary;
1287 secondarylength = find_braced_end (secondary) - secondary;
1289 /* If the primary is different from before, make a new primary entry. */
1290 if (strncmp (primary, lastprimary, primarylength))
1292 /* Close off current secondary entry first, if one is open. */
1295 fputs ("}\n", ostream);
1299 /* If this primary has a different initial, include an entry for
1301 if (need_initials &&
1302 (initiallength != lastinitiallength ||
1303 strncmp (initial, lastinitial, initiallength)))
1305 fprintf (ostream, "\\initial {");
1306 fwrite (initial, 1, initiallength, ostream);
1307 fputs ("}\n", ostream);
1308 if (initial == initial1)
1310 lastinitial = lastinitial1;
1311 *lastinitial1 = *initial1;
1315 lastinitial = initial;
1317 lastinitiallength = initiallength;
1320 /* Make the entry for the primary. */
1322 fputs ("\\entry {", ostream);
1324 fputs ("\\primary {", ostream);
1325 fwrite (primary, primarylength, 1, ostream);
1328 fputs ("}{", ostream);
1332 fputs ("}\n", ostream);
1334 /* Record name of most recent primary. */
1335 if (lastprimarylength < primarylength)
1337 lastprimarylength = primarylength + 100;
1338 lastprimary = (char *) xrealloc (lastprimary,
1339 1 + lastprimarylength);
1341 strncpy (lastprimary, primary, primarylength);
1342 lastprimary[primarylength] = 0;
1344 /* There is no current secondary within this primary, now. */
1345 lastsecondary[0] = 0;
1348 /* Should not have an entry with no subtopic following one with a
1351 if (nosecondary && *lastsecondary)
1352 error (_("entry %s follows an entry with a secondary name"), line);
1354 /* Start a new secondary entry if necessary. */
1355 if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1359 fputs ("}\n", ostream);
1363 /* Write the entry for the secondary. */
1364 fputs ("\\secondary {", ostream);
1365 fwrite (secondary, secondarylength, 1, ostream);
1366 fputs ("}{", ostream);
1369 /* Record name of most recent secondary. */
1370 if (lastsecondarylength < secondarylength)
1372 lastsecondarylength = secondarylength + 100;
1373 lastsecondary = (char *) xrealloc (lastsecondary,
1374 1 + lastsecondarylength);
1376 strncpy (lastsecondary, secondary, secondarylength);
1377 lastsecondary[secondarylength] = 0;
1380 /* Here to add one more page number to the current entry. */
1382 fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1383 fwrite (pagenumber, pagelength, 1, ostream);
1386 /* Close out any unfinished output entry. */
1389 finish_index (FILE *ostream)
1392 fputs ("}\n", ostream);
1394 free (lastsecondary);
1397 /* Copy the lines in the sorted order.
1398 Each line is copied out of the input file it was found in. */
1401 writelines (char **linearray, int nlines, FILE *ostream)
1403 char **stop_line = linearray + nlines;
1408 /* Output the text of the lines, and free the buffer space. */
1410 for (next_line = linearray; next_line != stop_line; next_line++)
1412 /* If -u was specified, output the line only if distinct from
1414 if (next_line == linearray
1415 /* Compare previous line with this one, using only the
1416 explicitly specd keyfields. */
1417 || compare_general (*(next_line - 1), *next_line, 0L, 0L,
1420 char *p = *next_line;
1423 while ((c = *p++) && c != '\n')
1426 indexify (*next_line, ostream);
1430 finish_index (ostream);
1433 /* Assume (and optionally verify) that each input file is sorted;
1434 merge them and output the result.
1435 Returns nonzero if any input file fails to be sorted.
1437 This is the high-level interface that can handle an unlimited
1440 #define MAX_DIRECT_MERGE 10
1443 merge_files (char **infiles, int nfiles, char *outfile)
1449 int start_tempcount = tempcount;
1451 if (nfiles <= MAX_DIRECT_MERGE)
1452 return merge_direct (infiles, nfiles, outfile);
1454 /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1455 making a temporary file to hold each group's result. */
1457 ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1458 tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1459 for (i = 0; i < ntemps; i++)
1461 int nf = MAX_DIRECT_MERGE;
1462 if (i + 1 == ntemps)
1463 nf = nfiles - i * MAX_DIRECT_MERGE;
1464 tempfiles[i] = maketempname (++tempcount);
1465 value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1468 /* All temporary files that existed before are no longer needed
1469 since their contents have been merged into our new tempfiles.
1471 flush_tempfiles (start_tempcount);
1473 /* Now merge the temporary files we created. */
1475 merge_files (tempfiles, ntemps, outfile);
1482 /* Assume (and optionally verify) that each input file is sorted;
1483 merge them and output the result.
1484 Returns nonzero if any input file fails to be sorted.
1486 This version of merging will not work if the number of
1487 input files gets too high. Higher level functions
1488 use it only with a bounded number of input files. */
1491 merge_direct (char **infiles, int nfiles, char *outfile)
1493 struct linebuffer *lb1, *lb2;
1494 struct linebuffer **thisline, **prevline;
1500 struct linebuffer *prev_out = 0;
1501 FILE *ostream = stdout;
1505 ostream = fopen (outfile, "w");
1508 pfatal_with_name (outfile);
1519 /* For each file, make two line buffers. Also, for each file, there
1520 is an element of `thisline' which points at any time to one of the
1521 file's two buffers, and an element of `prevline' which points to
1522 the other buffer. `thisline' is supposed to point to the next
1523 available line from the file, while `prevline' holds the last file
1524 line used, which is remembered so that we can verify that the file
1525 is properly sorted. */
1527 /* lb1 and lb2 contain one buffer each per file. */
1528 lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1529 lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1531 /* thisline[i] points to the linebuffer holding the next available
1532 line in file i, or is zero if there are no lines left in that file. */
1533 thisline = (struct linebuffer **)
1534 xmalloc (nfiles * sizeof (struct linebuffer *));
1535 /* prevline[i] points to the linebuffer holding the last used line
1536 from file i. This is just for verifying that file i is properly
1538 prevline = (struct linebuffer **)
1539 xmalloc (nfiles * sizeof (struct linebuffer *));
1540 /* streams[i] holds the input stream for file i. */
1541 streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1542 /* file_lossage[i] is nonzero if we already know file i is not
1544 file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1546 /* Allocate and initialize all that storage. */
1548 for (i = 0; i < nfiles; i++)
1550 initbuffer (&lb1[i]);
1551 initbuffer (&lb2[i]);
1552 thisline[i] = &lb1[i];
1553 prevline[i] = &lb2[i];
1554 file_lossage[i] = 0;
1555 streams[i] = fopen (infiles[i], "r");
1557 pfatal_with_name (infiles[i]);
1559 readline (thisline[i], streams[i]);
1562 /* Keep count of number of files not at eof. */
1567 struct linebuffer *best = 0;
1568 struct linebuffer *exch;
1572 /* Look at the next avail line of each file; choose the least one. */
1574 for (i = 0; i < nfiles; i++)
1578 0 < compare_general (best->buffer, thisline[i]->buffer,
1579 (long) bestfile, (long) i, num_keyfields)))
1586 /* Output that line, unless it matches the previous one and we
1587 don't want duplicates. */
1590 !compare_general (prev_out->buffer,
1591 best->buffer, 0L, 1L, num_keyfields - 1)))
1592 indexify (best->buffer, ostream);
1595 /* Now make the line the previous of its file, and fetch a new
1596 line from that file. */
1598 exch = prevline[bestfile];
1599 prevline[bestfile] = thisline[bestfile];
1600 thisline[bestfile] = exch;
1604 /* If the file has no more, mark it empty. */
1606 if (feof (streams[bestfile]))
1608 thisline[bestfile] = 0;
1609 /* Update the number of files still not empty. */
1613 readline (thisline[bestfile], streams[bestfile]);
1614 if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
1619 finish_index (ostream);
1621 /* Free all storage and close all input streams. */
1623 for (i = 0; i < nfiles; i++)
1625 fclose (streams[i]);
1626 free (lb1[i].buffer);
1627 free (lb2[i].buffer);
1629 free (file_lossage);
1642 /* Print error message and exit. */
1645 fatal (const char *format, const char *arg)
1647 error (format, arg);
1651 /* Print error message. FORMAT is printf control string, ARG is arg for it. */
1653 error (const char *format, const char *arg)
1655 printf ("%s: ", program_name);
1656 printf (format, arg);
1657 if (format[strlen (format) -1] != '\n')
1662 perror_with_name (const char *name)
1664 fprintf (stderr, "%s: ", program_name);
1669 pfatal_with_name (const char *name)
1671 perror_with_name (name);
1676 /* Return a newly-allocated string concatenating S1 and S2. */
1679 concat (char *s1, char *s2)
1681 int len1 = strlen (s1), len2 = strlen (s2);
1682 char *result = (char *) xmalloc (len1 + len2 + 1);
1684 strcpy (result, s1);
1685 strcpy (result + len1, s2);
1686 *(result + len1 + len2) = 0;