1 /* File I/O for GNU DIFF.
2 Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
4 This file is part of GNU DIFF.
6 GNU DIFF is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU DIFF is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU DIFF; see the file COPYING. If not, write to
18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
22 /* Rotate a value n bits to the left. */
23 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
24 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
26 /* Given a hash value and a new character, return a new hash value. */
27 #define HASH(h, c) ((c) + ROL (h, 7))
29 /* Guess remaining number of lines from number N of lines so far,
30 size S so far, and total size T. */
31 #define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
33 /* Type used for fast prefix comparison in find_identical_ends. */
38 /* Lines are put into equivalence classes (of lines that match in line_cmp).
39 Each equivalence class is represented by one of these structures,
40 but only while the classes are being computed.
41 Afterward, each class is represented by a number. */
44 int next; /* Next item in this bucket. */
45 unsigned hash; /* Hash of lines in this class. */
46 char const *line; /* A line that fits this class. */
47 size_t length; /* That line's length, not counting its newline. */
50 /* Hash-table: array of buckets, each being a chain of equivalence classes.
51 buckets[-1] is reserved for incomplete lines. */
54 /* Number of buckets in the hash table array, not counting buckets[-1]. */
57 /* Array in which the equivalence classes are allocated.
58 The bucket-chains go through the elements in this array.
59 The number of an equivalence class is its index in this array. */
60 static struct equivclass *equivs;
62 /* Index of first free element in the array `equivs'. */
63 static int equivs_index;
65 /* Number of elements allocated in the array `equivs'. */
66 static int equivs_alloc;
68 static void find_and_hash_each_line PARAMS((struct file_data *));
69 static void find_identical_ends PARAMS((struct file_data[]));
70 static void prepare_text_end PARAMS((struct file_data *));
72 /* Check for binary files and compare them for exact identity. */
74 /* Return 1 if BUF contains a non text character.
75 SIZE is the number of characters in BUF. */
77 #define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
79 /* Get ready to read the current file.
80 Return nonzero if SKIP_TEST is zero,
81 and if it appears to be a binary file. */
84 sip (current, skip_test)
85 struct file_data *current;
88 /* If we have a nonexistent file at this stage, treat it as empty. */
89 if (current->desc < 0)
91 /* Leave room for a sentinel. */
92 current->bufsize = sizeof (word);
93 current->buffer = xmalloc (current->bufsize);
97 current->bufsize = STAT_BLOCKSIZE (current->stat);
98 current->buffer = xmalloc (current->bufsize);
102 /* Check first part of file to see if it's a binary file. */
104 int oldmode = setmode (current->desc, O_BINARY);
106 size_t n = read (current->desc, current->buffer, current->bufsize);
108 pfatal_with_name (current->name);
109 current->buffered_chars = n;
111 if (oldmode != O_BINARY)
113 if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
114 pfatal_with_name (current->name);
115 setmode (current->desc, oldmode);
116 current->buffered_chars = 0;
119 return binary_file_p (current->buffer, n);
123 current->buffered_chars = 0;
127 /* Slurp the rest of the current file completely into memory. */
131 struct file_data *current;
135 if (current->desc < 0)
136 /* The file is nonexistent. */
138 else if (S_ISREG (current->stat.st_mode))
140 /* It's a regular file; slurp in the rest all at once. */
142 /* Get the size out of the stat block.
143 Allocate enough room for appended newline and sentinel. */
144 cc = current->stat.st_size + 1 + sizeof (word);
145 if (current->bufsize < cc)
147 current->bufsize = cc;
148 current->buffer = xrealloc (current->buffer, cc);
151 if (current->buffered_chars < current->stat.st_size)
153 cc = read (current->desc,
154 current->buffer + current->buffered_chars,
155 current->stat.st_size - current->buffered_chars);
157 pfatal_with_name (current->name);
158 current->buffered_chars += cc;
161 /* It's not a regular file; read it, growing the buffer as needed. */
162 else if (always_text_flag || current->buffered_chars != 0)
166 if (current->buffered_chars == current->bufsize)
168 current->bufsize = current->bufsize * 2;
169 current->buffer = xrealloc (current->buffer, current->bufsize);
171 cc = read (current->desc,
172 current->buffer + current->buffered_chars,
173 current->bufsize - current->buffered_chars);
177 pfatal_with_name (current->name);
178 current->buffered_chars += cc;
180 /* Allocate just enough room for appended newline and sentinel. */
181 current->bufsize = current->buffered_chars + 1 + sizeof (word);
182 current->buffer = xrealloc (current->buffer, current->bufsize);
186 /* Split the file into lines, simultaneously computing the equivalence class for
190 find_and_hash_each_line (current)
191 struct file_data *current;
194 unsigned char const *p = (unsigned char const *) current->prefix_end;
199 /* Cache often-used quantities in local variables to help the compiler. */
200 char const **linbuf = current->linbuf;
201 int alloc_lines = current->alloc_lines;
203 int linbuf_base = current->linbuf_base;
204 int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
205 struct equivclass *eqs = equivs;
206 int eqs_index = equivs_index;
207 int eqs_alloc = equivs_alloc;
208 char const *suffix_begin = current->suffix_begin;
209 char const *bufend = current->buffer + current->buffered_chars;
210 int use_line_cmp = ignore_some_line_changes;
212 while ((char const *) p < suffix_begin)
214 char const *ip = (char const *) p;
216 /* Compute the equivalence class for this line. */
220 /* Hash this line until we find a newline. */
221 if (ignore_case_flag)
223 if (ignore_all_space_flag)
224 while ((c = *p++) != '\n')
227 h = HASH (h, ISUPPER (c) ? tolower (c) : c);
229 else if (ignore_space_change_flag)
230 while ((c = *p++) != '\n')
244 /* C is now the first non-space. */
245 h = HASH (h, ISUPPER (c) ? tolower (c) : c);
248 while ((c = *p++) != '\n')
249 h = HASH (h, ISUPPER (c) ? tolower (c) : c);
253 if (ignore_all_space_flag)
254 while ((c = *p++) != '\n')
259 else if (ignore_space_change_flag)
260 while ((c = *p++) != '\n')
274 /* C is now the first non-space. */
278 while ((c = *p++) != '\n')
283 bucket = &buckets[h % nbuckets];
284 length = (char const *) p - ip - 1;
286 if ((char const *) p == bufend
287 && current->missing_newline
288 && ROBUST_OUTPUT_STYLE (output_style))
290 /* This line is incomplete. If this is significant,
291 put the line into bucket[-1]. */
292 if (! (ignore_space_change_flag | ignore_all_space_flag))
293 bucket = &buckets[-1];
295 /* Omit the inserted newline when computing linbuf later. */
297 bufend = suffix_begin = (char const *) p;
300 for (i = *bucket; ; i = eqs[i].next)
303 /* Create a new equivalence class in this bucket. */
306 eqs = (struct equivclass *)
307 xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
308 eqs[i].next = *bucket;
311 eqs[i].length = length;
315 else if (eqs[i].hash == h)
317 char const *eqline = eqs[i].line;
319 /* Reuse existing equivalence class if the lines are identical.
320 This detects the common case of exact identity
321 faster than complete comparison would. */
322 if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
325 /* Reuse existing class if line_cmp reports the lines equal. */
326 if (use_line_cmp && line_cmp (eqline, ip) == 0)
330 /* Maybe increase the size of the line table. */
331 if (line == alloc_lines)
333 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
334 alloc_lines = 2 * alloc_lines - linbuf_base;
335 cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
336 linbuf = (char const **) xrealloc (linbuf + linbuf_base,
337 (alloc_lines - linbuf_base)
346 current->buffered_lines = line;
350 /* Record the line start for lines in the suffix that we care about.
351 Record one more line start than lines,
352 so that we can compute the length of any buffered line. */
353 if (line == alloc_lines)
355 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
356 alloc_lines = 2 * alloc_lines - linbuf_base;
357 linbuf = (char const **) xrealloc (linbuf + linbuf_base,
358 (alloc_lines - linbuf_base)
362 linbuf[line] = (char const *) p;
364 if ((char const *) p == bufend)
367 if (context <= i && no_diff_means_no_output)
376 /* Done with cache in local variables. */
377 current->linbuf = linbuf;
378 current->valid_lines = line;
379 current->alloc_lines = alloc_lines;
380 current->equivs = cureqs;
382 equivs_alloc = eqs_alloc;
383 equivs_index = eqs_index;
386 /* Prepare the end of the text. Make sure it's initialized.
387 Make sure text ends in a newline,
388 but remember that we had to add one. */
391 prepare_text_end (current)
392 struct file_data *current;
394 size_t buffered_chars = current->buffered_chars;
395 char *p = current->buffer;
397 if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
398 current->missing_newline = 0;
401 p[buffered_chars++] = '\n';
402 current->buffered_chars = buffered_chars;
403 current->missing_newline = 1;
406 /* Don't use uninitialized storage when planting or using sentinels. */
408 bzero (p + buffered_chars, sizeof (word));
411 /* Given a vector of two file_data objects, find the identical
412 prefixes and suffixes of each object. */
415 find_identical_ends (filevec)
416 struct file_data filevec[];
419 char *p0, *p1, *buffer0, *buffer1;
420 char const *end0, *beg0;
421 char const **linbuf0, **linbuf1;
424 int alloc_lines0, alloc_lines1;
425 int buffered_prefix, prefix_count, prefix_mask;
428 if (filevec[0].desc != filevec[1].desc)
432 filevec[1].buffer = filevec[0].buffer;
433 filevec[1].bufsize = filevec[0].bufsize;
434 filevec[1].buffered_chars = filevec[0].buffered_chars;
436 for (i = 0; i < 2; i++)
437 prepare_text_end (&filevec[i]);
439 /* Find identical prefix. */
441 p0 = buffer0 = filevec[0].buffer;
442 p1 = buffer1 = filevec[1].buffer;
444 n0 = filevec[0].buffered_chars;
445 n1 = filevec[1].buffered_chars;
448 /* The buffers are the same; sentinels won't work. */
452 /* Insert end sentinels, in this case characters that are guaranteed
453 to make the equality test false, and thus terminate the loop. */
460 /* Loop until first mismatch, or to the sentinel characters. */
462 /* Compare a word at a time for speed. */
465 while (*w0++ == *w1++)
469 /* Do the last few bytes of comparison a byte at a time. */
472 while (*p0++ == *p1++)
476 /* Don't mistakenly count missing newline as part of prefix. */
477 if (ROBUST_OUTPUT_STYLE (output_style)
478 && (buffer0 + n0 - filevec[0].missing_newline < p0)
480 (buffer1 + n1 - filevec[1].missing_newline < p1))
484 /* Now P0 and P1 point at the first nonmatching characters. */
486 /* Skip back to last line-beginning in the prefix,
487 and then discard up to HORIZON_LINES lines from the prefix. */
489 while (p0 != buffer0 && (p0[-1] != '\n' || i--))
492 /* Record the prefix. */
493 filevec[0].prefix_end = p0;
494 filevec[1].prefix_end = p1;
496 /* Find identical suffix. */
498 /* P0 and P1 point beyond the last chars not yet compared. */
502 if (! ROBUST_OUTPUT_STYLE (output_style)
503 || filevec[0].missing_newline == filevec[1].missing_newline)
505 end0 = p0; /* Addr of last char in file 0. */
507 /* Get value of P0 at which we should stop scanning backward:
508 this is when either P0 or P1 points just past the last char
509 of the identical prefix. */
510 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
512 /* Scan back until chars don't match or we reach that point. */
516 /* Point at the first char of the matching suffix. */
522 /* Are we at a line-beginning in both files? If not, add the rest of
523 this line to the main body. Discard up to HORIZON_LINES lines from
524 the identical suffix. Also, discard one extra line,
525 because shift_boundaries may need it. */
526 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
528 (buffer1 == p1 || p1[-1] == '\n'));
529 while (i-- && p0 != end0)
530 while (*p0++ != '\n')
536 /* Record the suffix. */
537 filevec[0].suffix_begin = p0;
538 filevec[1].suffix_begin = p1;
540 /* Calculate number of lines of prefix to save.
542 prefix_count == 0 means save the whole prefix;
543 we need this with for options like -D that output the whole file.
544 We also need it for options like -F that output some preceding line;
545 at least we will need to find the last few lines,
546 but since we don't know how many, it's easiest to find them all.
548 Otherwise, prefix_count != 0. Save just prefix_count lines at start
549 of the line buffer; they'll be moved to the proper location later.
550 Handle 1 more line than the context says (because we count 1 too many),
551 rounded up to the next power of 2 to speed index computation. */
553 if (no_diff_means_no_output && ! function_regexp_list)
555 for (prefix_count = 1; prefix_count < context + 1; prefix_count *= 2)
557 prefix_mask = prefix_count - 1;
560 + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
567 alloc_lines0 = GUESS_LINES (0, 0, n0);
571 linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
573 /* If the prefix is needed, find the prefix lines. */
574 if (! (no_diff_means_no_output
575 && filevec[0].prefix_end == p0
576 && filevec[1].prefix_end == p1))
579 end0 = filevec[0].prefix_end;
582 int l = lines++ & prefix_mask;
583 if (l == alloc_lines0)
584 linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
587 while (*p0++ != '\n')
591 buffered_prefix = prefix_count && context < lines ? context : lines;
593 /* Allocate line buffer 1. */
594 tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
598 + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
600 linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
602 if (buffered_prefix != lines)
604 /* Rotate prefix lines to proper location. */
605 for (i = 0; i < buffered_prefix; i++)
606 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
607 for (i = 0; i < buffered_prefix; i++)
608 linbuf0[i] = linbuf1[i];
611 /* Initialize line buffer 1 from line buffer 0. */
612 for (i = 0; i < buffered_prefix; i++)
613 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
615 /* Record the line buffer, adjusted so that
616 linbuf*[0] points at the first differing line. */
617 filevec[0].linbuf = linbuf0 + buffered_prefix;
618 filevec[1].linbuf = linbuf1 + buffered_prefix;
619 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
620 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
621 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
622 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
625 /* Largest primes less than some power of two, for nbuckets. Values range
626 from useful to preposterous. If one of these numbers isn't prime
627 after all, don't blame it on me, blame it on primes (6) . . . */
628 static int const primes[] =
648 67108859, /* Preposterously large . . . */
658 /* Given a vector of two file_data objects, read the file associated
659 with each one, and build the table of equivalence classes.
660 Return 1 if either file appears to be a binary file.
661 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
664 read_files (filevec, pretend_binary)
665 struct file_data filevec[];
669 int skip_test = always_text_flag | pretend_binary;
670 int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
672 if (filevec[0].desc != filevec[1].desc)
673 appears_binary |= sip (&filevec[1], skip_test | appears_binary);
676 filevec[1].buffer = filevec[0].buffer;
677 filevec[1].bufsize = filevec[0].bufsize;
678 filevec[1].buffered_chars = filevec[0].buffered_chars;
683 setmode (filevec[0].desc, O_BINARY);
684 setmode (filevec[1].desc, O_BINARY);
689 find_identical_ends (filevec);
691 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
692 equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
693 /* Equivalence class 0 is permanently safe for lines that were not
694 hashed. Real equivalence classes start at 1. */
697 for (i = 0; primes[i] < equivs_alloc / 3; i++)
700 nbuckets = primes[i];
702 buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
703 bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
705 for (i = 0; i < 2; i++)
706 find_and_hash_each_line (&filevec[i]);
708 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;