contrib/diff/analyze.c

   1 /* Analyze file differences for GNU DIFF.
   2    Copyright (C) 1988, 1989, 1992, 1993 Free Software Foundation, Inc.
   3
   4 This file is part of GNU DIFF.
   5
   6 GNU DIFF is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU DIFF is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU DIFF; see the file COPYING.  If not, write to
  18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 /* The basic algorithm is described in:
  21    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  22    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  23    see especially section 4.2, which describes the variation used below.
  24    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
  25    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
  26    at the price of producing suboptimal output for large inputs with
  27    many differences.
  28
  29    The basic algorithm was independently discovered as described in:
  30    "Algorithms for Approximate String Matching", E. Ukkonen,
  31    Information and Control Vol. 64, 1985, pp. 100-118.  */
  32
  33 #include "diff.h"
  34 #include "cmpbuf.h"
  35
  36 extern int no_discards;
  37
  38 static int *xvec, *yvec;        /* Vectors being compared. */
  39 static int *fdiag;              /* Vector, indexed by diagonal, containing
  40                                    1 + the X coordinate of the point furthest
  41                                    along the given diagonal in the forward
  42                                    search of the edit matrix. */
  43 static int *bdiag;              /* Vector, indexed by diagonal, containing
  44                                    the X coordinate of the point furthest
  45                                    along the given diagonal in the backward
  46                                    search of the edit matrix. */
  47 static int too_expensive;       /* Edit scripts longer than this are too
  48                                    expensive to compute.  */
  49
  50 #define SNAKE_LIMIT 20  /* Snakes bigger than this are considered `big'.  */
  51
  52 struct partition
  53 {
  54   int xmid, ymid;       /* Midpoints of this partition.  */
  55   int lo_minimal;       /* Nonzero if low half will be analyzed minimally.  */
  56   int hi_minimal;       /* Likewise for high half.  */
  57 };
  58
  59 static int diag PARAMS((int, int, int, int, int, struct partition *));
  60 static struct change *add_change PARAMS((int, int, int, int, struct change *));
  61 static struct change *build_reverse_script PARAMS((struct file_data const[]));
  62 static struct change *build_script PARAMS((struct file_data const[]));
  63 static void briefly_report PARAMS((int, struct file_data const[]));
  64 static void compareseq PARAMS((int, int, int, int, int));
  65 static void discard_confusing_lines PARAMS((struct file_data[]));
  66 static void shift_boundaries PARAMS((struct file_data[]));
  67
  68 /* Find the midpoint of the shortest edit script for a specified
  69    portion of the two files.
  70
  71    Scan from the beginnings of the files, and simultaneously from the ends,
  72    doing a breadth-first search through the space of edit-sequence.
  73    When the two searches meet, we have found the midpoint of the shortest
  74    edit sequence.
  75
  76    If MINIMAL is nonzero, find the minimal edit script regardless
  77    of expense.  Otherwise, if the search is too expensive, use
  78    heuristics to stop the search and report a suboptimal answer.
  79
  80    Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal number
  81    XMID - YMID equals the number of inserted lines minus the number
  82    of deleted lines (counting only lines before the midpoint).
  83    Return the approximate edit cost; this is the total number of
  84    lines inserted or deleted (counting only lines before the midpoint),
  85    unless a heuristic is used to terminate the search prematurely.
  86
  87    Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the
  88    left half of the partition is known; similarly for PART->RIGHT_MINIMAL.
  89
  90    This function assumes that the first lines of the specified portions
  91    of the two files do not match, and likewise that the last lines do not
  92    match.  The caller must trim matching lines from the beginning and end
  93    of the portions it is going to specify.
  94
  95    If we return the "wrong" partitions,
  96    the worst this can do is cause suboptimal diff output.
  97    It cannot cause incorrect diff output.  */
  98
  99 static int
 100 diag (xoff, xlim, yoff, ylim, minimal, part)
 101      int xoff, xlim, yoff, ylim, minimal;
 102      struct partition *part;
 103 {
 104   int *const fd = fdiag;        /* Give the compiler a chance. */
 105   int *const bd = bdiag;        /* Additional help for the compiler. */
 106   int const *const xv = xvec;   /* Still more help for the compiler. */
 107   int const *const yv = yvec;   /* And more and more . . . */
 108   int const dmin = xoff - ylim; /* Minimum valid diagonal. */
 109   int const dmax = xlim - yoff; /* Maximum valid diagonal. */
 110   int const fmid = xoff - yoff; /* Center diagonal of top-down search. */
 111   int const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
 112   int fmin = fmid, fmax = fmid; /* Limits of top-down search. */
 113   int bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
 114   int c;                        /* Cost. */
 115   int odd = (fmid - bmid) & 1;  /* True if southeast corner is on an odd
 116                                    diagonal with respect to the northwest. */
 117
 118   fd[fmid] = xoff;
 119   bd[bmid] = xlim;
 120
 121   for (c = 1;; ++c)
 122     {
 123       int d;                    /* Active diagonal. */
 124       int big_snake = 0;
 125
 126       /* Extend the top-down search by an edit step in each diagonal. */
 127       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
 128       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
 129       for (d = fmax; d >= fmin; d -= 2)
 130         {
 131           int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
 132
 133           if (tlo >= thi)
 134             x = tlo + 1;
 135           else
 136             x = thi;
 137           oldx = x;
 138           y = x - d;
 139           while (x < xlim && y < ylim && xv[x] == yv[y])
 140             ++x, ++y;
 141           if (x - oldx > SNAKE_LIMIT)
 142             big_snake = 1;
 143           fd[d] = x;
 144           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
 145             {
 146               part->xmid = x;
 147               part->ymid = y;
 148               part->lo_minimal = part->hi_minimal = 1;
 149               return 2 * c - 1;
 150             }
 151         }
 152
 153       /* Similarly extend the bottom-up search.  */
 154       bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin;
 155       bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax;
 156       for (d = bmax; d >= bmin; d -= 2)
 157         {
 158           int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
 159
 160           if (tlo < thi)
 161             x = tlo;
 162           else
 163             x = thi - 1;
 164           oldx = x;
 165           y = x - d;
 166           while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
 167             --x, --y;
 168           if (oldx - x > SNAKE_LIMIT)
 169             big_snake = 1;
 170           bd[d] = x;
 171           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
 172             {
 173               part->xmid = x;
 174               part->ymid = y;
 175               part->lo_minimal = part->hi_minimal = 1;
 176               return 2 * c;
 177             }
 178         }
 179
 180       if (minimal)
 181         continue;
 182
 183       /* Heuristic: check occasionally for a diagonal that has made
 184          lots of progress compared with the edit distance.
 185          If we have any such, find the one that has made the most
 186          progress and return it as if it had succeeded.
 187
 188          With this heuristic, for files with a constant small density
 189          of changes, the algorithm is linear in the file size.  */
 190
 191       if (c > 200 && big_snake && heuristic)
 192         {
 193           int best;
 194
 195           best = 0;
 196           for (d = fmax; d >= fmin; d -= 2)
 197             {
 198               int dd = d - fmid;
 199               int x = fd[d];
 200               int y = x - d;
 201               int v = (x - xoff) * 2 - dd;
 202               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 203                 {
 204                   if (v > best
 205                       && xoff + SNAKE_LIMIT <= x && x < xlim
 206                       && yoff + SNAKE_LIMIT <= y && y < ylim)
 207                     {
 208                       /* We have a good enough best diagonal;
 209                          now insist that it end with a significant snake.  */
 210                       int k;
 211
 212                       for (k = 1; xv[x - k] == yv[y - k]; k++)
 213                         if (k == SNAKE_LIMIT)
 214                           {
 215                             best = v;
 216                             part->xmid = x;
 217                             part->ymid = y;
 218                             break;
 219                           }
 220                     }
 221                 }
 222             }
 223           if (best > 0)
 224             {
 225               part->lo_minimal = 1;
 226               part->hi_minimal = 0;
 227               return 2 * c - 1;
 228             }
 229
 230           best = 0;
 231           for (d = bmax; d >= bmin; d -= 2)
 232             {
 233               int dd = d - bmid;
 234               int x = bd[d];
 235               int y = x - d;
 236               int v = (xlim - x) * 2 + dd;
 237               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 238                 {
 239                   if (v > best
 240                       && xoff < x && x <= xlim - SNAKE_LIMIT
 241                       && yoff < y && y <= ylim - SNAKE_LIMIT)
 242                     {
 243                       /* We have a good enough best diagonal;
 244                          now insist that it end with a significant snake.  */
 245                       int k;
 246
 247                       for (k = 0; xv[x + k] == yv[y + k]; k++)
 248                         if (k == SNAKE_LIMIT - 1)
 249                           {
 250                             best = v;
 251                             part->xmid = x;
 252                             part->ymid = y;
 253                             break;
 254                           }
 255                     }
 256                 }
 257             }
 258           if (best > 0)
 259             {
 260               part->lo_minimal = 0;
 261               part->hi_minimal = 1;
 262               return 2 * c - 1;
 263             }
 264         }
 265
 266       /* Heuristic: if we've gone well beyond the call of duty,
 267          give up and report halfway between our best results so far.  */
 268       if (c >= too_expensive)
 269         {
 270           int fxybest, fxbest;
 271           int bxybest, bxbest;
 272
 273           fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
 274
 275           /* Find forward diagonal that maximizes X + Y.  */
 276           fxybest = -1;
 277           for (d = fmax; d >= fmin; d -= 2)
 278             {
 279               int x = min (fd[d], xlim);
 280               int y = x - d;
 281               if (ylim < y)
 282                 x = ylim + d, y = ylim;
 283               if (fxybest < x + y)
 284                 {
 285                   fxybest = x + y;
 286                   fxbest = x;
 287                 }
 288             }
 289
 290           /* Find backward diagonal that minimizes X + Y.  */
 291           bxybest = INT_MAX;
 292           for (d = bmax; d >= bmin; d -= 2)
 293             {
 294               int x = max (xoff, bd[d]);
 295               int y = x - d;
 296               if (y < yoff)
 297                 x = yoff + d, y = yoff;
 298               if (x + y < bxybest)
 299                 {
 300                   bxybest = x + y;
 301                   bxbest = x;
 302                 }
 303             }
 304
 305           /* Use the better of the two diagonals.  */
 306           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
 307             {
 308               part->xmid = fxbest;
 309               part->ymid = fxybest - fxbest;
 310               part->lo_minimal = 1;
 311               part->hi_minimal = 0;
 312             }
 313           else
 314             {
 315               part->xmid = bxbest;
 316               part->ymid = bxybest - bxbest;
 317               part->lo_minimal = 0;
 318               part->hi_minimal = 1;
 319             }
 320           return 2 * c - 1;
 321         }
 322     }
 323 }
 324 \f
 325 /* Compare in detail contiguous subsequences of the two files
 326    which are known, as a whole, to match each other.
 327
 328    The results are recorded in the vectors files[N].changed_flag, by
 329    storing a 1 in the element for each line that is an insertion or deletion.
 330
 331    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
 332
 333    Note that XLIM, YLIM are exclusive bounds.
 334    All line numbers are origin-0 and discarded lines are not counted.
 335
 336    If MINIMAL is nonzero, find a minimal difference no matter how
 337    expensive it is.  */
 338
 339 static void
 340 compareseq (xoff, xlim, yoff, ylim, minimal)
 341      int xoff, xlim, yoff, ylim, minimal;
 342 {
 343   int * const xv = xvec; /* Help the compiler.  */
 344   int * const yv = yvec;
 345
 346   /* Slide down the bottom initial diagonal. */
 347   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
 348     ++xoff, ++yoff;
 349   /* Slide up the top initial diagonal. */
 350   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
 351     --xlim, --ylim;
 352
 353   /* Handle simple cases. */
 354   if (xoff == xlim)
 355     while (yoff < ylim)
 356       files[1].changed_flag[files[1].realindexes[yoff++]] = 1;
 357   else if (yoff == ylim)
 358     while (xoff < xlim)
 359       files[0].changed_flag[files[0].realindexes[xoff++]] = 1;
 360   else
 361     {
 362       int c;
 363       struct partition part;
 364
 365       /* Find a point of correspondence in the middle of the files.  */
 366
 367       c = diag (xoff, xlim, yoff, ylim, minimal, &part);
 368
 369       if (c == 1)
 370         {
 371           /* This should be impossible, because it implies that
 372              one of the two subsequences is empty,
 373              and that case was handled above without calling `diag'.
 374              Let's verify that this is true.  */
 375           abort ();
 376 #if 0
 377           /* The two subsequences differ by a single insert or delete;
 378              record it and we are done.  */
 379           if (part.xmid - part.ymid < xoff - yoff)
 380             files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1;
 381           else
 382             files[0].changed_flag[files[0].realindexes[part.xmid]] = 1;
 383 #endif
 384         }
 385       else
 386         {
 387           /* Use the partitions to split this problem into subproblems.  */
 388           compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
 389           compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
 390         }
 391     }
 392 }
 393 \f
 394 /* Discard lines from one file that have no matches in the other file.
 395
 396    A line which is discarded will not be considered by the actual
 397    comparison algorithm; it will be as if that line were not in the file.
 398    The file's `realindexes' table maps virtual line numbers
 399    (which don't count the discarded lines) into real line numbers;
 400    this is how the actual comparison algorithm produces results
 401    that are comprehensible when the discarded lines are counted.
 402
 403    When we discard a line, we also mark it as a deletion or insertion
 404    so that it will be printed in the output.  */
 405
 406 static void
 407 discard_confusing_lines (filevec)
 408      struct file_data filevec[];
 409 {
 410   unsigned int f, i;
 411   char *discarded[2];
 412   int *equiv_count[2];
 413   int *p;
 414
 415   /* Allocate our results.  */
 416   p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
 417                        * (2 * sizeof (int)));
 418   for (f = 0; f < 2; f++)
 419     {
 420       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
 421       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
 422     }
 423
 424   /* Set up equiv_count[F][I] as the number of lines in file F
 425      that fall in equivalence class I.  */
 426
 427   p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int)));
 428   equiv_count[0] = p;
 429   equiv_count[1] = p + filevec[0].equiv_max;
 430   bzero (p, filevec[0].equiv_max * (2 * sizeof (int)));
 431
 432   for (i = 0; i < filevec[0].buffered_lines; ++i)
 433     ++equiv_count[0][filevec[0].equivs[i]];
 434   for (i = 0; i < filevec[1].buffered_lines; ++i)
 435     ++equiv_count[1][filevec[1].equivs[i]];
 436
 437   /* Set up tables of which lines are going to be discarded.  */
 438
 439   discarded[0] = xmalloc (sizeof (char)
 440                           * (filevec[0].buffered_lines
 441                              + filevec[1].buffered_lines));
 442   discarded[1] = discarded[0] + filevec[0].buffered_lines;
 443   bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines
 444                                         + filevec[1].buffered_lines));
 445
 446   /* Mark to be discarded each line that matches no line of the other file.
 447      If a line matches many lines, mark it as provisionally discardable.  */
 448
 449   for (f = 0; f < 2; f++)
 450     {
 451       unsigned int end = filevec[f].buffered_lines;
 452       char *discards = discarded[f];
 453       int *counts = equiv_count[1 - f];
 454       int *equivs = filevec[f].equivs;
 455       unsigned int many = 5;
 456       unsigned int tem = end / 64;
 457
 458       /* Multiply MANY by approximate square root of number of lines.
 459          That is the threshold for provisionally discardable lines.  */
 460       while ((tem = tem >> 2) > 0)
 461         many *= 2;
 462
 463       for (i = 0; i < end; i++)
 464         {
 465           int nmatch;
 466           if (equivs[i] == 0)
 467             continue;
 468           nmatch = counts[equivs[i]];
 469           if (nmatch == 0)
 470             discards[i] = 1;
 471           else if (nmatch > many)
 472             discards[i] = 2;
 473         }
 474     }
 475
 476   /* Don't really discard the provisional lines except when they occur
 477      in a run of discardables, with nonprovisionals at the beginning
 478      and end.  */
 479
 480   for (f = 0; f < 2; f++)
 481     {
 482       unsigned int end = filevec[f].buffered_lines;
 483       register char *discards = discarded[f];
 484
 485       for (i = 0; i < end; i++)
 486         {
 487           /* Cancel provisional discards not in middle of run of discards.  */
 488           if (discards[i] == 2)
 489             discards[i] = 0;
 490           else if (discards[i] != 0)
 491             {
 492               /* We have found a nonprovisional discard.  */
 493               register int j;
 494               unsigned int length;
 495               unsigned int provisional = 0;
 496
 497               /* Find end of this run of discardable lines.
 498                  Count how many are provisionally discardable.  */
 499               for (j = i; j < end; j++)
 500                 {
 501                   if (discards[j] == 0)
 502                     break;
 503                   if (discards[j] == 2)
 504                     ++provisional;
 505                 }
 506
 507               /* Cancel provisional discards at end, and shrink the run.  */
 508               while (j > i && discards[j - 1] == 2)
 509                 discards[--j] = 0, --provisional;
 510
 511               /* Now we have the length of a run of discardable lines
 512                  whose first and last are not provisional.  */
 513               length = j - i;
 514
 515               /* If 1/4 of the lines in the run are provisional,
 516                  cancel discarding of all provisional lines in the run.  */
 517               if (provisional * 4 > length)
 518                 {
 519                   while (j > i)
 520                     if (discards[--j] == 2)
 521                       discards[j] = 0;
 522                 }
 523               else
 524                 {
 525                   register unsigned int consec;
 526                   unsigned int minimum = 1;
 527                   unsigned int tem = length / 4;
 528
 529                   /* MINIMUM is approximate square root of LENGTH/4.
 530                      A subrun of two or more provisionals can stand
 531                      when LENGTH is at least 16.
 532                      A subrun of 4 or more can stand when LENGTH >= 64.  */
 533                   while ((tem = tem >> 2) > 0)
 534                     minimum *= 2;
 535                   minimum++;
 536
 537                   /* Cancel any subrun of MINIMUM or more provisionals
 538                      within the larger run.  */
 539                   for (j = 0, consec = 0; j < length; j++)
 540                     if (discards[i + j] != 2)
 541                       consec = 0;
 542                     else if (minimum == ++consec)
 543                       /* Back up to start of subrun, to cancel it all.  */
 544                       j -= consec;
 545                     else if (minimum < consec)
 546                       discards[i + j] = 0;
 547
 548                   /* Scan from beginning of run
 549                      until we find 3 or more nonprovisionals in a row
 550                      or until the first nonprovisional at least 8 lines in.
 551                      Until that point, cancel any provisionals.  */
 552                   for (j = 0, consec = 0; j < length; j++)
 553                     {
 554                       if (j >= 8 && discards[i + j] == 1)
 555                         break;
 556                       if (discards[i + j] == 2)
 557                         consec = 0, discards[i + j] = 0;
 558                       else if (discards[i + j] == 0)
 559                         consec = 0;
 560                       else
 561                         consec++;
 562                       if (consec == 3)
 563                         break;
 564                     }
 565
 566                   /* I advances to the last line of the run.  */
 567                   i += length - 1;
 568
 569                   /* Same thing, from end.  */
 570                   for (j = 0, consec = 0; j < length; j++)
 571                     {
 572                       if (j >= 8 && discards[i - j] == 1)
 573                         break;
 574                       if (discards[i - j] == 2)
 575                         consec = 0, discards[i - j] = 0;
 576                       else if (discards[i - j] == 0)
 577                         consec = 0;
 578                       else
 579                         consec++;
 580                       if (consec == 3)
 581                         break;
 582                     }
 583                 }
 584             }
 585         }
 586     }
 587
 588   /* Actually discard the lines. */
 589   for (f = 0; f < 2; f++)
 590     {
 591       char *discards = discarded[f];
 592       unsigned int end = filevec[f].buffered_lines;
 593       unsigned int j = 0;
 594       for (i = 0; i < end; ++i)
 595         if (no_discards || discards[i] == 0)
 596           {
 597             filevec[f].undiscarded[j] = filevec[f].equivs[i];
 598             filevec[f].realindexes[j++] = i;
 599           }
 600         else
 601           filevec[f].changed_flag[i] = 1;
 602       filevec[f].nondiscarded_lines = j;
 603     }
 604
 605   free (discarded[0]);
 606   free (equiv_count[0]);
 607 }
 608 \f
 609 /* Adjust inserts/deletes of identical lines to join changes
 610    as much as possible.
 611
 612    We do something when a run of changed lines include a
 613    line at one end and have an excluded, identical line at the other.
 614    We are free to choose which identical line is included.
 615    `compareseq' usually chooses the one at the beginning,
 616    but usually it is cleaner to consider the following identical line
 617    to be the "change".  */
 618
 619 int inhibit;
 620
 621 static void
 622 shift_boundaries (filevec)
 623      struct file_data filevec[];
 624 {
 625   int f;
 626
 627   if (inhibit)
 628     return;
 629
 630   for (f = 0; f < 2; f++)
 631     {
 632       char *changed = filevec[f].changed_flag;
 633       char const *other_changed = filevec[1-f].changed_flag;
 634       int const *equivs = filevec[f].equivs;
 635       int i = 0;
 636       int j = 0;
 637       int i_end = filevec[f].buffered_lines;
 638
 639       while (1)
 640         {
 641           int runlength, start, corresponding;
 642
 643           /* Scan forwards to find beginning of another run of changes.
 644              Also keep track of the corresponding point in the other file.  */
 645
 646           while (i < i_end && changed[i] == 0)
 647             {
 648               while (other_changed[j++])
 649                 continue;
 650               i++;
 651             }
 652
 653           if (i == i_end)
 654             break;
 655
 656           start = i;
 657
 658           /* Find the end of this run of changes.  */
 659
 660           while (changed[++i])
 661             continue;
 662           while (other_changed[j])
 663             j++;
 664
 665           do
 666             {
 667               /* Record the length of this run of changes, so that
 668                  we can later determine whether the run has grown.  */
 669               runlength = i - start;
 670
 671               /* Move the changed region back, so long as the
 672                  previous unchanged line matches the last changed one.
 673                  This merges with previous changed regions.  */
 674
 675               while (start && equivs[start - 1] == equivs[i - 1])
 676                 {
 677                   changed[--start] = 1;
 678                   changed[--i] = 0;
 679                   while (changed[start - 1])
 680                     start--;
 681                   while (other_changed[--j])
 682                     continue;
 683                 }
 684
 685               /* Set CORRESPONDING to the end of the changed run, at the last
 686                  point where it corresponds to a changed run in the other file.
 687                  CORRESPONDING == I_END means no such point has been found.  */
 688               corresponding = other_changed[j - 1] ? i : i_end;
 689
 690               /* Move the changed region forward, so long as the
 691                  first changed line matches the following unchanged one.
 692                  This merges with following changed regions.
 693                  Do this second, so that if there are no merges,
 694                  the changed region is moved forward as far as possible.  */
 695
 696               while (i != i_end && equivs[start] == equivs[i])
 697                 {
 698                   changed[start++] = 0;
 699                   changed[i++] = 1;
 700                   while (changed[i])
 701                     i++;
 702                   while (other_changed[++j])
 703                     corresponding = i;
 704                 }
 705             }
 706           while (runlength != i - start);
 707
 708           /* If possible, move the fully-merged run of changes
 709              back to a corresponding run in the other file.  */
 710
 711           while (corresponding < i)
 712             {
 713               changed[--start] = 1;
 714               changed[--i] = 0;
 715               while (other_changed[--j])
 716                 continue;
 717             }
 718         }
 719     }
 720 }
 721 \f
 722 /* Cons an additional entry onto the front of an edit script OLD.
 723    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
 724    DELETED is the number of lines deleted here from file 0.
 725    INSERTED is the number of lines inserted here in file 1.
 726
 727    If DELETED is 0 then LINE0 is the number of the line before
 728    which the insertion was done; vice versa for INSERTED and LINE1.  */
 729
 730 static struct change *
 731 add_change (line0, line1, deleted, inserted, old)
 732      int line0, line1, deleted, inserted;
 733      struct change *old;
 734 {
 735   struct change *new = (struct change *) xmalloc (sizeof (struct change));
 736
 737   new->line0 = line0;
 738   new->line1 = line1;
 739   new->inserted = inserted;
 740   new->deleted = deleted;
 741   new->link = old;
 742   return new;
 743 }
 744
 745 /* Scan the tables of which lines are inserted and deleted,
 746    producing an edit script in reverse order.  */
 747
 748 static struct change *
 749 build_reverse_script (filevec)
 750      struct file_data const filevec[];
 751 {
 752   struct change *script = 0;
 753   char *changed0 = filevec[0].changed_flag;
 754   char *changed1 = filevec[1].changed_flag;
 755   int len0 = filevec[0].buffered_lines;
 756   int len1 = filevec[1].buffered_lines;
 757
 758   /* Note that changedN[len0] does exist, and contains 0.  */
 759
 760   int i0 = 0, i1 = 0;
 761
 762   while (i0 < len0 || i1 < len1)
 763     {
 764       if (changed0[i0] || changed1[i1])
 765         {
 766           int line0 = i0, line1 = i1;
 767
 768           /* Find # lines changed here in each file.  */
 769           while (changed0[i0]) ++i0;
 770           while (changed1[i1]) ++i1;
 771
 772           /* Record this change.  */
 773           script = add_change (line0, line1, i0 - line0, i1 - line1, script);
 774         }
 775
 776       /* We have reached lines in the two files that match each other.  */
 777       i0++, i1++;
 778     }
 779
 780   return script;
 781 }
 782
 783 /* Scan the tables of which lines are inserted and deleted,
 784    producing an edit script in forward order.  */
 785
 786 static struct change *
 787 build_script (filevec)
 788      struct file_data const filevec[];
 789 {
 790   struct change *script = 0;
 791   char *changed0 = filevec[0].changed_flag;
 792   char *changed1 = filevec[1].changed_flag;
 793   int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
 794
 795   /* Note that changedN[-1] does exist, and contains 0.  */
 796
 797   while (i0 >= 0 || i1 >= 0)
 798     {
 799       if (changed0[i0 - 1] || changed1[i1 - 1])
 800         {
 801           int line0 = i0, line1 = i1;
 802
 803           /* Find # lines changed here in each file.  */
 804           while (changed0[i0 - 1]) --i0;
 805           while (changed1[i1 - 1]) --i1;
 806
 807           /* Record this change.  */
 808           script = add_change (i0, i1, line0 - i0, line1 - i1, script);
 809         }
 810
 811       /* We have reached lines in the two files that match each other.  */
 812       i0--, i1--;
 813     }
 814
 815   return script;
 816 }
 817 \f
 818 /* If CHANGES, briefly report that two files differed.  */
 819 static void
 820 briefly_report (changes, filevec)
 821      int changes;
 822      struct file_data const filevec[];
 823 {
 824   if (changes)
 825     message (no_details_flag ? "Files %s and %s differ\n"
 826              : "Binary files %s and %s differ\n",
 827              filevec[0].name, filevec[1].name);
 828 }
 829
 830 /* Report the differences of two files.  DEPTH is the current directory
 831    depth. */
 832 int
 833 diff_2_files (filevec, depth)
 834      struct file_data filevec[];
 835      int depth;
 836 {
 837   int diags;
 838   int i;
 839   struct change *e, *p;
 840   struct change *script;
 841   int changes;
 842
 843
 844   /* If we have detected that either file is binary,
 845      compare the two files as binary.  This can happen
 846      only when the first chunk is read.
 847      Also, --brief without any --ignore-* options means
 848      we can speed things up by treating the files as binary.  */
 849
 850   if (read_files (filevec, no_details_flag & ~ignore_some_changes))
 851     {
 852       /* Files with different lengths must be different.  */
 853       if (filevec[0].stat.st_size != filevec[1].stat.st_size
 854           && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode))
 855           && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode)))
 856         changes = 1;
 857
 858       /* Standard input equals itself.  */
 859       else if (filevec[0].desc == filevec[1].desc)
 860         changes = 0;
 861
 862       else
 863         /* Scan both files, a buffer at a time, looking for a difference.  */
 864         {
 865           /* Allocate same-sized buffers for both files.  */
 866           size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat),
 867                                            STAT_BLOCKSIZE (filevec[1].stat));
 868           for (i = 0; i < 2; i++)
 869             filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size);
 870
 871           for (;;  filevec[0].buffered_chars = filevec[1].buffered_chars = 0)
 872             {
 873               /* Read a buffer's worth from both files.  */
 874               for (i = 0; i < 2; i++)
 875                 if (0 <= filevec[i].desc)
 876                   while (filevec[i].buffered_chars != buffer_size)
 877                     {
 878                       int r = read (filevec[i].desc,
 879                                     filevec[i].buffer
 880                                     + filevec[i].buffered_chars,
 881                                     buffer_size - filevec[i].buffered_chars);
 882                       if (r == 0)
 883                         break;
 884                       if (r < 0)
 885                         pfatal_with_name (filevec[i].name);
 886                       filevec[i].buffered_chars += r;
 887                     }
 888
 889               /* If the buffers differ, the files differ.  */
 890               if (filevec[0].buffered_chars != filevec[1].buffered_chars
 891                   || (filevec[0].buffered_chars != 0
 892                       && memcmp (filevec[0].buffer,
 893                                  filevec[1].buffer,
 894                                  filevec[0].buffered_chars) != 0))
 895                 {
 896                   changes = 1;
 897                   break;
 898                 }
 899
 900               /* If we reach end of file, the files are the same.  */
 901               if (filevec[0].buffered_chars != buffer_size)
 902                 {
 903                   changes = 0;
 904                   break;
 905                 }
 906             }
 907         }
 908
 909       briefly_report (changes, filevec);
 910     }
 911   else
 912     {
 913       /* Allocate vectors for the results of comparison:
 914          a flag for each line of each file, saying whether that line
 915          is an insertion or deletion.
 916          Allocate an extra element, always zero, at each end of each vector.  */
 917
 918       size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
 919       filevec[0].changed_flag = xmalloc (s);
 920       bzero (filevec[0].changed_flag, s);
 921       filevec[0].changed_flag++;
 922       filevec[1].changed_flag = filevec[0].changed_flag
 923                                 + filevec[0].buffered_lines + 2;
 924
 925       /* Some lines are obviously insertions or deletions
 926          because they don't match anything.  Detect them now, and
 927          avoid even thinking about them in the main comparison algorithm.  */
 928
 929       discard_confusing_lines (filevec);
 930
 931       /* Now do the main comparison algorithm, considering just the
 932          undiscarded lines.  */
 933
 934       xvec = filevec[0].undiscarded;
 935       yvec = filevec[1].undiscarded;
 936       diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
 937       fdiag = (int *) xmalloc (diags * (2 * sizeof (int)));
 938       bdiag = fdiag + diags;
 939       fdiag += filevec[1].nondiscarded_lines + 1;
 940       bdiag += filevec[1].nondiscarded_lines + 1;
 941
 942       /* Set TOO_EXPENSIVE to be approximate square root of input size,
 943          bounded below by 256.  */
 944       too_expensive = 1;
 945       for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
 946            i != 0; i >>= 2)
 947         too_expensive <<= 1;
 948       too_expensive = max (256, too_expensive);
 949
 950       files[0] = filevec[0];
 951       files[1] = filevec[1];
 952
 953       compareseq (0, filevec[0].nondiscarded_lines,
 954                   0, filevec[1].nondiscarded_lines, no_discards);
 955
 956       free (fdiag - (filevec[1].nondiscarded_lines + 1));
 957
 958       /* Modify the results slightly to make them prettier
 959          in cases where that can validly be done.  */
 960
 961       shift_boundaries (filevec);
 962
 963       /* Get the results of comparison in the form of a chain
 964          of `struct change's -- an edit script.  */
 965
 966       if (output_style == OUTPUT_ED)
 967         script = build_reverse_script (filevec);
 968       else
 969         script = build_script (filevec);
 970
 971       /* Set CHANGES if we had any diffs.
 972          If some changes are ignored, we must scan the script to decide.  */
 973       if (ignore_blank_lines_flag || ignore_regexp_list)
 974         {
 975           struct change *next = script;
 976           changes = 0;
 977
 978           while (next && changes == 0)
 979             {
 980               struct change *this, *end;
 981               int first0, last0, first1, last1, deletes, inserts;
 982
 983               /* Find a set of changes that belong together.  */
 984               this = next;
 985               end = find_change (next);
 986
 987               /* Disconnect them from the rest of the changes, making them
 988                  a hunk, and remember the rest for next iteration.  */
 989               next = end->link;
 990               end->link = 0;
 991
 992               /* Determine whether this hunk is really a difference.  */
 993               analyze_hunk (this, &first0, &last0, &first1, &last1,
 994                             &deletes, &inserts);
 995
 996               /* Reconnect the script so it will all be freed properly.  */
 997               end->link = next;
 998
 999               if (deletes || inserts)
1000                 changes = 1;
1001             }
1002         }
1003       else
1004         changes = (script != 0);
1005
1006       if (no_details_flag)
1007         briefly_report (changes, filevec);
1008       else
1009         {
1010           if (changes || ! no_diff_means_no_output)
1011             {
1012               /* Record info for starting up output,
1013                  to be used if and when we have some output to print.  */
1014               setup_output (files[0].name, files[1].name, depth);
1015
1016               switch (output_style)
1017                 {
1018                 case OUTPUT_CONTEXT:
1019                   print_context_script (script, 0);
1020                   break;
1021
1022                 case OUTPUT_UNIFIED:
1023                   print_context_script (script, 1);
1024                   break;
1025
1026                 case OUTPUT_ED:
1027                   print_ed_script (script);
1028                   break;
1029
1030                 case OUTPUT_FORWARD_ED:
1031                   pr_forward_ed_script (script);
1032                   break;
1033
1034                 case OUTPUT_RCS:
1035                   print_rcs_script (script);
1036                   break;
1037
1038                 case OUTPUT_NORMAL:
1039                   print_normal_script (script);
1040                   break;
1041
1042                 case OUTPUT_IFDEF:
1043                   print_ifdef_script (script);
1044                   break;
1045
1046                 case OUTPUT_SDIFF:
1047                   print_sdiff_script (script);
1048                 }
1049
1050               finish_output ();
1051             }
1052         }
1053
1054       free (filevec[0].undiscarded);
1055
1056       free (filevec[0].changed_flag - 1);
1057
1058       for (i = 1; i >= 0; --i)
1059         free (filevec[i].equivs);
1060
1061       for (i = 0; i < 2; ++i)
1062         free (filevec[i].linbuf + filevec[i].linbuf_base);
1063
1064       for (e = script; e; e = p)
1065         {
1066           p = e->link;
1067           free (e);
1068         }
1069
1070       if (! ROBUST_OUTPUT_STYLE (output_style))
1071         for (i = 0; i < 2; ++i)
1072           if (filevec[i].missing_newline)
1073             {
1074               error ("No newline at end of file %s", filevec[i].name, "");
1075               changes = 2;
1076             }
1077     }
1078
1079   if (filevec[0].buffer != filevec[1].buffer)
1080     free (filevec[0].buffer);
1081   free (filevec[1].buffer);
1082
1083   return changes;
1084 }