contrib/diff/src/analyze.c

   1 /* Analyze file differences for GNU DIFF.
   2
   3    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
   4    2004 Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    GNU DIFF is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GNU DIFF is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.
  20    If not, write to the Free Software Foundation,
  21    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  22
  23 /* The basic algorithm is described in:
  24    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  25    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  26    see especially section 4.2, which describes the variation used below.
  27    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
  28    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
  29    at the price of producing suboptimal output for large inputs with
  30    many differences.
  31
  32    The basic algorithm was independently discovered as described in:
  33    "Algorithms for Approximate String Matching", E. Ukkonen,
  34    Information and Control Vol. 64, 1985, pp. 100-118.  */
  35
  36 #include "diff.h"
  37 #include <cmpbuf.h>
  38 #include <error.h>
  39 #include <file-type.h>
  40 #include <xalloc.h>
  41
  42 static lin *xvec, *yvec;        /* Vectors being compared. */
  43 static lin *fdiag;              /* Vector, indexed by diagonal, containing
  44                                    1 + the X coordinate of the point furthest
  45                                    along the given diagonal in the forward
  46                                    search of the edit matrix. */
  47 static lin *bdiag;              /* Vector, indexed by diagonal, containing
  48                                    the X coordinate of the point furthest
  49                                    along the given diagonal in the backward
  50                                    search of the edit matrix. */
  51 static lin too_expensive;       /* Edit scripts longer than this are too
  52                                    expensive to compute.  */
  53
  54 #define SNAKE_LIMIT 20  /* Snakes bigger than this are considered `big'.  */
  55
  56 struct partition
  57 {
  58   lin xmid, ymid;       /* Midpoints of this partition.  */
  59   bool lo_minimal;      /* Nonzero if low half will be analyzed minimally.  */
  60   bool hi_minimal;      /* Likewise for high half.  */
  61 };
  62
  63 /* Find the midpoint of the shortest edit script for a specified
  64    portion of the two files.
  65
  66    Scan from the beginnings of the files, and simultaneously from the ends,
  67    doing a breadth-first search through the space of edit-sequence.
  68    When the two searches meet, we have found the midpoint of the shortest
  69    edit sequence.
  70
  71    If FIND_MINIMAL is nonzero, find the minimal edit script regardless
  72    of expense.  Otherwise, if the search is too expensive, use
  73    heuristics to stop the search and report a suboptimal answer.
  74
  75    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
  76    XMID - YMID equals the number of inserted lines minus the number
  77    of deleted lines (counting only lines before the midpoint).
  78
  79    Set PART->lo_minimal to true iff the minimal edit script for the
  80    left half of the partition is known; similarly for PART->hi_minimal.
  81
  82    This function assumes that the first lines of the specified portions
  83    of the two files do not match, and likewise that the last lines do not
  84    match.  The caller must trim matching lines from the beginning and end
  85    of the portions it is going to specify.
  86
  87    If we return the "wrong" partitions,
  88    the worst this can do is cause suboptimal diff output.
  89    It cannot cause incorrect diff output.  */
  90
  91 static void
  92 diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
  93       struct partition *part)
  94 {
  95   lin *const fd = fdiag;        /* Give the compiler a chance. */
  96   lin *const bd = bdiag;        /* Additional help for the compiler. */
  97   lin const *const xv = xvec;   /* Still more help for the compiler. */
  98   lin const *const yv = yvec;   /* And more and more . . . */
  99   lin const dmin = xoff - ylim; /* Minimum valid diagonal. */
 100   lin const dmax = xlim - yoff; /* Maximum valid diagonal. */
 101   lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */
 102   lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
 103   lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */
 104   lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
 105   lin c;                        /* Cost. */
 106   bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
 107                                    diagonal with respect to the northwest. */
 108
 109   fd[fmid] = xoff;
 110   bd[bmid] = xlim;
 111
 112   for (c = 1;; ++c)
 113     {
 114       lin d;                    /* Active diagonal. */
 115       bool big_snake = false;
 116
 117       /* Extend the top-down search by an edit step in each diagonal. */
 118       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
 119       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
 120       for (d = fmax; d >= fmin; d -= 2)
 121         {
 122           lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
 123
 124           if (tlo >= thi)
 125             x = tlo + 1;
 126           else
 127             x = thi;
 128           oldx = x;
 129           y = x - d;
 130           while (x < xlim && y < ylim && xv[x] == yv[y])
 131             ++x, ++y;
 132           if (x - oldx > SNAKE_LIMIT)
 133             big_snake = true;
 134           fd[d] = x;
 135           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
 136             {
 137               part->xmid = x;
 138               part->ymid = y;
 139               part->lo_minimal = part->hi_minimal = true;
 140               return;
 141             }
 142         }
 143
 144       /* Similarly extend the bottom-up search.  */
 145       bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
 146       bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
 147       for (d = bmax; d >= bmin; d -= 2)
 148         {
 149           lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
 150
 151           if (tlo < thi)
 152             x = tlo;
 153           else
 154             x = thi - 1;
 155           oldx = x;
 156           y = x - d;
 157           while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
 158             --x, --y;
 159           if (oldx - x > SNAKE_LIMIT)
 160             big_snake = true;
 161           bd[d] = x;
 162           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
 163             {
 164               part->xmid = x;
 165               part->ymid = y;
 166               part->lo_minimal = part->hi_minimal = true;
 167               return;
 168             }
 169         }
 170
 171       if (find_minimal)
 172         continue;
 173
 174       /* Heuristic: check occasionally for a diagonal that has made
 175          lots of progress compared with the edit distance.
 176          If we have any such, find the one that has made the most
 177          progress and return it as if it had succeeded.
 178
 179          With this heuristic, for files with a constant small density
 180          of changes, the algorithm is linear in the file size.  */
 181
 182       if (200 < c && big_snake && speed_large_files)
 183         {
 184           lin best = 0;
 185
 186           for (d = fmax; d >= fmin; d -= 2)
 187             {
 188               lin dd = d - fmid;
 189               lin x = fd[d];
 190               lin y = x - d;
 191               lin v = (x - xoff) * 2 - dd;
 192               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 193                 {
 194                   if (v > best
 195                       && xoff + SNAKE_LIMIT <= x && x < xlim
 196                       && yoff + SNAKE_LIMIT <= y && y < ylim)
 197                     {
 198                       /* We have a good enough best diagonal;
 199                          now insist that it end with a significant snake.  */
 200                       int k;
 201
 202                       for (k = 1; xv[x - k] == yv[y - k]; k++)
 203                         if (k == SNAKE_LIMIT)
 204                           {
 205                             best = v;
 206                             part->xmid = x;
 207                             part->ymid = y;
 208                             break;
 209                           }
 210                     }
 211                 }
 212             }
 213           if (best > 0)
 214             {
 215               part->lo_minimal = true;
 216               part->hi_minimal = false;
 217               return;
 218             }
 219
 220           best = 0;
 221           for (d = bmax; d >= bmin; d -= 2)
 222             {
 223               lin dd = d - bmid;
 224               lin x = bd[d];
 225               lin y = x - d;
 226               lin v = (xlim - x) * 2 + dd;
 227               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 228                 {
 229                   if (v > best
 230                       && xoff < x && x <= xlim - SNAKE_LIMIT
 231                       && yoff < y && y <= ylim - SNAKE_LIMIT)
 232                     {
 233                       /* We have a good enough best diagonal;
 234                          now insist that it end with a significant snake.  */
 235                       int k;
 236
 237                       for (k = 0; xv[x + k] == yv[y + k]; k++)
 238                         if (k == SNAKE_LIMIT - 1)
 239                           {
 240                             best = v;
 241                             part->xmid = x;
 242                             part->ymid = y;
 243                             break;
 244                           }
 245                     }
 246                 }
 247             }
 248           if (best > 0)
 249             {
 250               part->lo_minimal = false;
 251               part->hi_minimal = true;
 252               return;
 253             }
 254         }
 255
 256       /* Heuristic: if we've gone well beyond the call of duty,
 257          give up and report halfway between our best results so far.  */
 258       if (c >= too_expensive)
 259         {
 260           lin fxybest, fxbest;
 261           lin bxybest, bxbest;
 262
 263           fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
 264
 265           /* Find forward diagonal that maximizes X + Y.  */
 266           fxybest = -1;
 267           for (d = fmax; d >= fmin; d -= 2)
 268             {
 269               lin x = MIN (fd[d], xlim);
 270               lin y = x - d;
 271               if (ylim < y)
 272                 x = ylim + d, y = ylim;
 273               if (fxybest < x + y)
 274                 {
 275                   fxybest = x + y;
 276                   fxbest = x;
 277                 }
 278             }
 279
 280           /* Find backward diagonal that minimizes X + Y.  */
 281           bxybest = LIN_MAX;
 282           for (d = bmax; d >= bmin; d -= 2)
 283             {
 284               lin x = MAX (xoff, bd[d]);
 285               lin y = x - d;
 286               if (y < yoff)
 287                 x = yoff + d, y = yoff;
 288               if (x + y < bxybest)
 289                 {
 290                   bxybest = x + y;
 291                   bxbest = x;
 292                 }
 293             }
 294
 295           /* Use the better of the two diagonals.  */
 296           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
 297             {
 298               part->xmid = fxbest;
 299               part->ymid = fxybest - fxbest;
 300               part->lo_minimal = true;
 301               part->hi_minimal = false;
 302             }
 303           else
 304             {
 305               part->xmid = bxbest;
 306               part->ymid = bxybest - bxbest;
 307               part->lo_minimal = false;
 308               part->hi_minimal = true;
 309             }
 310           return;
 311         }
 312     }
 313 }
 314 \f
 315 /* Compare in detail contiguous subsequences of the two files
 316    which are known, as a whole, to match each other.
 317
 318    The results are recorded in the vectors files[N].changed, by
 319    storing 1 in the element for each line that is an insertion or deletion.
 320
 321    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
 322
 323    Note that XLIM, YLIM are exclusive bounds.
 324    All line numbers are origin-0 and discarded lines are not counted.
 325
 326    If FIND_MINIMAL, find a minimal difference no matter how
 327    expensive it is.  */
 328
 329 static void
 330 compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
 331 {
 332   lin const *xv = xvec; /* Help the compiler.  */
 333   lin const *yv = yvec;
 334
 335   /* Slide down the bottom initial diagonal. */
 336   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
 337     ++xoff, ++yoff;
 338   /* Slide up the top initial diagonal. */
 339   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
 340     --xlim, --ylim;
 341
 342   /* Handle simple cases. */
 343   if (xoff == xlim)
 344     while (yoff < ylim)
 345       files[1].changed[files[1].realindexes[yoff++]] = 1;
 346   else if (yoff == ylim)
 347     while (xoff < xlim)
 348       files[0].changed[files[0].realindexes[xoff++]] = 1;
 349   else
 350     {
 351       struct partition part;
 352
 353       /* Find a point of correspondence in the middle of the files.  */
 354       diag (xoff, xlim, yoff, ylim, find_minimal, &part);
 355
 356       /* Use the partitions to split this problem into subproblems.  */
 357       compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
 358       compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
 359     }
 360 }
 361 \f
 362 /* Discard lines from one file that have no matches in the other file.
 363
 364    A line which is discarded will not be considered by the actual
 365    comparison algorithm; it will be as if that line were not in the file.
 366    The file's `realindexes' table maps virtual line numbers
 367    (which don't count the discarded lines) into real line numbers;
 368    this is how the actual comparison algorithm produces results
 369    that are comprehensible when the discarded lines are counted.
 370
 371    When we discard a line, we also mark it as a deletion or insertion
 372    so that it will be printed in the output.  */
 373
 374 static void
 375 discard_confusing_lines (struct file_data filevec[])
 376 {
 377   int f;
 378   lin i;
 379   char *discarded[2];
 380   lin *equiv_count[2];
 381   lin *p;
 382
 383   /* Allocate our results.  */
 384   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
 385                * (2 * sizeof *p));
 386   for (f = 0; f < 2; f++)
 387     {
 388       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
 389       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
 390     }
 391
 392   /* Set up equiv_count[F][I] as the number of lines in file F
 393      that fall in equivalence class I.  */
 394
 395   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
 396   equiv_count[0] = p;
 397   equiv_count[1] = p + filevec[0].equiv_max;
 398
 399   for (i = 0; i < filevec[0].buffered_lines; ++i)
 400     ++equiv_count[0][filevec[0].equivs[i]];
 401   for (i = 0; i < filevec[1].buffered_lines; ++i)
 402     ++equiv_count[1][filevec[1].equivs[i]];
 403
 404   /* Set up tables of which lines are going to be discarded.  */
 405
 406   discarded[0] = zalloc (filevec[0].buffered_lines
 407                          + filevec[1].buffered_lines);
 408   discarded[1] = discarded[0] + filevec[0].buffered_lines;
 409
 410   /* Mark to be discarded each line that matches no line of the other file.
 411      If a line matches many lines, mark it as provisionally discardable.  */
 412
 413   for (f = 0; f < 2; f++)
 414     {
 415       size_t end = filevec[f].buffered_lines;
 416       char *discards = discarded[f];
 417       lin *counts = equiv_count[1 - f];
 418       lin *equivs = filevec[f].equivs;
 419       size_t many = 5;
 420       size_t tem = end / 64;
 421
 422       /* Multiply MANY by approximate square root of number of lines.
 423          That is the threshold for provisionally discardable lines.  */
 424       while ((tem = tem >> 2) > 0)
 425         many *= 2;
 426
 427       for (i = 0; i < end; i++)
 428         {
 429           lin nmatch;
 430           if (equivs[i] == 0)
 431             continue;
 432           nmatch = counts[equivs[i]];
 433           if (nmatch == 0)
 434             discards[i] = 1;
 435           else if (nmatch > many)
 436             discards[i] = 2;
 437         }
 438     }
 439
 440   /* Don't really discard the provisional lines except when they occur
 441      in a run of discardables, with nonprovisionals at the beginning
 442      and end.  */
 443
 444   for (f = 0; f < 2; f++)
 445     {
 446       lin end = filevec[f].buffered_lines;
 447       register char *discards = discarded[f];
 448
 449       for (i = 0; i < end; i++)
 450         {
 451           /* Cancel provisional discards not in middle of run of discards.  */
 452           if (discards[i] == 2)
 453             discards[i] = 0;
 454           else if (discards[i] != 0)
 455             {
 456               /* We have found a nonprovisional discard.  */
 457               register lin j;
 458               lin length;
 459               lin provisional = 0;
 460
 461               /* Find end of this run of discardable lines.
 462                  Count how many are provisionally discardable.  */
 463               for (j = i; j < end; j++)
 464                 {
 465                   if (discards[j] == 0)
 466                     break;
 467                   if (discards[j] == 2)
 468                     ++provisional;
 469                 }
 470
 471               /* Cancel provisional discards at end, and shrink the run.  */
 472               while (j > i && discards[j - 1] == 2)
 473                 discards[--j] = 0, --provisional;
 474
 475               /* Now we have the length of a run of discardable lines
 476                  whose first and last are not provisional.  */
 477               length = j - i;
 478
 479               /* If 1/4 of the lines in the run are provisional,
 480                  cancel discarding of all provisional lines in the run.  */
 481               if (provisional * 4 > length)
 482                 {
 483                   while (j > i)
 484                     if (discards[--j] == 2)
 485                       discards[j] = 0;
 486                 }
 487               else
 488                 {
 489                   register lin consec;
 490                   lin minimum = 1;
 491                   lin tem = length >> 2;
 492
 493                   /* MINIMUM is approximate square root of LENGTH/4.
 494                      A subrun of two or more provisionals can stand
 495                      when LENGTH is at least 16.
 496                      A subrun of 4 or more can stand when LENGTH >= 64.  */
 497                   while (0 < (tem >>= 2))
 498                     minimum <<= 1;
 499                   minimum++;
 500
 501                   /* Cancel any subrun of MINIMUM or more provisionals
 502                      within the larger run.  */
 503                   for (j = 0, consec = 0; j < length; j++)
 504                     if (discards[i + j] != 2)
 505                       consec = 0;
 506                     else if (minimum == ++consec)
 507                       /* Back up to start of subrun, to cancel it all.  */
 508                       j -= consec;
 509                     else if (minimum < consec)
 510                       discards[i + j] = 0;
 511
 512                   /* Scan from beginning of run
 513                      until we find 3 or more nonprovisionals in a row
 514                      or until the first nonprovisional at least 8 lines in.
 515                      Until that point, cancel any provisionals.  */
 516                   for (j = 0, consec = 0; j < length; j++)
 517                     {
 518                       if (j >= 8 && discards[i + j] == 1)
 519                         break;
 520                       if (discards[i + j] == 2)
 521                         consec = 0, discards[i + j] = 0;
 522                       else if (discards[i + j] == 0)
 523                         consec = 0;
 524                       else
 525                         consec++;
 526                       if (consec == 3)
 527                         break;
 528                     }
 529
 530                   /* I advances to the last line of the run.  */
 531                   i += length - 1;
 532
 533                   /* Same thing, from end.  */
 534                   for (j = 0, consec = 0; j < length; j++)
 535                     {
 536                       if (j >= 8 && discards[i - j] == 1)
 537                         break;
 538                       if (discards[i - j] == 2)
 539                         consec = 0, discards[i - j] = 0;
 540                       else if (discards[i - j] == 0)
 541                         consec = 0;
 542                       else
 543                         consec++;
 544                       if (consec == 3)
 545                         break;
 546                     }
 547                 }
 548             }
 549         }
 550     }
 551
 552   /* Actually discard the lines. */
 553   for (f = 0; f < 2; f++)
 554     {
 555       char *discards = discarded[f];
 556       lin end = filevec[f].buffered_lines;
 557       lin j = 0;
 558       for (i = 0; i < end; ++i)
 559         if (minimal || discards[i] == 0)
 560           {
 561             filevec[f].undiscarded[j] = filevec[f].equivs[i];
 562             filevec[f].realindexes[j++] = i;
 563           }
 564         else
 565           filevec[f].changed[i] = 1;
 566       filevec[f].nondiscarded_lines = j;
 567     }
 568
 569   free (discarded[0]);
 570   free (equiv_count[0]);
 571 }
 572 \f
 573 /* Adjust inserts/deletes of identical lines to join changes
 574    as much as possible.
 575
 576    We do something when a run of changed lines include a
 577    line at one end and have an excluded, identical line at the other.
 578    We are free to choose which identical line is included.
 579    `compareseq' usually chooses the one at the beginning,
 580    but usually it is cleaner to consider the following identical line
 581    to be the "change".  */
 582
 583 static void
 584 shift_boundaries (struct file_data filevec[])
 585 {
 586   int f;
 587
 588   for (f = 0; f < 2; f++)
 589     {
 590       char *changed = filevec[f].changed;
 591       char *other_changed = filevec[1 - f].changed;
 592       lin const *equivs = filevec[f].equivs;
 593       lin i = 0;
 594       lin j = 0;
 595       lin i_end = filevec[f].buffered_lines;
 596
 597       while (1)
 598         {
 599           lin runlength, start, corresponding;
 600
 601           /* Scan forwards to find beginning of another run of changes.
 602              Also keep track of the corresponding point in the other file.  */
 603
 604           while (i < i_end && !changed[i])
 605             {
 606               while (other_changed[j++])
 607                 continue;
 608               i++;
 609             }
 610
 611           if (i == i_end)
 612             break;
 613
 614           start = i;
 615
 616           /* Find the end of this run of changes.  */
 617
 618           while (changed[++i])
 619             continue;
 620           while (other_changed[j])
 621             j++;
 622
 623           do
 624             {
 625               /* Record the length of this run of changes, so that
 626                  we can later determine whether the run has grown.  */
 627               runlength = i - start;
 628
 629               /* Move the changed region back, so long as the
 630                  previous unchanged line matches the last changed one.
 631                  This merges with previous changed regions.  */
 632
 633               while (start && equivs[start - 1] == equivs[i - 1])
 634                 {
 635                   changed[--start] = 1;
 636                   changed[--i] = 0;
 637                   while (changed[start - 1])
 638                     start--;
 639                   while (other_changed[--j])
 640                     continue;
 641                 }
 642
 643               /* Set CORRESPONDING to the end of the changed run, at the last
 644                  point where it corresponds to a changed run in the other file.
 645                  CORRESPONDING == I_END means no such point has been found.  */
 646               corresponding = other_changed[j - 1] ? i : i_end;
 647
 648               /* Move the changed region forward, so long as the
 649                  first changed line matches the following unchanged one.
 650                  This merges with following changed regions.
 651                  Do this second, so that if there are no merges,
 652                  the changed region is moved forward as far as possible.  */
 653
 654               while (i != i_end && equivs[start] == equivs[i])
 655                 {
 656                   changed[start++] = 0;
 657                   changed[i++] = 1;
 658                   while (changed[i])
 659                     i++;
 660                   while (other_changed[++j])
 661                     corresponding = i;
 662                 }
 663             }
 664           while (runlength != i - start);
 665
 666           /* If possible, move the fully-merged run of changes
 667              back to a corresponding run in the other file.  */
 668
 669           while (corresponding < i)
 670             {
 671               changed[--start] = 1;
 672               changed[--i] = 0;
 673               while (other_changed[--j])
 674                 continue;
 675             }
 676         }
 677     }
 678 }
 679 \f
 680 /* Cons an additional entry onto the front of an edit script OLD.
 681    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
 682    DELETED is the number of lines deleted here from file 0.
 683    INSERTED is the number of lines inserted here in file 1.
 684
 685    If DELETED is 0 then LINE0 is the number of the line before
 686    which the insertion was done; vice versa for INSERTED and LINE1.  */
 687
 688 static struct change *
 689 add_change (lin line0, lin line1, lin deleted, lin inserted,
 690             struct change *old)
 691 {
 692   struct change *new = xmalloc (sizeof *new);
 693
 694   new->line0 = line0;
 695   new->line1 = line1;
 696   new->inserted = inserted;
 697   new->deleted = deleted;
 698   new->link = old;
 699   return new;
 700 }
 701
 702 /* Scan the tables of which lines are inserted and deleted,
 703    producing an edit script in reverse order.  */
 704
 705 static struct change *
 706 build_reverse_script (struct file_data const filevec[])
 707 {
 708   struct change *script = 0;
 709   char *changed0 = filevec[0].changed;
 710   char *changed1 = filevec[1].changed;
 711   lin len0 = filevec[0].buffered_lines;
 712   lin len1 = filevec[1].buffered_lines;
 713
 714   /* Note that changedN[len0] does exist, and is 0.  */
 715
 716   lin i0 = 0, i1 = 0;
 717
 718   while (i0 < len0 || i1 < len1)
 719     {
 720       if (changed0[i0] | changed1[i1])
 721         {
 722           lin line0 = i0, line1 = i1;
 723
 724           /* Find # lines changed here in each file.  */
 725           while (changed0[i0]) ++i0;
 726           while (changed1[i1]) ++i1;
 727
 728           /* Record this change.  */
 729           script = add_change (line0, line1, i0 - line0, i1 - line1, script);
 730         }
 731
 732       /* We have reached lines in the two files that match each other.  */
 733       i0++, i1++;
 734     }
 735
 736   return script;
 737 }
 738
 739 /* Scan the tables of which lines are inserted and deleted,
 740    producing an edit script in forward order.  */
 741
 742 static struct change *
 743 build_script (struct file_data const filevec[])
 744 {
 745   struct change *script = 0;
 746   char *changed0 = filevec[0].changed;
 747   char *changed1 = filevec[1].changed;
 748   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
 749
 750   /* Note that changedN[-1] does exist, and is 0.  */
 751
 752   while (i0 >= 0 || i1 >= 0)
 753     {
 754       if (changed0[i0 - 1] | changed1[i1 - 1])
 755         {
 756           lin line0 = i0, line1 = i1;
 757
 758           /* Find # lines changed here in each file.  */
 759           while (changed0[i0 - 1]) --i0;
 760           while (changed1[i1 - 1]) --i1;
 761
 762           /* Record this change.  */
 763           script = add_change (i0, i1, line0 - i0, line1 - i1, script);
 764         }
 765
 766       /* We have reached lines in the two files that match each other.  */
 767       i0--, i1--;
 768     }
 769
 770   return script;
 771 }
 772 \f
 773 /* If CHANGES, briefly report that two files differed.
 774    Return 2 if trouble, CHANGES otherwise.  */
 775 static int
 776 briefly_report (int changes, struct file_data const filevec[])
 777 {
 778   if (changes)
 779     {
 780       char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
 781       char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
 782       message ("Files %s and %s differ\n", label0, label1);
 783       if (! brief)
 784         changes = 2;
 785     }
 786
 787   return changes;
 788 }
 789
 790 /* Report the differences of two files.  */
 791 int
 792 diff_2_files (struct comparison *cmp)
 793 {
 794   lin diags;
 795   int f;
 796   struct change *e, *p;
 797   struct change *script;
 798   int changes;
 799
 800
 801   /* If we have detected that either file is binary,
 802      compare the two files as binary.  This can happen
 803      only when the first chunk is read.
 804      Also, --brief without any --ignore-* options means
 805      we can speed things up by treating the files as binary.  */
 806
 807   if (read_files (cmp->file, files_can_be_treated_as_binary))
 808     {
 809       /* Files with different lengths must be different.  */
 810       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
 811           && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
 812           && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
 813         changes = 1;
 814
 815       /* Standard input equals itself.  */
 816       else if (cmp->file[0].desc == cmp->file[1].desc)
 817         changes = 0;
 818
 819       else
 820         /* Scan both files, a buffer at a time, looking for a difference.  */
 821         {
 822           /* Allocate same-sized buffers for both files.  */
 823           size_t lcm_max = PTRDIFF_MAX - 1;
 824           size_t buffer_size =
 825             buffer_lcm (sizeof (word),
 826                         buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
 827                                     STAT_BLOCKSIZE (cmp->file[1].stat),
 828                                     lcm_max),
 829                         lcm_max);
 830           for (f = 0; f < 2; f++)
 831             cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
 832
 833           for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
 834             {
 835               /* Read a buffer's worth from both files.  */
 836               for (f = 0; f < 2; f++)
 837                 if (0 <= cmp->file[f].desc)
 838                   file_block_read (&cmp->file[f],
 839                                    buffer_size - cmp->file[f].buffered);
 840
 841               /* If the buffers differ, the files differ.  */
 842               if (cmp->file[0].buffered != cmp->file[1].buffered
 843                   || memcmp (cmp->file[0].buffer,
 844                              cmp->file[1].buffer,
 845                              cmp->file[0].buffered))
 846                 {
 847                   changes = 1;
 848                   break;
 849                 }
 850
 851               /* If we reach end of file, the files are the same.  */
 852               if (cmp->file[0].buffered != buffer_size)
 853                 {
 854                   changes = 0;
 855                   break;
 856                 }
 857             }
 858         }
 859
 860       changes = briefly_report (changes, cmp->file);
 861     }
 862   else
 863     {
 864       /* Allocate vectors for the results of comparison:
 865          a flag for each line of each file, saying whether that line
 866          is an insertion or deletion.
 867          Allocate an extra element, always 0, at each end of each vector.  */
 868
 869       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
 870       char *flag_space = zalloc (s);
 871       cmp->file[0].changed = flag_space + 1;
 872       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
 873
 874       /* Some lines are obviously insertions or deletions
 875          because they don't match anything.  Detect them now, and
 876          avoid even thinking about them in the main comparison algorithm.  */
 877
 878       discard_confusing_lines (cmp->file);
 879
 880       /* Now do the main comparison algorithm, considering just the
 881          undiscarded lines.  */
 882
 883       xvec = cmp->file[0].undiscarded;
 884       yvec = cmp->file[1].undiscarded;
 885       diags = (cmp->file[0].nondiscarded_lines
 886                + cmp->file[1].nondiscarded_lines + 3);
 887       fdiag = xmalloc (diags * (2 * sizeof *fdiag));
 888       bdiag = fdiag + diags;
 889       fdiag += cmp->file[1].nondiscarded_lines + 1;
 890       bdiag += cmp->file[1].nondiscarded_lines + 1;
 891
 892       /* Set TOO_EXPENSIVE to be approximate square root of input size,
 893          bounded below by 256.  */
 894       too_expensive = 1;
 895       for (;  diags != 0;  diags >>= 2)
 896         too_expensive <<= 1;
 897       too_expensive = MAX (256, too_expensive);
 898
 899       files[0] = cmp->file[0];
 900       files[1] = cmp->file[1];
 901
 902       compareseq (0, cmp->file[0].nondiscarded_lines,
 903                   0, cmp->file[1].nondiscarded_lines, minimal);
 904
 905       free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
 906
 907       /* Modify the results slightly to make them prettier
 908          in cases where that can validly be done.  */
 909
 910       shift_boundaries (cmp->file);
 911
 912       /* Get the results of comparison in the form of a chain
 913          of `struct change's -- an edit script.  */
 914
 915       if (output_style == OUTPUT_ED)
 916         script = build_reverse_script (cmp->file);
 917       else
 918         script = build_script (cmp->file);
 919
 920       /* Set CHANGES if we had any diffs.
 921          If some changes are ignored, we must scan the script to decide.  */
 922       if (ignore_blank_lines || ignore_regexp.fastmap)
 923         {
 924           struct change *next = script;
 925           changes = 0;
 926
 927           while (next && changes == 0)
 928             {
 929               struct change *this, *end;
 930               lin first0, last0, first1, last1;
 931
 932               /* Find a set of changes that belong together.  */
 933               this = next;
 934               end = find_change (next);
 935
 936               /* Disconnect them from the rest of the changes, making them
 937                  a hunk, and remember the rest for next iteration.  */
 938               next = end->link;
 939               end->link = 0;
 940
 941               /* Determine whether this hunk is really a difference.  */
 942               if (analyze_hunk (this, &first0, &last0, &first1, &last1))
 943                 changes = 1;
 944
 945               /* Reconnect the script so it will all be freed properly.  */
 946               end->link = next;
 947             }
 948         }
 949       else
 950         changes = (script != 0);
 951
 952       if (brief)
 953         changes = briefly_report (changes, cmp->file);
 954       else
 955         {
 956           if (changes | !no_diff_means_no_output)
 957             {
 958               /* Record info for starting up output,
 959                  to be used if and when we have some output to print.  */
 960               setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
 961                             file_label[1] ? file_label[1] : cmp->file[1].name,
 962                             cmp->parent != 0);
 963
 964               switch (output_style)
 965                 {
 966                 case OUTPUT_CONTEXT:
 967                   print_context_script (script, false);
 968                   break;
 969
 970                 case OUTPUT_UNIFIED:
 971                   print_context_script (script, true);
 972                   break;
 973
 974                 case OUTPUT_ED:
 975                   print_ed_script (script);
 976                   break;
 977
 978                 case OUTPUT_FORWARD_ED:
 979                   pr_forward_ed_script (script);
 980                   break;
 981
 982                 case OUTPUT_RCS:
 983                   print_rcs_script (script);
 984                   break;
 985
 986                 case OUTPUT_NORMAL:
 987                   print_normal_script (script);
 988                   break;
 989
 990                 case OUTPUT_IFDEF:
 991                   print_ifdef_script (script);
 992                   break;
 993
 994                 case OUTPUT_SDIFF:
 995                   print_sdiff_script (script);
 996                   break;
 997
 998                 default:
 999                   abort ();
1000                 }
1001
1002               finish_output ();
1003             }
1004         }
1005
1006       free (cmp->file[0].undiscarded);
1007
1008       free (flag_space);
1009
1010       for (f = 0; f < 2; f++)
1011         {
1012           free (cmp->file[f].equivs);
1013           free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014         }
1015
1016       for (e = script; e; e = p)
1017         {
1018           p = e->link;
1019           free (e);
1020         }
1021
1022       if (! ROBUST_OUTPUT_STYLE (output_style))
1023         for (f = 0; f < 2; ++f)
1024           if (cmp->file[f].missing_newline)
1025             {
1026               error (0, 0, "%s: %s\n",
1027                      file_label[f] ? file_label[f] : cmp->file[f].name,
1028                      _("No newline at end of file"));
1029               changes = 2;
1030             }
1031     }
1032
1033   if (cmp->file[0].buffer != cmp->file[1].buffer)
1034     free (cmp->file[0].buffer);
1035   free (cmp->file[1].buffer);
1036
1037   return changes;
1038 }