contrib/subversion/subversion/libsvn_diff/lcs.c

   1 /*
   2  * lcs.c :  routines for creating an lcs
   3  *
   4  * ====================================================================
   5  *    Licensed to the Apache Software Foundation (ASF) under one
   6  *    or more contributor license agreements.  See the NOTICE file
   7  *    distributed with this work for additional information
   8  *    regarding copyright ownership.  The ASF licenses this file
   9  *    to you under the Apache License, Version 2.0 (the
  10  *    "License"); you may not use this file except in compliance
  11  *    with the License.  You may obtain a copy of the License at
  12  *
  13  *      http://www.apache.org/licenses/LICENSE-2.0
  14  *
  15  *    Unless required by applicable law or agreed to in writing,
  16  *    software distributed under the License is distributed on an
  17  *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  18  *    KIND, either express or implied.  See the License for the
  19  *    specific language governing permissions and limitations
  20  *    under the License.
  21  * ====================================================================
  22  */
  23
  24
  25 #include <apr.h>
  26 #include <apr_pools.h>
  27 #include <apr_general.h>
  28
  29 #include "diff.h"
  30
  31
  32 /*
  33  * Calculate the Longest Common Subsequence (LCS) between two datasources.
  34  * This function is what makes the diff code tick.
  35  *
  36  * The LCS algorithm implemented here is based on the approach described
  37  * by Sun Wu, Udi Manber and Gene Meyers in "An O(NP) Sequence Comparison
  38  * Algorithm", but has been modified for better performance.
  39  *
  40  * Let M and N be the lengths (number of tokens) of the two sources
  41  * ('files'). The goal is to reach the end of both sources (files) with the
  42  * minimum number of insertions + deletions. Since there is a known length
  43  * difference N-M between the files, that is equivalent to just the minimum
  44  * number of deletions, or equivalently the minimum number of insertions.
  45  * For symmetry, we use the lesser number - deletions if M<N, insertions if
  46  * M>N.
  47  *
  48  * Let 'k' be the difference in remaining length between the files, i.e.
  49  * if we're at the beginning of both files, k=N-M, whereas k=0 for the
  50  * 'end state', at the end of both files. An insertion will increase k by
  51  * one, while a deletion decreases k by one. If k<0, then insertions are
  52  * 'free' - we need those to reach the end state k=0 anyway - but deletions
  53  * are costly: Adding a deletion means that we will have to add an additional
  54  * insertion later to reach the end state, so it doesn't matter if we count
  55  * deletions or insertions. Similarly, deletions are free for k>0.
  56  *
  57  * Let a 'state' be a given position in each file {pos1, pos2}. An array
  58  * 'fp' keeps track of the best possible state (largest values of
  59  * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away
  60  * from k=0), as well as a linked list of what matches were used to reach
  61  * that state. For each new value of p, we find for each value of k the
  62  * best achievable state for that k - either by doing a costly operation
  63  * (deletion if k<0) from a state achieved at a lower p, or doing a free
  64  * operation (insertion if k<0) from a state achieved at the same p -
  65  * and in both cases advancing past any matching regions found. This is
  66  * handled by running loops over k in order of descending absolute value.
  67  *
  68  * A recent improvement of the algorithm is to ignore tokens that are unique
  69  * to one file or the other, as those are known from the start to be
  70  * impossible to match.
  71  */
  72
  73 typedef struct svn_diff__snake_t svn_diff__snake_t;
  74
  75 struct svn_diff__snake_t
  76 {
  77     apr_off_t             y;
  78     svn_diff__lcs_t      *lcs;
  79     svn_diff__position_t *position[2];
  80 };
  81
  82 static APR_INLINE void
  83 svn_diff__snake(svn_diff__snake_t *fp_k,
  84                 svn_diff__token_index_t *token_counts[2],
  85                 svn_diff__lcs_t **freelist,
  86                 apr_pool_t *pool)
  87 {
  88   svn_diff__position_t *start_position[2];
  89   svn_diff__position_t *position[2];
  90   svn_diff__lcs_t *lcs;
  91   svn_diff__lcs_t *previous_lcs;
  92
  93   /* The previous entry at fp[k] is going to be replaced.  See if we
  94    * can mark that lcs node for reuse, because the sequence up to this
  95    * point was a dead end.
  96    */
  97   lcs = fp_k[0].lcs;
  98   while (lcs)
  99     {
 100       lcs->refcount--;
 101       if (lcs->refcount)
 102         break;
 103
 104       previous_lcs = lcs->next;
 105       lcs->next = *freelist;
 106       *freelist = lcs;
 107       lcs = previous_lcs;
 108     }
 109
 110   if (fp_k[-1].y >= fp_k[1].y)
 111     {
 112       start_position[0] = fp_k[-1].position[0];
 113       start_position[1] = fp_k[-1].position[1]->next;
 114
 115       previous_lcs = fp_k[-1].lcs;
 116     }
 117   else
 118     {
 119       start_position[0] = fp_k[1].position[0]->next;
 120       start_position[1] = fp_k[1].position[1];
 121
 122       previous_lcs = fp_k[1].lcs;
 123     }
 124
 125
 126   if (previous_lcs)
 127     {
 128       previous_lcs->refcount++;
 129     }
 130
 131   /* ### Optimization, skip all positions that don't have matchpoints
 132    * ### anyway. Beware of the sentinel, don't skip it!
 133    */
 134
 135   position[0] = start_position[0];
 136   position[1] = start_position[1];
 137
 138   while (1)
 139     {
 140       while (position[0]->token_index == position[1]->token_index)
 141         {
 142           position[0] = position[0]->next;
 143           position[1] = position[1]->next;
 144         }
 145
 146       if (position[1] != start_position[1])
 147         {
 148           lcs = *freelist;
 149           if (lcs)
 150             {
 151               *freelist = lcs->next;
 152             }
 153           else
 154             {
 155               lcs = apr_palloc(pool, sizeof(*lcs));
 156             }
 157
 158           lcs->position[0] = start_position[0];
 159           lcs->position[1] = start_position[1];
 160           lcs->length = position[1]->offset - start_position[1]->offset;
 161           lcs->next = previous_lcs;
 162           lcs->refcount = 1;
 163           previous_lcs = lcs;
 164           start_position[0] = position[0];
 165           start_position[1] = position[1];
 166         }
 167
 168       /* Skip any and all tokens that only occur in one of the files */
 169       if (position[0]->token_index >= 0
 170           && token_counts[1][position[0]->token_index] == 0)
 171         start_position[0] = position[0] = position[0]->next;
 172       else if (position[1]->token_index >= 0
 173                && token_counts[0][position[1]->token_index] == 0)
 174         start_position[1] = position[1] = position[1]->next;
 175       else
 176         break;
 177     }
 178
 179   fp_k[0].lcs = previous_lcs;
 180   fp_k[0].position[0] = position[0];
 181   fp_k[0].position[1] = position[1];
 182
 183   fp_k[0].y = position[1]->offset;
 184 }
 185
 186
 187 static svn_diff__lcs_t *
 188 svn_diff__lcs_reverse(svn_diff__lcs_t *lcs)
 189 {
 190   svn_diff__lcs_t *next;
 191   svn_diff__lcs_t *prev;
 192
 193   next = NULL;
 194   while (lcs != NULL)
 195     {
 196       prev = lcs->next;
 197       lcs->next = next;
 198       next = lcs;
 199       lcs = prev;
 200     }
 201
 202   return next;
 203 }
 204
 205
 206 /* Prepends a new lcs chunk for the amount of LINES at the given positions
 207  * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it.
 208  * This function assumes LINES > 0. */
 209 static svn_diff__lcs_t *
 210 prepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines,
 211             apr_off_t pos0_offset, apr_off_t pos1_offset,
 212             apr_pool_t *pool)
 213 {
 214   svn_diff__lcs_t *new_lcs;
 215
 216   SVN_ERR_ASSERT_NO_RETURN(lines > 0);
 217
 218   new_lcs = apr_palloc(pool, sizeof(*new_lcs));
 219   new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0]));
 220   new_lcs->position[0]->offset = pos0_offset;
 221   new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1]));
 222   new_lcs->position[1]->offset = pos1_offset;
 223   new_lcs->length = lines;
 224   new_lcs->refcount = 1;
 225   new_lcs->next = lcs;
 226
 227   return new_lcs;
 228 }
 229
 230
 231 svn_diff__lcs_t *
 232 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
 233               svn_diff__position_t *position_list2, /* pointer to tail (ring) */
 234               svn_diff__token_index_t *token_counts_list1, /* array of counts */
 235               svn_diff__token_index_t *token_counts_list2, /* array of counts */
 236               svn_diff__token_index_t num_tokens,
 237               apr_off_t prefix_lines,
 238               apr_off_t suffix_lines,
 239               apr_pool_t *pool)
 240 {
 241   apr_off_t length[2];
 242   svn_diff__token_index_t *token_counts[2];
 243   svn_diff__token_index_t unique_count[2];
 244   svn_diff__token_index_t token_index;
 245   svn_diff__snake_t *fp;
 246   apr_off_t d;
 247   apr_off_t k;
 248   apr_off_t p = 0;
 249   svn_diff__lcs_t *lcs, *lcs_freelist = NULL;
 250
 251   svn_diff__position_t sentinel_position[2];
 252
 253   /* Since EOF is always a sync point we tack on an EOF link
 254    * with sentinel positions
 255    */
 256   lcs = apr_palloc(pool, sizeof(*lcs));
 257   lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0]));
 258   lcs->position[0]->offset = position_list1
 259                              ? position_list1->offset + suffix_lines + 1
 260                              : prefix_lines + suffix_lines + 1;
 261   lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1]));
 262   lcs->position[1]->offset = position_list2
 263                              ? position_list2->offset + suffix_lines + 1
 264                              : prefix_lines + suffix_lines + 1;
 265   lcs->length = 0;
 266   lcs->refcount = 1;
 267   lcs->next = NULL;
 268
 269   if (position_list1 == NULL || position_list2 == NULL)
 270     {
 271       if (suffix_lines)
 272         lcs = prepend_lcs(lcs, suffix_lines,
 273                           lcs->position[0]->offset - suffix_lines,
 274                           lcs->position[1]->offset - suffix_lines,
 275                           pool);
 276       if (prefix_lines)
 277         lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool);
 278
 279       return lcs;
 280     }
 281
 282   unique_count[1] = unique_count[0] = 0;
 283   for (token_index = 0; token_index < num_tokens; token_index++)
 284     {
 285       if (token_counts_list1[token_index] == 0)
 286         unique_count[1] += token_counts_list2[token_index];
 287       if (token_counts_list2[token_index] == 0)
 288         unique_count[0] += token_counts_list1[token_index];
 289     }
 290
 291   /* Calculate lengths M and N of the sequences to be compared. Do not
 292    * count tokens unique to one file, as those are ignored in __snake.
 293    */
 294   length[0] = position_list1->offset - position_list1->next->offset + 1
 295               - unique_count[0];
 296   length[1] = position_list2->offset - position_list2->next->offset + 1
 297               - unique_count[1];
 298
 299   /* strikerXXX: here we allocate the furthest point array, which is
 300    * strikerXXX: sized M + N + 3 (!)
 301    */
 302   fp = apr_pcalloc(pool,
 303                    sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3));
 304
 305   /* The origo of fp corresponds to the end state, where we are
 306    * at the end of both files. The valid states thus span from
 307    * -N (at end of first file and at the beginning of the second
 308    * file) to +M (the opposite :). Finally, svn_diff__snake needs
 309    * 1 extra slot on each side to work.
 310    */
 311   fp += length[1] + 1;
 312
 313   sentinel_position[0].next = position_list1->next;
 314   position_list1->next = &sentinel_position[0];
 315   sentinel_position[0].offset = position_list1->offset + 1;
 316   token_counts[0] = token_counts_list1;
 317
 318   sentinel_position[1].next = position_list2->next;
 319   position_list2->next = &sentinel_position[1];
 320   sentinel_position[1].offset = position_list2->offset + 1;
 321   token_counts[1] = token_counts_list2;
 322
 323   /* Negative indices will not be used elsewhere
 324    */
 325   sentinel_position[0].token_index = -1;
 326   sentinel_position[1].token_index = -2;
 327
 328   /* position d = M - N corresponds to the initial state, where
 329    * we are at the beginning of both files.
 330    */
 331   d = length[0] - length[1];
 332
 333   /* k = d - 1 will be the first to be used to get previous
 334    * position information from, make sure it holds sane
 335    * data
 336    */
 337   fp[d - 1].position[0] = sentinel_position[0].next;
 338   fp[d - 1].position[1] = &sentinel_position[1];
 339
 340   p = 0;
 341   do
 342     {
 343       /* For k < 0, insertions are free */
 344       for (k = (d < 0 ? d : 0) - p; k < 0; k++)
 345         {
 346           svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
 347         }
 348           /* for k > 0, deletions are free */
 349       for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
 350         {
 351           svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
 352         }
 353
 354       p++;
 355     }
 356   while (fp[0].position[1] != &sentinel_position[1]);
 357
 358   if (suffix_lines)
 359     lcs->next = prepend_lcs(fp[0].lcs, suffix_lines,
 360                             lcs->position[0]->offset - suffix_lines,
 361                             lcs->position[1]->offset - suffix_lines,
 362                             pool);
 363   else
 364     lcs->next = fp[0].lcs;
 365
 366   lcs = svn_diff__lcs_reverse(lcs);
 367
 368   position_list1->next = sentinel_position[0].next;
 369   position_list2->next = sentinel_position[1].next;
 370
 371   if (prefix_lines)
 372     return prepend_lcs(lcs, prefix_lines, 1, 1, pool);
 373   else
 374     return lcs;
 375 }