/*
 * xdelta.c:  xdelta generator.
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */
27 #include <apr_general.h> /* for APR_INLINE */
31 #include "svn_delta.h"
32 #include "private/svn_string_private.h"
35 /* This is pseudo-adler32. It is adler32 without the prime modulus.
36 The idea is borrowed from monotone, and is a translation of the C++
37 code. Graydon Hoare, the author of the original code, gave his
38 explicit permission to use it under these terms at 8:02pm on
39 Friday, February 11th, 2005. */
/* Size of the blocks we compute checksums for.  This was chosen out of
   thin air.  Monotone used 64, xdelta1 used 64, rsync uses 128.
   However, later optimizations assume it to be 256 or less. */
45 #define MATCH_BLOCKSIZE 64
/* Size of the checksum presence FLAGS array in BLOCKS_T.  With standard
   MATCH_BLOCKSIZE and SVN_DELTA_WINDOW_SIZE, 32k entries is about 20x
   the number of checksums that actually occur, i.e. we expect a >95%
   probability that non-matching checksums get already detected by checking
   against the FLAGS array. */
54 #define FLAGS_COUNT (32 * 1024)
/* "no" / "invalid" / "unused" value for positions within the delta windows. */
58 #define NO_POSITION ((apr_uint32_t)-1)
60 /* Feed C_IN into the adler32 checksum and remove C_OUT at the same time.
61 * This function may (and will) only be called for characters that are
62 * MATCH_BLOCKSIZE positions apart.
64 * Please note that the lower 16 bits cannot overflow in neither direction.
65 * Therefore, we don't need to split the value into separate values for
66 * sum(char) and sum(sum(char)).
68 static APR_INLINE apr_uint32_t
69 adler32_replace(apr_uint32_t adler32, const char c_out, const char c_in)
71 adler32 -= (MATCH_BLOCKSIZE * 0x10000u * ((unsigned char) c_out));
73 adler32 -= (unsigned char)c_out;
74 adler32 += (unsigned char)c_in;
76 return adler32 + adler32 * 0x10000;
79 /* Calculate an pseudo-adler32 checksum for MATCH_BLOCKSIZE bytes starting
80 at DATA. Return the checksum value. */
82 static APR_INLINE apr_uint32_t
83 init_adler32(const char *data)
85 const unsigned char *input = (const unsigned char *)data;
86 const unsigned char *last = input + MATCH_BLOCKSIZE;
91 for (; input < last; input += 8)
93 s1 += input[0]; s2 += s1;
94 s1 += input[1]; s2 += s1;
95 s1 += input[2]; s2 += s1;
96 s1 += input[3]; s2 += s1;
97 s1 += input[4]; s2 += s1;
98 s1 += input[5]; s2 += s1;
99 s1 += input[6]; s2 += s1;
100 s1 += input[7]; s2 += s1;
103 return s2 * 0x10000 + s1;
106 /* Information for a block of the delta source. The length of the
107 block is the smaller of MATCH_BLOCKSIZE and the difference between
108 the size of the source data and the position of this block. */
111 apr_uint32_t adlersum;
113 /* Even in 64 bit systems, store only 32 bit offsets in our hash table
114 (our delta window size much much smaller then 4GB).
115 That reduces the hash table size by 50% from 32to 16KB
116 and makes it easier to fit into the CPU's L1 cache. */
117 apr_uint32_t pos; /* NO_POSITION -> block is not used */
120 /* A hash table, using open addressing, of the blocks of the source. */
123 /* The largest valid index of slots.
124 This value has an upper bound proportionate to the text delta
125 window size, so unless we dramatically increase the window size,
126 it's safe to make this a 32-bit value. In any case, it has to be
127 hte same width as the block position index, (struct
131 /* Source buffer that the positions in SLOTS refer to. */
134 /* Bit array indicating whether there may be a matching slot for a given
135 adler32 checksum. Since FLAGS has much more entries than SLOTS, this
136 will indicate most cases of non-matching checksums with a "0" bit, i.e.
137 as "known not to have a match".
138 The mapping of adler32 checksum bits is [0..2][16..27] (LSB -> MSB),
139 i.e. address the byte by the multiplicative part of adler32 and address
140 the bits in that byte by the additive part of adler32. */
141 char flags[FLAGS_COUNT / 8];
143 /* The vector of blocks. A pos value of NO_POSITION represents an unused
149 /* Return a hash value calculated from the adler32 SUM, suitable for use with
151 static apr_uint32_t hash_func(apr_uint32_t sum)
153 /* Since the adl32 checksum have a bad distribution for the 11th to 16th
154 bits when used for our small block size, we add some bits from the
155 other half of the checksum. */
156 return sum ^ (sum >> 12);
159 /* Return the offset in BLOCKS.FLAGS for the adler32 SUM. */
160 static apr_uint32_t hash_flags(apr_uint32_t sum)
162 /* The upper half of SUM has a wider value range than the lower 16 bit.
163 Also, we want to a different folding than HASH_FUNC to minimize
164 correlation between different hash levels. */
165 return (sum >> 16) & ((FLAGS_COUNT / 8) - 1);
168 /* Insert a block with the checksum ADLERSUM at position POS in the source
169 data into the table BLOCKS. Ignore true duplicates, i.e. blocks with
170 actually the same content. */
172 add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_uint32_t pos)
174 apr_uint32_t h = hash_func(adlersum) & blocks->max;
176 /* This will terminate, since we know that we will not fill the table. */
177 for (; blocks->slots[h].pos != NO_POSITION; h = (h + 1) & blocks->max)
178 if (blocks->slots[h].adlersum == adlersum)
179 if (memcmp(blocks->data + blocks->slots[h].pos, blocks->data + pos,
180 MATCH_BLOCKSIZE) == 0)
183 blocks->slots[h].adlersum = adlersum;
184 blocks->slots[h].pos = pos;
185 blocks->flags[hash_flags(adlersum)] |= 1 << (adlersum & 7);
188 /* Find a block in BLOCKS with the checksum ADLERSUM and matching the content
189 at DATA, returning its position in the source data. If there is no such
190 block, return NO_POSITION. */
192 find_block(const struct blocks *blocks,
193 apr_uint32_t adlersum,
196 apr_uint32_t h = hash_func(adlersum) & blocks->max;
198 for (; blocks->slots[h].pos != NO_POSITION; h = (h + 1) & blocks->max)
199 if (blocks->slots[h].adlersum == adlersum)
200 if (memcmp(blocks->data + blocks->slots[h].pos, data,
201 MATCH_BLOCKSIZE) == 0)
202 return blocks->slots[h].pos;
207 /* Initialize the matches table from DATA of size DATALEN. This goes
208 through every block of MATCH_BLOCKSIZE bytes in the source and
209 checksums it, inserting the result into the BLOCKS table. */
211 init_blocks_table(const char *data,
213 struct blocks *blocks,
217 apr_size_t wnslots = 1;
221 /* Be pessimistic about the block count. */
222 nblocks = datalen / MATCH_BLOCKSIZE + 1;
223 /* Find nearest larger power of two. */
224 while (wnslots <= nblocks)
226 /* Double the number of slots to avoid a too high load. */
228 /* Narrow the number of slots to 32 bits, which is the size of the
229 block position index in the hash table.
230 Sanity check: On 64-bit platforms, apr_size_t is likely to be
231 larger than apr_uint32_t. Make sure that the number of slots
232 actually fits into blocks->max. It's safe to use a hard assert
233 here, because the largest possible value for nslots is
234 proportional to the text delta window size and is therefore much
235 smaller than the range of an apr_uint32_t. If we ever happen to
236 increase the window size too much, this assertion will get
237 triggered by the test suite. */
238 nslots = (apr_uint32_t) wnslots;
239 SVN_ERR_ASSERT_NO_RETURN(wnslots == nslots);
240 blocks->max = nslots - 1;
242 blocks->slots = apr_palloc(pool, nslots * sizeof(*(blocks->slots)));
243 for (i = 0; i < nslots; ++i)
245 /* Avoid using an indeterminate value in the lookup. */
246 blocks->slots[i].adlersum = 0;
247 blocks->slots[i].pos = NO_POSITION;
250 /* No checksum entries in SLOTS, yet => reset all checksum flags. */
251 memset(blocks->flags, 0, sizeof(blocks->flags));
253 /* If there is an odd block at the end of the buffer, we will
254 not use that shorter block for deltification (only indirectly
255 as an extension of some previous block). */
256 for (i = 0; i + MATCH_BLOCKSIZE <= datalen; i += MATCH_BLOCKSIZE)
257 add_block(blocks, init_adler32(data + i), i);
260 /* Try to find a match for the target data B in BLOCKS, and then
261 extend the match as long as data in A and B at the match position
262 continues to match. We set the position in A we ended up in (in
263 case we extended it backwards) in APOSP and update the corresponding
264 position within B given in BPOSP. PENDING_INSERT_START sets the
265 lower limit to BPOSP.
266 Return number of matching bytes starting at ASOP. Return 0 if
267 no match has been found.
270 find_match(const struct blocks *blocks,
271 const apr_uint32_t rolling,
278 apr_size_t pending_insert_start)
280 apr_size_t apos, bpos = *bposp;
281 apr_size_t delta, max_delta;
283 apos = find_block(blocks, rolling, b + bpos);
285 /* See if we have a match. */
286 if (apos == NO_POSITION)
289 /* Extend the match forward as far as possible */
290 max_delta = asize - apos - MATCH_BLOCKSIZE < bsize - bpos - MATCH_BLOCKSIZE
291 ? asize - apos - MATCH_BLOCKSIZE
292 : bsize - bpos - MATCH_BLOCKSIZE;
293 delta = svn_cstring__match_length(a + apos + MATCH_BLOCKSIZE,
294 b + bpos + MATCH_BLOCKSIZE,
297 /* See if we can extend backwards (max MATCH_BLOCKSIZE-1 steps because A's
298 content has been sampled only every MATCH_BLOCKSIZE positions). */
299 while (apos > 0 && bpos > pending_insert_start && a[apos-1] == b[bpos-1])
309 return MATCH_BLOCKSIZE + delta;
312 /* Utility for compute_delta() that compares the range B[START,BSIZE) with
313 * the range of similar size before A[ASIZE]. Create corresponding copy and
316 * BUILD_BATON and POOL will be passed through from compute_delta().
319 store_delta_trailer(svn_txdelta__ops_baton_t *build_baton,
327 apr_size_t end_match;
328 apr_size_t max_len = asize > (bsize - start) ? bsize - start : asize;
332 end_match = svn_cstring__reverse_match_length(a + asize, b + bsize,
337 if (bsize - start > end_match)
338 svn_txdelta__insert_op(build_baton, svn_txdelta_new,
339 start, bsize - start - end_match, b + start, pool);
341 svn_txdelta__insert_op(build_baton, svn_txdelta_source,
342 asize - end_match, end_match, NULL, pool);
346 /* Compute a delta from A to B using xdelta.
348 The basic xdelta algorithm is as follows:
350 1. Go through the source data, checksumming every MATCH_BLOCKSIZE
351 block of bytes using adler32, and inserting the checksum into a
352 match table with the position of the match.
353 2. Go through the target byte by byte, seeing if that byte starts a
354 match that we have in the match table.
355 2a. If so, try to extend the match as far as possible both
356 forwards and backwards, and then insert a source copy
357 operation into the delta ops builder for the match.
358 2b. If not, insert the byte as new data using an insert delta op.
360 Our implementation doesn't immediately insert "insert" operations,
361 it waits until we have another copy, or we are done. The reasoning
364 1. Otherwise, we would just be building a ton of 1 byte insert
366 2. So that we can extend a source match backwards into a pending
367 insert operation, and possibly remove the need for the insert
368 entirely. This can happen due to stream alignment.
371 compute_delta(svn_txdelta__ops_baton_t *build_baton,
378 struct blocks blocks;
379 apr_uint32_t rolling;
380 apr_size_t lo = 0, pending_insert_start = 0, upper;
382 /* Optimization: directly compare window starts. If more than 4
383 * bytes match, we can immediately create a matching windows.
384 * Shorter sequences result in a net data increase. */
385 lo = svn_cstring__match_length(a, b, asize > bsize ? bsize : asize);
386 if ((lo > 4) || (lo == bsize))
388 svn_txdelta__insert_op(build_baton, svn_txdelta_source,
390 pending_insert_start = lo;
395 /* If the size of the target is smaller than the match blocksize, just
396 insert the entire target. */
397 if ((bsize - lo < MATCH_BLOCKSIZE) || (asize < MATCH_BLOCKSIZE))
399 store_delta_trailer(build_baton, a, asize, b, bsize, lo, pool);
403 upper = bsize - MATCH_BLOCKSIZE; /* this is now known to be >= LO */
405 /* Initialize the matches table. */
406 init_blocks_table(a, asize, &blocks, pool);
408 /* Initialize our rolling checksum. */
409 rolling = init_adler32(b + lo);
415 /* Quickly skip positions whose respective ROLLING checksums
416 definitely do not match any SLOT in BLOCKS. */
417 while (!(blocks.flags[hash_flags(rolling)] & (1 << (rolling & 7)))
420 rolling = adler32_replace(rolling, b[lo], b[lo+MATCH_BLOCKSIZE]);
424 /* LO is still <= UPPER, i.e. the following lookup is legal:
425 Closely check whether we've got a match for the current location.
426 Due to the above pre-filter, chances are that we find one. */
427 matchlen = find_match(&blocks, rolling, a, asize, b, bsize,
428 &lo, &apos, pending_insert_start);
430 /* If we didn't find a real match, insert the byte at the target
431 position into the pending insert. */
434 /* move block one position forward. Short blocks at the end of
435 the buffer cannot be used as the beginning of a new match */
436 if (lo + MATCH_BLOCKSIZE < bsize)
437 rolling = adler32_replace(rolling, b[lo], b[lo+MATCH_BLOCKSIZE]);
443 /* store the sequence of B that is between the matches */
444 if (lo - pending_insert_start > 0)
445 svn_txdelta__insert_op(build_baton, svn_txdelta_new,
446 0, lo - pending_insert_start,
447 b + pending_insert_start, pool);
450 /* the match borders on the previous op. Maybe, we found a
451 * match that is better than / overlapping the previous one. */
452 apr_size_t len = svn_cstring__reverse_match_length
453 (a + apos, b + lo, apos < lo ? apos : lo);
456 len = svn_txdelta__remove_copy(build_baton, len);
463 /* Reset the pending insert start to immediately after the
466 pending_insert_start = lo;
467 svn_txdelta__insert_op(build_baton, svn_txdelta_source,
468 apos, matchlen, NULL, pool);
470 /* Calculate the Adler32 sum for the first block behind the match.
471 * Ignore short buffers at the end of B.
473 if (lo + MATCH_BLOCKSIZE <= bsize)
474 rolling = init_adler32(b + lo);
478 /* If we still have an insert pending at the end, throw it in. */
479 store_delta_trailer(build_baton, a, asize, b, bsize, pending_insert_start, pool);
483 svn_txdelta__xdelta(svn_txdelta__ops_baton_t *build_baton,
485 apr_size_t source_len,
486 apr_size_t target_len,
489 /* We should never be asked to compute something when the source_len is 0;
490 we just use a single insert op there (and rely on zlib for
492 assert(source_len != 0);
493 compute_delta(build_baton, data, source_len,
494 data + source_len, target_len,