contrib/subversion/subversion/libsvn_subr/utf8proc.c

   1 /*
   2  * utf8proc.c:  Wrappers for the utf8proc library
   3  *
   4  * ====================================================================
   5  *    Licensed to the Apache Software Foundation (ASF) under one
   6  *    or more contributor license agreements.  See the NOTICE file
   7  *    distributed with this work for additional information
   8  *    regarding copyright ownership.  The ASF licenses this file
   9  *    to you under the Apache License, Version 2.0 (the
  10  *    "License"); you may not use this file except in compliance
  11  *    with the License.  You may obtain a copy of the License at
  12  *
  13  *      http://www.apache.org/licenses/LICENSE-2.0
  14  *
  15  *    Unless required by applicable law or agreed to in writing,
  16  *    software distributed under the License is distributed on an
  17  *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  18  *    KIND, either express or implied.  See the License for the
  19  *    specific language governing permissions and limitations
  20  *    under the License.
  21  * ====================================================================
  22  */
  23
  24
  25 \f
  26 #include <apr_fnmatch.h>
  27
  28 #include "private/svn_string_private.h"
  29 #include "private/svn_utf_private.h"
  30 #include "svn_private_config.h"
  31
  32 #define UTF8PROC_INLINE
  33 /* Somehow utf8proc thinks it is nice to use strlen as an argument name,
  34    while this function is already defined via apr.h */
  35 #define strlen svn__strlen_var
  36 #include "utf8proc/utf8proc.c"
  37 #undef strlen
  38
  39 \f
  40
  41 const char *
  42 svn_utf__utf8proc_compiled_version(void)
  43 {
  44   static const char utf8proc_version[] =
  45                                   APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
  46                                   APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
  47                                   APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
  48   return utf8proc_version;
  49 }
  50
  51 const char *
  52 svn_utf__utf8proc_runtime_version(void)
  53 {
  54   /* Unused static function warning removal hack. */
  55   SVN_UNUSED(utf8proc_NFD);
  56   SVN_UNUSED(utf8proc_NFC);
  57   SVN_UNUSED(utf8proc_NFKD);
  58   SVN_UNUSED(utf8proc_NFKC);
  59
  60   return utf8proc_version();
  61 }
  62
  63
  64
  65 /* Fill the given BUFFER with decomposed UCS-4 representation of the
  66  * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
  67  * is NUL-terminated; otherwise look only at the first LENGTH bytes in
  68  * STRING. Upon return, BUFFER->data points at an array of UCS-4
  69  * characters, and return the length of the array. TRANSFORM_FLAGS
  70  * define exactly how the decomposition is performed.
  71  *
  72  * A negative return value is an utf8proc error code and may indicate
  73  * that STRING contains invalid UTF-8 or was so long that an overflow
  74  * occurred.
  75  */
  76 static ssize_t
  77 unicode_decomposition(int transform_flags,
  78                       const char *string, apr_size_t length,
  79                       svn_membuf_t *buffer)
  80 {
  81   const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
  82                         ? UTF8PROC_NULLTERM : 0);
  83
  84   for (;;)
  85     {
  86       apr_int32_t *const ucs4buf = buffer->data;
  87       const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
  88       const ssize_t result =
  89         utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
  90                            UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
  91                            | transform_flags | nullterm);
  92
  93       if (result < 0 || result <= ucs4len)
  94         return result;
  95
  96       /* Increase the decomposition buffer size and retry */
  97       svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
  98     }
  99 }
 100
 101 /* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
 102  * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
 103  * NUL-terminated; otherwise look only at the first LENGTH bytes in
 104  * STRING. Upon return, BUFFER->data points at an array of UCS-4
 105  * characters and *RESULT_LENGTH contains the length of the array.
 106  *
 107  * A returned error may indicate that STRING contains invalid UTF-8 or
 108  * invalid Unicode codepoints. Any error message comes from utf8proc.
 109  */
 110 static svn_error_t *
 111 decompose_normalized(apr_size_t *result_length,
 112                      const char *string, apr_size_t length,
 113                      svn_membuf_t *buffer)
 114 {
 115   ssize_t result = unicode_decomposition(0, string, length, buffer);
 116   if (result < 0)
 117     return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
 118                             gettext(utf8proc_errmsg(result)));
 119   *result_length = result;
 120   return SVN_NO_ERROR;
 121 }
 122
 123 /* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
 124  * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
 125  * NUL-terminated; otherwise look only at the first LENGTH bytes in
 126  * STRING. Upon return, BUFFER->data points at a NUL-terminated string
 127  * of UTF-8 characters.
 128  *
 129  * A returned error may indicate that STRING contains invalid UTF-8 or
 130  * invalid Unicode codepoints. Any error message comes from utf8proc.
 131  */
 132 static svn_error_t *
 133 normalize_cstring(apr_size_t *result_length,
 134                   const char *string, apr_size_t length,
 135                   svn_membuf_t *buffer)
 136 {
 137   ssize_t result = unicode_decomposition(0, string, length, buffer);
 138   if (result >= 0)
 139     {
 140       svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
 141       result = utf8proc_reencode(buffer->data, result,
 142                                  UTF8PROC_COMPOSE | UTF8PROC_STABLE);
 143     }
 144   if (result < 0)
 145     return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
 146                             gettext(utf8proc_errmsg(result)));
 147   *result_length = result;
 148   return SVN_NO_ERROR;
 149 }
 150
 151 /* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
 152  * length LENB. Return 0 if they're equal, a negative value if BUFA is
 153  * less than BUFB, otherwise a positive value.
 154  *
 155  * Yes, this is strcmp for known-length UCS-4 strings.
 156  */
 157 static int
 158 ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
 159         const apr_int32_t *bufb, apr_size_t lenb)
 160 {
 161   const apr_size_t len = (lena < lenb ? lena : lenb);
 162   apr_size_t i;
 163
 164   for (i = 0; i < len; ++i)
 165     {
 166       const int diff = bufa[i] - bufb[i];
 167       if (diff)
 168         return diff;
 169     }
 170   return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
 171 }
 172
 173 svn_error_t *
 174 svn_utf__normcmp(int *result,
 175                  const char *str1, apr_size_t len1,
 176                  const char *str2, apr_size_t len2,
 177                  svn_membuf_t *buf1, svn_membuf_t *buf2)
 178 {
 179   apr_size_t buflen1;
 180   apr_size_t buflen2;
 181
 182   /* Shortcut-circuit the decision if at least one of the strings is empty. */
 183   const svn_boolean_t empty1 =
 184     (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
 185   const svn_boolean_t empty2 =
 186     (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
 187   if (empty1 || empty2)
 188     {
 189       *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
 190       return SVN_NO_ERROR;
 191     }
 192
 193   SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
 194   SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
 195   *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
 196   return SVN_NO_ERROR;
 197 }
 198
 199 svn_error_t*
 200 svn_utf__normalize(const char **result,
 201                    const char *str, apr_size_t len,
 202                    svn_membuf_t *buf)
 203 {
 204   apr_size_t result_length;
 205   SVN_ERR(normalize_cstring(&result_length, str, len, buf));
 206   *result = (const char*)(buf->data);
 207   return SVN_NO_ERROR;
 208 }
 209
 210 /* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
 211  * Assume BUFFER is already filled to *LENGTH and return the new size there.
 212  * This function does *not* nul-terminate the stringbuf!
 213  *
 214  * A returned error indicates that the codepoint is invalid.
 215  */
 216 static svn_error_t *
 217 encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
 218 {
 219   apr_size_t utf8len;
 220
 221   if (buffer->size - *length < 4)
 222     svn_membuf__resize(buffer, buffer->size + 4);
 223
 224   utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length));
 225   if (!utf8len)
 226     return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
 227                              _("Invalid Unicode character U+%04lX"),
 228                              (long)ucs4chr);
 229   *length += utf8len;
 230   return SVN_NO_ERROR;
 231 }
 232
 233 svn_error_t *
 234 svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
 235                             const apr_int32_t *ucs4str,
 236                             apr_size_t length,
 237                             apr_size_t *result_length)
 238 {
 239   *result_length = 0;
 240   while (length-- > 0)
 241     SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
 242   svn_membuf__resize(buffer, *result_length + 1);
 243   ((char*)buffer->data)[*result_length] = '\0';
 244   return SVN_NO_ERROR;
 245 }
 246
 247
 248 svn_error_t *
 249 svn_utf__glob(svn_boolean_t *match,
 250               const char *pattern, apr_size_t pattern_len,
 251               const char *string, apr_size_t string_len,
 252               const char *escape, apr_size_t escape_len,
 253               svn_boolean_t sql_like,
 254               svn_membuf_t *pattern_buf,
 255               svn_membuf_t *string_buf,
 256               svn_membuf_t *temp_buf)
 257 {
 258   apr_size_t patternbuf_len;
 259   apr_size_t tempbuf_len;
 260
 261   /* If we're in GLOB mode, we don't do custom escape chars. */
 262   if (escape && !sql_like)
 263     return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
 264                             _("Cannot use a custom escape token"
 265                               " in glob matching mode"));
 266
 267   /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
 268      because apr_fnmatch can't handle it.*/
 269   SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
 270   if (!sql_like)
 271     SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
 272                                         tempbuf_len, &patternbuf_len));
 273   else
 274     {
 275       /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
 276       const apr_int32_t *like = temp_buf->data;
 277       apr_int32_t ucs4esc;
 278       svn_boolean_t escaped;
 279       apr_size_t i;
 280
 281       if (!escape)
 282         ucs4esc = -1;           /* Definitely an invalid UCS-4 character. */
 283       else
 284         {
 285           const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
 286                                 ? UTF8PROC_NULLTERM : 0);
 287           ssize_t result =
 288             utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
 289                                UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
 290           if (result < 0)
 291             return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
 292                                     gettext(utf8proc_errmsg(result)));
 293           if (result == 0 || result > 1)
 294             return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
 295                                     _("Escape token must be one character"));
 296           if ((ucs4esc & 0xFF) != ucs4esc)
 297             return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
 298                                      _("Invalid escape character U+%04lX"),
 299                                      (long)ucs4esc);
 300         }
 301
 302       patternbuf_len = 0;
 303       svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
 304       for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
 305         {
 306           if (*like == ucs4esc && !escaped)
 307             {
 308               svn_membuf__resize(pattern_buf, patternbuf_len + 1);
 309               ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
 310               escaped = TRUE;
 311             }
 312           else if (escaped)
 313             {
 314               SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
 315               escaped = FALSE;
 316             }
 317           else
 318             {
 319               if ((*like == '[' || *like == '\\') && !escaped)
 320                 {
 321                   /* Escape brackets and backslashes which are always
 322                      literals in LIKE patterns. */
 323                   svn_membuf__resize(pattern_buf, patternbuf_len + 1);
 324                   ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
 325                   escaped = TRUE;
 326                   --i; --like;
 327                   continue;
 328                 }
 329
 330               /* Replace LIKE wildcards with their GLOB equivalents. */
 331               if (*like == '%' || *like == '_')
 332                 {
 333                   const char wildcard = (*like == '%' ? '*' : '?');
 334                   svn_membuf__resize(pattern_buf, patternbuf_len + 1);
 335                   ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
 336                 }
 337               else
 338                 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
 339             }
 340         }
 341       svn_membuf__resize(pattern_buf, patternbuf_len + 1);
 342       ((char*)pattern_buf->data)[patternbuf_len] = '\0';
 343     }
 344
 345   /* Now normalize the string */
 346   SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
 347   SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
 348                                       tempbuf_len, &tempbuf_len));
 349
 350   *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
 351   return SVN_NO_ERROR;
 352 }
 353
 354 svn_boolean_t
 355 svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
 356 {
 357   svn_error_t *err;
 358   svn_membuf_t buffer;
 359   apr_size_t result_length;
 360   const apr_size_t length = strlen(string);
 361   svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
 362   err = normalize_cstring(&result_length, string, length, &buffer);
 363   if (err)
 364     {
 365       svn_error_clear(err);
 366       return FALSE;
 367     }
 368   return (length == result_length && 0 == strcmp(string, buffer.data));
 369 }
 370
 371 const char *
 372 svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
 373 {
 374   /* Hexadecimal digits for code conversion. */
 375   static const char digits[] = "0123456789ABCDEF";
 376
 377   /* Flags used for Unicode decomposition. */
 378   static const int decomp_flags = (
 379       UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
 380       | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
 381
 382   svn_stringbuf_t *result;
 383   svn_membuf_t buffer;
 384   ssize_t decomp_length;
 385   ssize_t len;
 386
 387   /* Decompose to a non-reversible compatibility format. */
 388   svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
 389   decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
 390   if (decomp_length < 0)
 391     {
 392       svn_membuf_t part;
 393       apr_size_t done, prev;
 394
 395       /* The only other error we can receive here indicates an integer
 396          overflow due to the length of the input string. Not very
 397          likely, but we certainly shouldn't continue in that case. */
 398       SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
 399
 400       /* Break the decomposition into parts that are valid UTF-8, and
 401          bytes that are not. Represent the invalid bytes in the target
 402          erray by their negative value. This works because utf8proc
 403          will not generate Unicode code points with values larger than
 404          U+10FFFF. */
 405       svn_membuf__create(&part, sizeof(apr_int32_t), pool);
 406       decomp_length = 0;
 407       done = prev = 0;
 408       while (done < length)
 409         {
 410           apr_int32_t uc;
 411
 412           while (done < length)
 413             {
 414               len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
 415               if (len < 0)
 416                 break;
 417               done += len;
 418             }
 419
 420           /* Decompose the valid part */
 421           if (done > prev)
 422             {
 423               len = unicode_decomposition(
 424                   decomp_flags, src + prev, done - prev, &part);
 425               SVN_ERR_ASSERT_NO_RETURN(len > 0);
 426               svn_membuf__resize(
 427                   &buffer, (decomp_length + len) * sizeof(apr_int32_t));
 428               memcpy((apr_int32_t*)buffer.data + decomp_length,
 429                      part.data, len * sizeof(apr_int32_t));
 430               decomp_length += len;
 431               prev = done;
 432             }
 433
 434           /* What follows could be a valid UTF-8 sequence, but not
 435              a valid Unicode character. */
 436           if (done < length)
 437             {
 438               const char *last;
 439
 440               /* Determine the length of the UTF-8 sequence */
 441               const char *const p = src + done;
 442               len = utf8proc_utf8class[(uint8_t)*p];
 443
 444               /* Check if the multi-byte sequence is valid UTF-8. */
 445               if (len > 1 && len <= (apr_ssize_t)(length - done))
 446                 last = svn_utf__last_valid(p, len);
 447               else
 448                 last = NULL;
 449
 450               /* Might not be a valid UTF-8 sequence at all */
 451               if (!last || (last && last - p < len))
 452                 {
 453                   uc = -((apr_int32_t)(*p & 0xff));
 454                   len = 1;
 455                 }
 456               else
 457                 {
 458                   switch (len)
 459                     {
 460                       /* Decode the UTF-8 sequence without validation. */
 461                     case 2:
 462                       uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
 463                       break;
 464                     case 3:
 465                       uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
 466                             + (p[2] & 0x3f));
 467                       break;
 468                     case 4:
 469                       uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
 470                             + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
 471                       break;
 472                     default:
 473                       SVN_ERR_ASSERT_NO_RETURN(
 474                           !"Unexpected invalid UTF-8 byte");
 475                     }
 476
 477                 }
 478
 479               svn_membuf__resize(
 480                   &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
 481               ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
 482               done += len;
 483               prev = done;
 484             }
 485         }
 486     }
 487
 488   /* Scan the result and deleting any combining diacriticals and
 489      inserting placeholders where any non-ascii characters remain.  */
 490   result = svn_stringbuf_create_ensure(decomp_length, pool);
 491   for (len = 0; len < decomp_length; ++len)
 492     {
 493       const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
 494       if (cp > 0 && cp < 127)
 495         svn_stringbuf_appendbyte(result, (char)cp);
 496       else if (cp == 0)
 497         svn_stringbuf_appendcstr(result, "\\0");
 498       else if (cp < 0)
 499         {
 500           const apr_int32_t rcp = ((-cp) & 0xff);
 501           svn_stringbuf_appendcstr(result, "?\\");
 502           svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
 503           svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
 504         }
 505       else
 506         {
 507           if (utf8proc_codepoint_valid(cp))
 508             {
 509               const utf8proc_property_t *prop = utf8proc_get_property(cp);
 510               if (prop->combining_class != 0)
 511                 continue;           /* Combining mark; ignore */
 512               svn_stringbuf_appendcstr(result, "{U+");
 513             }
 514           else
 515             svn_stringbuf_appendcstr(result, "{U?");
 516           if (cp > 0xffff)
 517             {
 518               svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
 519               svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
 520             }
 521           svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
 522           svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
 523           svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
 524           svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
 525           svn_stringbuf_appendbyte(result, '}');
 526         }
 527     }
 528
 529   return result->data;
 530 }