contrib/subversion/subversion/libsvn_subr/utf_validate.c

   1 /*
   2  * utf_validate.c:  Validate a UTF-8 string
   3  *
   4  * ====================================================================
   5  *    Licensed to the Apache Software Foundation (ASF) under one
   6  *    or more contributor license agreements.  See the NOTICE file
   7  *    distributed with this work for additional information
   8  *    regarding copyright ownership.  The ASF licenses this file
   9  *    to you under the Apache License, Version 2.0 (the
  10  *    "License"); you may not use this file except in compliance
  11  *    with the License.  You may obtain a copy of the License at
  12  *
  13  *      http://www.apache.org/licenses/LICENSE-2.0
  14  *
  15  *    Unless required by applicable law or agreed to in writing,
  16  *    software distributed under the License is distributed on an
  17  *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  18  *    KIND, either express or implied.  See the License for the
  19  *    specific language governing permissions and limitations
  20  *    under the License.
  21  * ====================================================================
  22  */
  23
  24 /* Validate a UTF-8 string according to the rules in
  25  *
  26  *    Table 3-6. Well-Formed UTF-8 Bytes Sequences
  27  *
  28  * in
  29  *
  30  *    The Unicode Standard, Version 4.0
  31  *
  32  * which is available at
  33  *
  34  *    http://www.unicode.org/
  35  *
  36  * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
  37  * is a subset of that enconding.  The Unicode enconding prohibits things
  38  * like non-shortest encodings (some characters can be represented by more
  39  * than one multi-byte encoding) and the encodings for the surrogate code
  40  * points.  RFC-3629 superceeds RFC-2279 and adopts the same well-formed
  41  * rules as Unicode.  This is the ABNF in RFC-3629 that describes
  42  * well-formed UTF-8 rules:
  43  *
  44  *   UTF8-octets = *( UTF8-char )
  45  *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  46  *   UTF8-1      = %x00-7F
  47  *   UTF8-2      = %xC2-DF UTF8-tail
  48  *   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
  49  *                 %xE1-EC 2( UTF8-tail ) /
  50  *                 %xED %x80-9F UTF8-tail /
  51  *                 %xEE-EF 2( UTF8-tail )
  52  *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
  53  *                 %xF1-F3 3( UTF8-tail ) /
  54  *                 %xF4 %x80-8F 2( UTF8-tail )
  55  *   UTF8-tail   = %x80-BF
  56  *
  57  */
  58
  59 #include "private/svn_utf_private.h"
  60 #include "private/svn_eol_private.h"
  61 #include "private/svn_dep_compat.h"
  62
  63 /* Lookup table to categorise each octet in the string. */
  64 static const char octet_category[256] = {
  65   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0x00-0x7f */
  66   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  67   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  68   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  69   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  70   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  71   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  72   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  73   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, /* 0x80-0x8f */
  74   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, /* 0x90-0x9f */
  75   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, /* 0xa0-0xbf */
  76   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
  77   4,  4,                                                         /* 0xc0-0xc1 */
  78           5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, /* 0xc2-0xdf */
  79   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
  80   6,                                                             /* 0xe0 */
  81       7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,             /* 0xe1-0xec */
  82                                                       8,         /* 0xed */
  83                                                           9,  9, /* 0xee-0xef */
  84   10,                                                            /* 0xf0 */
  85       11, 11, 11,                                                /* 0xf1-0xf3 */
  86                   12,                                            /* 0xf4 */
  87                       13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
  88 };
  89
  90 /* Machine states */
  91 #define FSM_START         0
  92 #define FSM_80BF          1
  93 #define FSM_A0BF          2
  94 #define FSM_80BF80BF      3
  95 #define FSM_809F          4
  96 #define FSM_90BF          5
  97 #define FSM_80BF80BF80BF  6
  98 #define FSM_808F          7
  99 #define FSM_ERROR         8
 100
 101 /* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
 102    same transitions, as do categories 0xe1-0xec and 0xee-0xef.  I wonder if
 103    there is any great benefit in combining categories?  It would reduce the
 104    memory footprint of the transition table by 16 bytes, but might it be
 105    harder to understand?  */
 106
 107 /* Machine transition table */
 108 static const char machine [9][14] = {
 109   /* FSM_START */
 110   {FSM_START,         /* 0x00-0x7f */
 111    FSM_ERROR,         /* 0x80-0x8f */
 112    FSM_ERROR,         /* 0x90-0x9f */
 113    FSM_ERROR,         /* 0xa0-0xbf */
 114    FSM_ERROR,         /* 0xc0-0xc1 */
 115    FSM_80BF,          /* 0xc2-0xdf */
 116    FSM_A0BF,          /* 0xe0 */
 117    FSM_80BF80BF,      /* 0xe1-0xec */
 118    FSM_809F,          /* 0xed */
 119    FSM_80BF80BF,      /* 0xee-0xef */
 120    FSM_90BF,          /* 0xf0 */
 121    FSM_80BF80BF80BF,  /* 0xf1-0xf3 */
 122    FSM_808F,          /* 0xf4 */
 123    FSM_ERROR},        /* 0xf5-0xff */
 124
 125   /* FSM_80BF */
 126   {FSM_ERROR,         /* 0x00-0x7f */
 127    FSM_START,         /* 0x80-0x8f */
 128    FSM_START,         /* 0x90-0x9f */
 129    FSM_START,         /* 0xa0-0xbf */
 130    FSM_ERROR,         /* 0xc0-0xc1 */
 131    FSM_ERROR,         /* 0xc2-0xdf */
 132    FSM_ERROR,         /* 0xe0 */
 133    FSM_ERROR,         /* 0xe1-0xec */
 134    FSM_ERROR,         /* 0xed */
 135    FSM_ERROR,         /* 0xee-0xef */
 136    FSM_ERROR,         /* 0xf0 */
 137    FSM_ERROR,         /* 0xf1-0xf3 */
 138    FSM_ERROR,         /* 0xf4 */
 139    FSM_ERROR},        /* 0xf5-0xff */
 140
 141   /* FSM_A0BF */
 142   {FSM_ERROR,         /* 0x00-0x7f */
 143    FSM_ERROR,         /* 0x80-0x8f */
 144    FSM_ERROR,         /* 0x90-0x9f */
 145    FSM_80BF,          /* 0xa0-0xbf */
 146    FSM_ERROR,         /* 0xc0-0xc1 */
 147    FSM_ERROR,         /* 0xc2-0xdf */
 148    FSM_ERROR,         /* 0xe0 */
 149    FSM_ERROR,         /* 0xe1-0xec */
 150    FSM_ERROR,         /* 0xed */
 151    FSM_ERROR,         /* 0xee-0xef */
 152    FSM_ERROR,         /* 0xf0 */
 153    FSM_ERROR,         /* 0xf1-0xf3 */
 154    FSM_ERROR,         /* 0xf4 */
 155    FSM_ERROR},        /* 0xf5-0xff */
 156
 157   /* FSM_80BF80BF */
 158   {FSM_ERROR,         /* 0x00-0x7f */
 159    FSM_80BF,          /* 0x80-0x8f */
 160    FSM_80BF,          /* 0x90-0x9f */
 161    FSM_80BF,          /* 0xa0-0xbf */
 162    FSM_ERROR,         /* 0xc0-0xc1 */
 163    FSM_ERROR,         /* 0xc2-0xdf */
 164    FSM_ERROR,         /* 0xe0 */
 165    FSM_ERROR,         /* 0xe1-0xec */
 166    FSM_ERROR,         /* 0xed */
 167    FSM_ERROR,         /* 0xee-0xef */
 168    FSM_ERROR,         /* 0xf0 */
 169    FSM_ERROR,         /* 0xf1-0xf3 */
 170    FSM_ERROR,         /* 0xf4 */
 171    FSM_ERROR},        /* 0xf5-0xff */
 172
 173   /* FSM_809F */
 174   {FSM_ERROR,         /* 0x00-0x7f */
 175    FSM_80BF,          /* 0x80-0x8f */
 176    FSM_80BF,          /* 0x90-0x9f */
 177    FSM_ERROR,         /* 0xa0-0xbf */
 178    FSM_ERROR,         /* 0xc0-0xc1 */
 179    FSM_ERROR,         /* 0xc2-0xdf */
 180    FSM_ERROR,         /* 0xe0 */
 181    FSM_ERROR,         /* 0xe1-0xec */
 182    FSM_ERROR,         /* 0xed */
 183    FSM_ERROR,         /* 0xee-0xef */
 184    FSM_ERROR,         /* 0xf0 */
 185    FSM_ERROR,         /* 0xf1-0xf3 */
 186    FSM_ERROR,         /* 0xf4 */
 187    FSM_ERROR},        /* 0xf5-0xff */
 188
 189   /* FSM_90BF */
 190   {FSM_ERROR,         /* 0x00-0x7f */
 191    FSM_ERROR,         /* 0x80-0x8f */
 192    FSM_80BF80BF,      /* 0x90-0x9f */
 193    FSM_80BF80BF,      /* 0xa0-0xbf */
 194    FSM_ERROR,         /* 0xc0-0xc1 */
 195    FSM_ERROR,         /* 0xc2-0xdf */
 196    FSM_ERROR,         /* 0xe0 */
 197    FSM_ERROR,         /* 0xe1-0xec */
 198    FSM_ERROR,         /* 0xed */
 199    FSM_ERROR,         /* 0xee-0xef */
 200    FSM_ERROR,         /* 0xf0 */
 201    FSM_ERROR,         /* 0xf1-0xf3 */
 202    FSM_ERROR,         /* 0xf4 */
 203    FSM_ERROR},        /* 0xf5-0xff */
 204
 205   /* FSM_80BF80BF80BF */
 206   {FSM_ERROR,         /* 0x00-0x7f */
 207    FSM_80BF80BF,      /* 0x80-0x8f */
 208    FSM_80BF80BF,      /* 0x90-0x9f */
 209    FSM_80BF80BF,      /* 0xa0-0xbf */
 210    FSM_ERROR,         /* 0xc0-0xc1 */
 211    FSM_ERROR,         /* 0xc2-0xdf */
 212    FSM_ERROR,         /* 0xe0 */
 213    FSM_ERROR,         /* 0xe1-0xec */
 214    FSM_ERROR,         /* 0xed */
 215    FSM_ERROR,         /* 0xee-0xef */
 216    FSM_ERROR,         /* 0xf0 */
 217    FSM_ERROR,         /* 0xf1-0xf3 */
 218    FSM_ERROR,         /* 0xf4 */
 219    FSM_ERROR},        /* 0xf5-0xff */
 220
 221   /* FSM_808F */
 222   {FSM_ERROR,         /* 0x00-0x7f */
 223    FSM_80BF80BF,      /* 0x80-0x8f */
 224    FSM_ERROR,         /* 0x90-0x9f */
 225    FSM_ERROR,         /* 0xa0-0xbf */
 226    FSM_ERROR,         /* 0xc0-0xc1 */
 227    FSM_ERROR,         /* 0xc2-0xdf */
 228    FSM_ERROR,         /* 0xe0 */
 229    FSM_ERROR,         /* 0xe1-0xec */
 230    FSM_ERROR,         /* 0xed */
 231    FSM_ERROR,         /* 0xee-0xef */
 232    FSM_ERROR,         /* 0xf0 */
 233    FSM_ERROR,         /* 0xf1-0xf3 */
 234    FSM_ERROR,         /* 0xf4 */
 235    FSM_ERROR},        /* 0xf5-0xff */
 236
 237   /* FSM_ERROR */
 238   {FSM_ERROR,         /* 0x00-0x7f */
 239    FSM_ERROR,         /* 0x80-0x8f */
 240    FSM_ERROR,         /* 0x90-0x9f */
 241    FSM_ERROR,         /* 0xa0-0xbf */
 242    FSM_ERROR,         /* 0xc0-0xc1 */
 243    FSM_ERROR,         /* 0xc2-0xdf */
 244    FSM_ERROR,         /* 0xe0 */
 245    FSM_ERROR,         /* 0xe1-0xec */
 246    FSM_ERROR,         /* 0xed */
 247    FSM_ERROR,         /* 0xee-0xef */
 248    FSM_ERROR,         /* 0xf0 */
 249    FSM_ERROR,         /* 0xf1-0xf3 */
 250    FSM_ERROR,         /* 0xf4 */
 251    FSM_ERROR},        /* 0xf5-0xff */
 252 };
 253
 254 /* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
 255  * category 0 (FSM_START).  Return the position of the first such char
 256  * or DATA + MAX_LEN if all were cat 0.
 257  */
 258 static const char *
 259 first_non_fsm_start_char(const char *data, apr_size_t max_len)
 260 {
 261 #if !SVN_UNALIGNED_ACCESS_IS_OK
 262
 263   /* On some systems, we need to make sure that buf is properly aligned
 264    * for chunky data access.
 265    */
 266   if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
 267     {
 268       apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
 269       if (len > max_len)
 270         len = max_len;
 271       max_len -= len;
 272
 273       for (; len > 0; ++data, --len)
 274         if (*data < 0 || *data >= 0x80)
 275           return data;
 276     }
 277
 278 #endif
 279
 280   /* Scan the input one machine word at a time. */
 281   for (; max_len > sizeof(apr_uintptr_t)
 282        ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
 283     if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
 284       break;
 285
 286   /* The remaining odd bytes will be examined the naive way: */
 287   for (; max_len > 0; ++data, --max_len)
 288     if (*data < 0 || *data >= 0x80)
 289       break;
 290
 291   return data;
 292 }
 293
 294 /* Scan the C string in *DATA for chars that are not in the octet
 295  * category 0 (FSM_START).  Return the position of either the such
 296  * char or of the terminating NUL.
 297  */
 298 static const char *
 299 first_non_fsm_start_char_cstring(const char *data)
 300 {
 301   /* We need to make sure that BUF is properly aligned for chunky data
 302    * access because we don't know the string's length. Unaligned chunk
 303    * read access beyond the NUL terminator could therefore result in a
 304    * segfault.
 305    */
 306   for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
 307     if (*data <= 0 || *data >= 0x80)
 308       return data;
 309
 310   /* Scan the input one machine word at a time. */
 311 #ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
 312   /* This may read allocated but initialised bytes beyond the
 313      terminating null.  Any such bytes are always readable and this
 314      code operates correctly whatever the uninitialised values happen
 315      to be.  However memory checking tools such as valgrind and GCC
 316      4.8's address santitizer will object so this bit of code can be
 317      disabled at compile time. */
 318   for (; ; data += sizeof(apr_uintptr_t))
 319     {
 320       /* Check for non-ASCII chars: */
 321       apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
 322       if (chunk & SVN__BIT_7_SET)
 323         break;
 324
 325       /* This is the well-known strlen test: */
 326       chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
 327       if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
 328         break;
 329     }
 330 #endif
 331
 332   /* The remaining odd bytes will be examined the naive way: */
 333   for (; ; ++data)
 334     if (*data <= 0 || *data >= 0x80)
 335       break;
 336
 337   return data;
 338 }
 339
 340 const char *
 341 svn_utf__last_valid(const char *data, apr_size_t len)
 342 {
 343   const char *start = first_non_fsm_start_char(data, len);
 344   const char *end = data + len;
 345   int state = FSM_START;
 346
 347   data = start;
 348   while (data < end)
 349     {
 350       unsigned char octet = *data++;
 351       int category = octet_category[octet];
 352       state = machine[state][category];
 353       if (state == FSM_START)
 354         start = data;
 355     }
 356   return start;
 357 }
 358
 359 svn_boolean_t
 360 svn_utf__cstring_is_valid(const char *data)
 361 {
 362   int state = FSM_START;
 363
 364   if (!data)
 365     return FALSE;
 366
 367   data = first_non_fsm_start_char_cstring(data);
 368
 369   while (*data)
 370     {
 371       unsigned char octet = *data++;
 372       int category = octet_category[octet];
 373       state = machine[state][category];
 374     }
 375   return state == FSM_START;
 376 }
 377
 378 svn_boolean_t
 379 svn_utf__is_valid(const char *data, apr_size_t len)
 380 {
 381   const char *end = data + len;
 382   int state = FSM_START;
 383
 384   if (!data)
 385     return FALSE;
 386
 387   data = first_non_fsm_start_char(data, len);
 388
 389   while (data < end)
 390     {
 391       unsigned char octet = *data++;
 392       int category = octet_category[octet];
 393       state = machine[state][category];
 394     }
 395   return state == FSM_START;
 396 }
 397
 398 const char *
 399 svn_utf__last_valid2(const char *data, apr_size_t len)
 400 {
 401   const char *start = first_non_fsm_start_char(data, len);
 402   const char *end = data + len;
 403   int state = FSM_START;
 404
 405   data = start;
 406   while (data < end)
 407     {
 408       unsigned char octet = *data++;
 409       switch (state)
 410         {
 411         case FSM_START:
 412           if (octet <= 0x7F)
 413             break;
 414           else if (octet <= 0xC1)
 415             state = FSM_ERROR;
 416           else if (octet <= 0xDF)
 417             state = FSM_80BF;
 418           else if (octet == 0xE0)
 419             state = FSM_A0BF;
 420           else if (octet <= 0xEC)
 421             state = FSM_80BF80BF;
 422           else if (octet == 0xED)
 423             state = FSM_809F;
 424           else if (octet <= 0xEF)
 425             state = FSM_80BF80BF;
 426           else if (octet == 0xF0)
 427             state = FSM_90BF;
 428           else if (octet <= 0xF3)
 429             state = FSM_80BF80BF80BF;
 430           else if (octet <= 0xF4)
 431             state = FSM_808F;
 432           else
 433             state = FSM_ERROR;
 434           break;
 435         case FSM_80BF:
 436           if (octet >= 0x80 && octet <= 0xBF)
 437             state = FSM_START;
 438           else
 439             state = FSM_ERROR;
 440           break;
 441         case FSM_A0BF:
 442           if (octet >= 0xA0 && octet <= 0xBF)
 443             state = FSM_80BF;
 444           else
 445             state = FSM_ERROR;
 446           break;
 447         case FSM_80BF80BF:
 448           if (octet >= 0x80 && octet <= 0xBF)
 449             state = FSM_80BF;
 450           else
 451             state = FSM_ERROR;
 452           break;
 453         case FSM_809F:
 454           if (octet >= 0x80 && octet <= 0x9F)
 455             state = FSM_80BF;
 456           else
 457             state = FSM_ERROR;
 458           break;
 459         case FSM_90BF:
 460           if (octet >= 0x90 && octet <= 0xBF)
 461             state = FSM_80BF80BF;
 462           else
 463             state = FSM_ERROR;
 464           break;
 465         case FSM_80BF80BF80BF:
 466           if (octet >= 0x80 && octet <= 0xBF)
 467             state = FSM_80BF80BF;
 468           else
 469             state = FSM_ERROR;
 470           break;
 471         case FSM_808F:
 472           if (octet >= 0x80 && octet <= 0x8F)
 473             state = FSM_80BF80BF;
 474           else
 475             state = FSM_ERROR;
 476           break;
 477         default:
 478         case FSM_ERROR:
 479           return start;
 480         }
 481       if (state == FSM_START)
 482         start = data;
 483     }
 484   return start;
 485 }