contrib/subversion/subversion/libsvn_subr/utf8proc/utf8proc.c

   1 /*
   2  *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
   3  *
   4  *  Permission is hereby granted, free of charge, to any person obtaining a
   5  *  copy of this software and associated documentation files (the "Software"),
   6  *  to deal in the Software without restriction, including without limitation
   7  *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  *  and/or sell copies of the Software, and to permit persons to whom the
   9  *  Software is furnished to do so, subject to the following conditions:
  10  *
  11  *  The above copyright notice and this permission notice shall be included in
  12  *  all copies or substantial portions of the Software.
  13  *
  14  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19  *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20  *  DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 /*
  24  *  This library contains derived data from a modified version of the
  25  *  Unicode data files.
  26  *
  27  *  The original data files are available at
  28  *  http://www.unicode.org/Public/UNIDATA/
  29  *
  30  *  Please notice the copyright statement in the file "utf8proc_data.c".
  31  */
  32
  33
  34 /*
  35  *  File name:    utf8proc.c
  36  *
  37  *  Description:
  38  *  Implementation of libutf8proc.
  39  */
  40
  41
  42 #include "utf8proc.h"
  43 #include "utf8proc_data.c"
  44
  45
  46 UTF8PROC_DATA
  47 const int8_t utf8proc_utf8class[256] = {
  48   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  49   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  50   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  51   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  57   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  58   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  59   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  60   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  61   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  62   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  63   4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
  64
/*
 * Constants for the arithmetic decomposition and composition of
 * precomposed Hangul syllables (used by utf8proc_decompose_char and
 * utf8proc_reencode).
 *
 * xBASE is the code point that index 0 of the respective sequence maps
 * to, xCOUNT the number of indices.  TBASE is one below the first
 * trailing consonant, so trailing index 0 means "no trailing consonant".
 * NCOUNT equals VCOUNT * TCOUNT and SCOUNT equals LCOUNT * NCOUNT.
 */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/*
 * Code point ranges used to classify Hangul characters when looking for
 * grapheme cluster boundaries (UTF8PROC_CHARBOUND).
 * END is exclusive
 */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
  84
  85
/*
 * Grapheme cluster boundary classes.  utf8proc_decompose_char assigns one
 * of these to every code point it processes with UTF8PROC_CHARBOUND set
 * and compares it with the class of the preceding code point to decide
 * whether a boundary lies between the two.  START is the initial state
 * before any code point has been seen.
 */
#define UTF8PROC_BOUNDCLASS_START    0
#define UTF8PROC_BOUNDCLASS_OTHER    1
#define UTF8PROC_BOUNDCLASS_CR       2
#define UTF8PROC_BOUNDCLASS_LF       3
#define UTF8PROC_BOUNDCLASS_CONTROL  4
#define UTF8PROC_BOUNDCLASS_EXTEND   5
#define UTF8PROC_BOUNDCLASS_L        6
#define UTF8PROC_BOUNDCLASS_V        7
#define UTF8PROC_BOUNDCLASS_T        8
#define UTF8PROC_BOUNDCLASS_LV       9
#define UTF8PROC_BOUNDCLASS_LVT     10
  97
  98
  99 UTF8PROC_API
 100 const char *utf8proc_version(void) {
 101   return "1.1.5";
 102 }
 103
/*
 * This macro tells translators that string X should be translated,
 * but does not look up the translation at run time.  This is standard
 * GNU gettext notation for annotating compile-time constant strings.
 *
 * The fallback below is only used when N_ has not been defined already;
 * it expands to its argument unchanged.
 */
#ifndef N_
#define N_(x) x
#endif
 112
 113 UTF8PROC_API
 114 const char *utf8proc_errmsg(ssize_t errcode) {
 115   switch (errcode) {
 116     case UTF8PROC_ERROR_NOMEM:
 117     return N_("Memory for processing UTF-8 data could not be allocated.");
 118     case UTF8PROC_ERROR_OVERFLOW:
 119     return N_("UTF-8 string is too long to be processed.");
 120     case UTF8PROC_ERROR_INVALIDUTF8:
 121     return N_("Invalid UTF-8 string");
 122     case UTF8PROC_ERROR_NOTASSIGNED:
 123     return N_("Unassigned Unicode code point found in UTF-8 string.");
 124     case UTF8PROC_ERROR_INVALIDOPTS:
 125     return N_("Invalid options for UTF-8 processing chosen.");
 126     default:
 127     return N_("An unknown error occured while processing UTF-8 data.");
 128   }
 129 }
 130
 131 UTF8PROC_API
 132 ssize_t utf8proc_iterate(
 133   const uint8_t *str, ssize_t strlen, int32_t *dst
 134 ) {
 135   int length;
 136   int i;
 137   int32_t uc = -1;
 138   *dst = -1;
 139   if (!strlen) return 0;
 140   length = utf8proc_utf8class[str[0]];
 141   if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
 142   if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
 143   for (i=1; i<length; i++) {
 144     if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
 145   }
 146   switch (length) {
 147     case 1:
 148     uc = str[0];
 149     break;
 150     case 2:
 151     uc = ((str[0] & 0x1F) <<  6) + (str[1] & 0x3F);
 152     if (uc < 0x80) uc = -1;
 153     break;
 154     case 3:
 155     uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) <<  6)
 156       + (str[2] & 0x3F);
 157     if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
 158       (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
 159     break;
 160     case 4:
 161     uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
 162       + ((str[2] & 0x3F) <<  6) + (str[3] & 0x3F);
 163     if (uc < 0x10000 || uc >= 0x110000) uc = -1;
 164     break;
 165   }
 166   if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
 167     return UTF8PROC_ERROR_INVALIDUTF8;
 168   *dst = uc;
 169   return length;
 170 }
 171
 172 UTF8PROC_API
 173 bool utf8proc_codepoint_valid(int32_t uc) {
 174   if (uc < 0 || uc >= 0x110000 ||
 175     ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
 176     (uc >= 0xFDD0 && uc < 0xFDF0)) return false;
 177   else return true;
 178 }
 179
 180 UTF8PROC_API
 181 ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
 182   if (uc < 0x00) {
 183     return 0;
 184   } else if (uc < 0x80) {
 185     dst[0] = (uint8_t)uc;
 186     return 1;
 187   } else if (uc < 0x800) {
 188     dst[0] = 0xC0 + (uint8_t)(uc >> 6);
 189     dst[1] = 0x80 + (uc & 0x3F);
 190     return 2;
 191   } else if (uc == 0xFFFF) {
 192     dst[0] = 0xFF;
 193     return 1;
 194   } else if (uc == 0xFFFE) {
 195     dst[0] = 0xFE;
 196     return 1;
 197   } else if (uc < 0x10000) {
 198     dst[0] = 0xE0 + (uint8_t)(uc >> 12);
 199     dst[1] = 0x80 + ((uc >> 6) & 0x3F);
 200     dst[2] = 0x80 + (uc & 0x3F);
 201     return 3;
 202   } else if (uc < 0x110000) {
 203     dst[0] = 0xF0 + (uint8_t)(uc >> 18);
 204     dst[1] = 0x80 + ((uc >> 12) & 0x3F);
 205     dst[2] = 0x80 + ((uc >> 6) & 0x3F);
 206     dst[3] = 0x80 + (uc & 0x3F);
 207     return 4;
 208   } else return 0;
 209 }
 210
 211 UTF8PROC_API
 212 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
 213   /* ASSERT: uc >= 0 && uc < 0x110000 */
 214   return utf8proc_properties + (
 215     utf8proc_stage2table[
 216       utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
 217     ]
 218   );
 219 }
 220
/*
 * Helper for utf8proc_decompose_char only.  Expands to a 'return'
 * statement that processes 'replacement_uc' in place of the current code
 * point, with UTF8PROC_LUMP cleared so the replacement is not lumped
 * again.  Relies on the locals 'dst', 'bufsize', 'options' and
 * 'last_boundclass' of the enclosing function.
 */
#define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
  options & ~UTF8PROC_LUMP, last_boundclass)
 224
 225 UTF8PROC_API
 226 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
 227     int options, int *last_boundclass) {
 228   /* ASSERT: uc >= 0 && uc < 0x110000 */
 229   const utf8proc_property_t *property;
 230   utf8proc_propval_t category;
 231   int32_t hangul_sindex;
 232   property = utf8proc_get_property(uc);
 233   category = property->category;
 234   hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
 235   if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
 236     if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
 237       int32_t hangul_tindex;
 238       if (bufsize >= 1) {
 239         dst[0] = UTF8PROC_HANGUL_LBASE +
 240           hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
 241         if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
 242           (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
 243       }
 244       hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
 245       if (!hangul_tindex) return 2;
 246       if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
 247       return 3;
 248     }
 249   }
 250   if (options & UTF8PROC_REJECTNA) {
 251     if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
 252   }
 253   if (options & UTF8PROC_IGNORE) {
 254     if (property->ignorable) return 0;
 255   }
 256   if (options & UTF8PROC_LUMP) {
 257     if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
 258     if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
 259       utf8proc_decompose_lump(0x0027);
 260     if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
 261       utf8proc_decompose_lump(0x002D);
 262     if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
 263     if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
 264     if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
 265       utf8proc_decompose_lump(0x003C);
 266     if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
 267       utf8proc_decompose_lump(0x003E);
 268     if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
 269     if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
 270       utf8proc_decompose_lump(0x005E);
 271     if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
 272       utf8proc_decompose_lump(0x005F);
 273     if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
 274     if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
 275     if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
 276     if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
 277       if (category == UTF8PROC_CATEGORY_ZL ||
 278           category == UTF8PROC_CATEGORY_ZP)
 279         utf8proc_decompose_lump(0x000A);
 280     }
 281   }
 282   if (options & UTF8PROC_STRIPMARK) {
 283     if (category == UTF8PROC_CATEGORY_MN ||
 284       category == UTF8PROC_CATEGORY_MC ||
 285       category == UTF8PROC_CATEGORY_ME) return 0;
 286   }
 287   if (options & UTF8PROC_CASEFOLD) {
 288     if (property->casefold_mapping) {
 289       const int32_t *casefold_entry;
 290       ssize_t written = 0;
 291       for (casefold_entry = property->casefold_mapping;
 292           *casefold_entry >= 0; casefold_entry++) {
 293         written += utf8proc_decompose_char(*casefold_entry, dst+written,
 294           (bufsize > written) ? (bufsize - written) : 0, options,
 295           last_boundclass);
 296         if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
 297       }
 298       return written;
 299     }
 300   }
 301   if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
 302     if (property->decomp_mapping &&
 303         (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
 304       const int32_t *decomp_entry;
 305       ssize_t written = 0;
 306       for (decomp_entry = property->decomp_mapping;
 307           *decomp_entry >= 0; decomp_entry++) {
 308         written += utf8proc_decompose_char(*decomp_entry, dst+written,
 309           (bufsize > written) ? (bufsize - written) : 0, options,
 310         last_boundclass);
 311         if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
 312       }
 313       return written;
 314     }
 315   }
 316   if (options & UTF8PROC_CHARBOUND) {
 317     bool boundary;
 318     int tbc, lbc;
 319     tbc =
 320       (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
 321       (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
 322       ((category == UTF8PROC_CATEGORY_ZL ||
 323         category == UTF8PROC_CATEGORY_ZP ||
 324         category == UTF8PROC_CATEGORY_CC ||
 325         category == UTF8PROC_CATEGORY_CF) &&
 326         !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
 327       property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
 328       ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
 329         uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
 330       (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
 331         UTF8PROC_BOUNDCLASS_V :
 332       (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
 333         UTF8PROC_BOUNDCLASS_T :
 334       (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
 335         ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
 336           UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
 337       ) :
 338       UTF8PROC_BOUNDCLASS_OTHER;
 339     lbc = *last_boundclass;
 340     boundary =
 341       (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
 342       (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
 343       (lbc == UTF8PROC_BOUNDCLASS_CR &&
 344        tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
 345       (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
 346       (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
 347       (lbc == UTF8PROC_BOUNDCLASS_L &&
 348        (tbc == UTF8PROC_BOUNDCLASS_L ||
 349         tbc == UTF8PROC_BOUNDCLASS_V ||
 350         tbc == UTF8PROC_BOUNDCLASS_LV ||
 351         tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
 352       ((lbc == UTF8PROC_BOUNDCLASS_LV ||
 353         lbc == UTF8PROC_BOUNDCLASS_V) &&
 354        (tbc == UTF8PROC_BOUNDCLASS_V ||
 355         tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
 356       ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
 357         lbc == UTF8PROC_BOUNDCLASS_T) &&
 358        tbc == UTF8PROC_BOUNDCLASS_T) ? false :
 359        true;
 360     *last_boundclass = tbc;
 361     if (boundary) {
 362       if (bufsize >= 1) dst[0] = 0xFFFF;
 363       if (bufsize >= 2) dst[1] = uc;
 364       return 2;
 365     }
 366   }
 367   if (bufsize >= 1) *dst = uc;
 368   return 1;
 369 }
 370
 371 UTF8PROC_API
 372 ssize_t utf8proc_decompose(
 373   const uint8_t *str, ssize_t strlen,
 374   int32_t *buffer, ssize_t bufsize, int options
 375 ) {
 376   /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
 377   ssize_t wpos = 0;
 378   if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
 379     return UTF8PROC_ERROR_INVALIDOPTS;
 380   if ((options & UTF8PROC_STRIPMARK) &&
 381       !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
 382     return UTF8PROC_ERROR_INVALIDOPTS;
 383   {
 384     int32_t uc;
 385     ssize_t rpos = 0;
 386     ssize_t decomp_result;
 387     int boundclass = UTF8PROC_BOUNDCLASS_START;
 388     while (1) {
 389       if (options & UTF8PROC_NULLTERM) {
 390         rpos += utf8proc_iterate(str + rpos, -1, &uc);
 391         /* checking of return value is not neccessary,
 392            as 'uc' is < 0 in case of error */
 393         if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
 394         if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
 395         if (uc == 0) break;
 396       } else {
 397         if (rpos >= strlen) break;
 398         rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
 399         if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
 400       }
 401       decomp_result = utf8proc_decompose_char(
 402         uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
 403         &boundclass
 404       );
 405       if (decomp_result < 0) return decomp_result;
 406       wpos += decomp_result;
 407       /* prohibiting integer overflows due to too long strings: */
 408       if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
 409         return UTF8PROC_ERROR_OVERFLOW;
 410     }
 411   }
 412   if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
 413     ssize_t pos = 0;
 414     while (pos < wpos-1) {
 415       int32_t uc1, uc2;
 416       const utf8proc_property_t *property1, *property2;
 417       uc1 = buffer[pos];
 418       uc2 = buffer[pos+1];
 419       property1 = utf8proc_get_property(uc1);
 420       property2 = utf8proc_get_property(uc2);
 421       if (property1->combining_class > property2->combining_class &&
 422           property2->combining_class > 0) {
 423         buffer[pos] = uc2;
 424         buffer[pos+1] = uc1;
 425         if (pos > 0) pos--; else pos++;
 426       } else {
 427         pos++;
 428       }
 429     }
 430   }
 431   return wpos;
 432 }
 433
 434 UTF8PROC_API
 435 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
 436   /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
 437      ASSERT: 'buffer' has one spare byte of free space at the end! */
 438   if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
 439     ssize_t rpos;
 440     ssize_t wpos = 0;
 441     int32_t uc;
 442     for (rpos = 0; rpos < length; rpos++) {
 443       uc = buffer[rpos];
 444       if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
 445       if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
 446           ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
 447         if (options & UTF8PROC_NLF2LS) {
 448           if (options & UTF8PROC_NLF2PS) {
 449             buffer[wpos++] = 0x000A;
 450           } else {
 451             buffer[wpos++] = 0x2028;
 452           }
 453         } else {
 454           if (options & UTF8PROC_NLF2PS) {
 455             buffer[wpos++] = 0x2029;
 456           } else {
 457             buffer[wpos++] = 0x0020;
 458           }
 459         }
 460       } else if ((options & UTF8PROC_STRIPCC) &&
 461           (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
 462         if (uc == 0x0009) buffer[wpos++] = 0x0020;
 463       } else {
 464         buffer[wpos++] = uc;
 465       }
 466     }
 467     length = wpos;
 468   }
 469   if (options & UTF8PROC_COMPOSE) {
 470     int32_t *starter = NULL;
 471     int32_t current_char;
 472     const utf8proc_property_t *starter_property = NULL, *current_property;
 473     utf8proc_propval_t max_combining_class = -1;
 474     ssize_t rpos;
 475     ssize_t wpos = 0;
 476     int32_t composition;
 477     for (rpos = 0; rpos < length; rpos++) {
 478       current_char = buffer[rpos];
 479       current_property = utf8proc_get_property(current_char);
 480       if (starter && current_property->combining_class > max_combining_class) {
 481         /* combination perhaps possible */
 482         int32_t hangul_lindex;
 483         int32_t hangul_sindex;
 484         hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
 485         if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
 486           int32_t hangul_vindex;
 487           hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
 488           if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
 489             *starter = UTF8PROC_HANGUL_SBASE +
 490               (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
 491               UTF8PROC_HANGUL_TCOUNT;
 492             starter_property = NULL;
 493             continue;
 494           }
 495         }
 496         hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
 497         if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
 498             (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
 499           int32_t hangul_tindex;
 500           hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
 501           if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
 502             *starter += hangul_tindex;
 503             starter_property = NULL;
 504             continue;
 505           }
 506         }
 507         if (!starter_property) {
 508           starter_property = utf8proc_get_property(*starter);
 509         }
 510         if (starter_property->comb1st_index >= 0 &&
 511             current_property->comb2nd_index >= 0) {
 512           composition = utf8proc_combinations[
 513             starter_property->comb1st_index +
 514             current_property->comb2nd_index
 515           ];
 516           if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
 517               !(utf8proc_get_property(composition)->comp_exclusion))) {
 518             *starter = composition;
 519             starter_property = NULL;
 520             continue;
 521           }
 522         }
 523       }
 524       buffer[wpos] = current_char;
 525       if (current_property->combining_class) {
 526         if (current_property->combining_class > max_combining_class) {
 527           max_combining_class = current_property->combining_class;
 528         }
 529       } else {
 530         starter = buffer + wpos;
 531         starter_property = NULL;
 532         max_combining_class = -1;
 533       }
 534       wpos++;
 535     }
 536     length = wpos;
 537   }
 538   {
 539     ssize_t rpos, wpos = 0;
 540     int32_t uc;
 541     for (rpos = 0; rpos < length; rpos++) {
 542       uc = buffer[rpos];
 543       wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
 544     }
 545     ((uint8_t *)buffer)[wpos] = 0;
 546     return wpos;
 547   }
 548 }
 549
 550 UTF8PROC_API
 551 ssize_t utf8proc_map(
 552   const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
 553 ) {
 554   int32_t *buffer;
 555   ssize_t result;
 556   *dstptr = NULL;
 557   result = utf8proc_decompose(str, strlen, NULL, 0, options);
 558   if (result < 0) return result;
 559   buffer = malloc(result * sizeof(int32_t) + 1);
 560   if (!buffer) return UTF8PROC_ERROR_NOMEM;
 561   result = utf8proc_decompose(str, strlen, buffer, result, options);
 562   if (result < 0) {
 563     free(buffer);
 564     return result;
 565   }
 566   result = utf8proc_reencode(buffer, result, options);
 567   if (result < 0) {
 568     free(buffer);
 569     return result;
 570   }
 571   {
 572     int32_t *newptr;
 573     newptr = realloc(buffer, (size_t)result+1);
 574     if (newptr) buffer = newptr;
 575   }
 576   *dstptr = (uint8_t *)buffer;
 577   return result;
 578 }
 579
 580 UTF8PROC_API
 581 uint8_t *utf8proc_NFD(const uint8_t *str) {
 582   uint8_t *retval;
 583   utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 584     UTF8PROC_DECOMPOSE);
 585   return retval;
 586 }
 587
 588 UTF8PROC_API
 589 uint8_t *utf8proc_NFC(const uint8_t *str) {
 590   uint8_t *retval;
 591   utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 592     UTF8PROC_COMPOSE);
 593   return retval;
 594 }
 595
 596 UTF8PROC_API
 597 uint8_t *utf8proc_NFKD(const uint8_t *str) {
 598   uint8_t *retval;
 599   utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 600     UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
 601   return retval;
 602 }
 603
 604 UTF8PROC_API
 605 uint8_t *utf8proc_NFKC(const uint8_t *str) {
 606   uint8_t *retval;
 607   utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 608     UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
 609   return retval;
 610 }
 611