contrib/subversion/subversion/libsvn_subr/utf8proc/utf8proc.h

   1 /*
   2  *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
   3  *
   4  *  Permission is hereby granted, free of charge, to any person obtaining a
   5  *  copy of this software and associated documentation files (the "Software"),
   6  *  to deal in the Software without restriction, including without limitation
   7  *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  *  and/or sell copies of the Software, and to permit persons to whom the
   9  *  Software is furnished to do so, subject to the following conditions:
  10  *
  11  *  The above copyright notice and this permission notice shall be included in
  12  *  all copies or substantial portions of the Software.
  13  *
  14  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19  *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20  *  DEALINGS IN THE SOFTWARE.
  21  */
  22
  23
  24 /*
  25  *  File name:    utf8proc.h
  26  *
  27  *  Description:
  28  *  Header file for libutf8proc, which is a mapping tool for UTF-8 strings
  29  *  with the following features:
  30  *  - decomposing and composing of strings
  31  *  - replacing compatibility characters with their equivalents
  32  *  - stripping of "default ignorable characters"
  33  *    like SOFT-HYPHEN or ZERO-WIDTH-SPACE
  34  *  - folding of certain characters for string comparison
  35  *    (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
  36  *    (see "LUMP" option)
  37  *  - optional rejection of strings containing non-assigned code points
  38  *  - stripping of control characters
  39  *  - stripping of character marks (accents, etc.)
  40  *  - transformation of LF, CRLF, CR and NEL to line-feed (LF)
  41  *    or to the unicode characters for paragraph separation (PS)
  42  *    or line separation (LS).
  43  *  - unicode case folding (for case insensitive string comparisons)
  44  *  - rejection of illegal UTF-8 data
  45  *    (i.e. UTF-8 encoded UTF-16 surrogates)
  46  *  - support for korean hangul characters
  47  *  Unicode Version 5.0.0 is supported.
  48  */
  49
  50
  51 #ifndef UTF8PROC_H
  52 #define UTF8PROC_H
  53
  54 /** @name API version
  55  *
  56  * The utf8proc API version MAJOR.MINOR.PATCH, following
  57  * semantic-versioning rules (http://semver.org) based on API
  58  * compatibility.
  59  *
  60  * This is also returned at runtime by @ref utf8proc_version; however, the
  61  * runtime version may append a string like "-dev" to the version number
  62  * for prerelease versions.
  63  *
  64  * @note The shared-library version number in the Makefile may be different,
  65  *       being based on ABI compatibility rather than API compatibility.
  66  */
  67 /** @{ */
  68 /** The MAJOR version number (increased when backwards API compatibility is broken). */
  69 #define UTF8PROC_VERSION_MAJOR 1
  70 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
  71 #define UTF8PROC_VERSION_MINOR 1
  72 /** The PATCH version (increased for fixes that do not change the API). */
  73 #define UTF8PROC_VERSION_PATCH 5
  74 /** @} */
  75
  76 /*
  77  * Define UTF8PROC_INLINE and include utf8proc.c to embed a static
  78  * version of utf8proc in your program or library without exporting
  79  * any of its symbols.
  80  */
  81 #ifdef UTF8PROC_INLINE
  82 #define UTF8PROC_API static /* inline build: every API function is declared with internal linkage */
  83 #undef  UTF8PROC_DATA_EXPORT /* inline build: suppresses the extern utf8proc_utf8class declaration further down */
  84 #define UTF8PROC_DATA static
  85 #else
  86 #define UTF8PROC_API /* regular build: expands to nothing, so no storage-class specifier is added */
  87 #define UTF8PROC_DATA_EXPORT /* regular build: enables the extern utf8proc_utf8class declaration further down */
  88 #define UTF8PROC_DATA
  89 #endif
  90
  91
  92 #include <stdlib.h>
  93 #include <sys/types.h>
  94 #ifdef _MSC_VER /* Microsoft compiler: pick real headers or hand-written fallbacks by compiler version */
  95 # if _MSC_VER >= 1900
  96 #   include <stdbool.h>
  97 #   include <stdint.h>
  98 # else /* older MSVC: declare by hand the fixed-width and boolean names this header uses */
  99     typedef signed char int8_t;
 100     typedef unsigned char uint8_t;
 101     typedef short int16_t;
 102     typedef unsigned short uint16_t;
 103     typedef int int32_t;
 104     typedef unsigned char bool;
 105     enum {false, true}; /* false == 0, true == 1 */
 106 # endif
 107 # ifdef _WIN64 /* on MSVC ssize_t is supplied as a macro, not a typedef */
 108 #   define ssize_t __int64
 109 # else
 110 #   define ssize_t int
 111 # endif
 112 #elif defined(HAVE_STDBOOL_H) && defined(HAVE_INTTYPES_H) /* presumably set by the build configuration -- TODO confirm */
 113 #include <stdbool.h>
 114 #include <inttypes.h>
 115 #else /* fallback: presumably apr.h supplies uint8_t and the other fixed-width types -- TODO confirm */
 116 #include <apr.h>
 117 typedef uint8_t bool;
 118 enum {false, true}; /* false == 0, true == 1 */
 119 #endif
 120 #include <limits.h>
 121
 122 #ifdef __cplusplus
 123 extern "C" {
 124 #endif
 125
 126 #ifndef SSIZE_MAX /* fallback for when the headers included above did not define it */
 127 #define SSIZE_MAX ((size_t)SIZE_MAX/2) /* half of SIZE_MAX; note that the expression has type size_t, not ssize_t */
 128 #endif
 129
 130 #define UTF8PROC_NULLTERM  (1<<0)
 131 #define UTF8PROC_STABLE    (1<<1)
 132 #define UTF8PROC_COMPAT    (1<<2)
 133 #define UTF8PROC_COMPOSE   (1<<3)
 134 #define UTF8PROC_DECOMPOSE (1<<4)
 135 #define UTF8PROC_IGNORE    (1<<5)
 136 #define UTF8PROC_REJECTNA  (1<<6)
 137 #define UTF8PROC_NLF2LS    (1<<7)
 138 #define UTF8PROC_NLF2PS    (1<<8)
 139 #define UTF8PROC_NLF2LF    (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
 140 #define UTF8PROC_STRIPCC   (1<<9)
 141 #define UTF8PROC_CASEFOLD  (1<<10)
 142 #define UTF8PROC_CHARBOUND (1<<11)
 143 #define UTF8PROC_LUMP      (1<<12)
 144 #define UTF8PROC_STRIPMARK (1<<13)
 145 /*
 146  *  Flags being regarded by several functions in the library:
 147  *  NULLTERM:  The given UTF-8 input is NULL terminated.
 148  *  STABLE:    Unicode Versioning Stability has to be respected.
 149  *  COMPAT:    Compatibility decomposition
 150  *             (i.e. formatting information is lost)
 151  *  COMPOSE:   Return a result with composed characters.
 152  *  DECOMPOSE: Return a result with decomposed characters.
 153  *  IGNORE:    Strip "default ignorable characters"
 154  *  REJECTNA:  Return an error, if the input contains unassigned
 155  *             code points.
 156  *  NLF2LS:    Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
 157  *             representing a line break, and should be converted to the
 158  *             unicode character for line separation (LS).
 159  *  NLF2PS:    Indicating that NLF-sequences are representing a paragraph
 160  *             break, and should be converted to the unicode character for
 161  *             paragraph separation (PS).
 162  *  NLF2LF:    Indicating that the meaning of NLF-sequences is unknown.
 163  *  STRIPCC:   Strips and/or converts control characters.
 164  *             NLF-sequences are transformed into space, except if one of
 165  *             the NLF2LS/PS/LF options is given.
 166  *             HorizontalTab (HT) and FormFeed (FF) are treated as a
 167  *             NLF-sequence in this case.
 168  *             All other control characters are simply removed.
 169  *  CASEFOLD:  Performs unicode case folding, to be able to do a
 170  *             case-insensitive string comparison.
 171  *  CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
 172  *             is representing a single grapheme cluster (see UAX#29).
 173  *  LUMP:      Lumps certain characters together
 174  *             (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
 175  *             (See lump.txt for details.)
 176  *             If NLF2LF is set, this includes a transformation of
 177  *             paragraph and line separators to ASCII line-feed (LF).
 178  *  STRIPMARK: Strips all character markings
 179  *             (non-spacing, spacing and enclosing) (i.e. accents)
 180  *             NOTE: this option works only with COMPOSE or DECOMPOSE
 181  */
 182
 183 #define UTF8PROC_ERROR_NOMEM -1
 184 #define UTF8PROC_ERROR_OVERFLOW -2
 185 #define UTF8PROC_ERROR_INVALIDUTF8 -3
 186 #define UTF8PROC_ERROR_NOTASSIGNED -4
 187 #define UTF8PROC_ERROR_INVALIDOPTS -5
 188 /*
 189  *  Error codes being returned by almost all functions:
 190  *  ERROR_NOMEM:       Memory could not be allocated.
 191  *  ERROR_OVERFLOW:    The given string is too long to be processed.
 192  *  ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
 193  *  ERROR_NOTASSIGNED: The REJECTNA flag was set,
 194  *                     and an unassigned code point was found.
 195  *  ERROR_INVALIDOPTS: Invalid options have been used.
 196  */
 197
 198 typedef int16_t utf8proc_propval_t; /* 16-bit integer type used for the enumerated property fields of utf8proc_property_t */
 199 typedef struct utf8proc_property_struct { /* per-code-point property record; utf8proc_get_property() returns a pointer to one */
 200   utf8proc_propval_t category; /* presumably one of the UTF8PROC_CATEGORY_* codes -- TODO confirm */
 201   utf8proc_propval_t combining_class;
 202   utf8proc_propval_t bidi_class; /* presumably one of the UTF8PROC_BIDI_CLASS_* codes -- TODO confirm */
 203   utf8proc_propval_t decomp_type; /* presumably one of the UTF8PROC_DECOMP_TYPE_* codes -- TODO confirm */
 204   const int32_t *decomp_mapping; /* pointer to read-only int32_t data; NOTE(review): length/termination convention is not visible in this header */
 205   unsigned bidi_mirrored:1; /* single-bit flag */
 206   int32_t uppercase_mapping; /* NOTE(review): looks like a code point; the "no mapping" value is not visible in this header */
 207   int32_t lowercase_mapping;
 208   int32_t titlecase_mapping;
 209   int32_t comb1st_index; /* NOTE(review): index semantics are defined by the implementation, not by this header */
 210   int32_t comb2nd_index;
 211   unsigned comp_exclusion:1; /* this and the next three members are single-bit flags */
 212   unsigned ignorable:1;
 213   unsigned control_boundary:1;
 214   unsigned extend:1;
 215   const int32_t *casefold_mapping; /* pointer to read-only int32_t data; NOTE(review): layout is not visible in this header */
 216 } utf8proc_property_t;
 217
 218 #define UTF8PROC_CATEGORY_LU  1 /* category codes 1..30; presumably the values stored in utf8proc_property_t.category -- TODO confirm */
 219 #define UTF8PROC_CATEGORY_LL  2
 220 #define UTF8PROC_CATEGORY_LT  3
 221 #define UTF8PROC_CATEGORY_LM  4
 222 #define UTF8PROC_CATEGORY_LO  5
 223 #define UTF8PROC_CATEGORY_MN  6
 224 #define UTF8PROC_CATEGORY_MC  7
 225 #define UTF8PROC_CATEGORY_ME  8
 226 #define UTF8PROC_CATEGORY_ND  9
 227 #define UTF8PROC_CATEGORY_NL 10
 228 #define UTF8PROC_CATEGORY_NO 11
 229 #define UTF8PROC_CATEGORY_PC 12
 230 #define UTF8PROC_CATEGORY_PD 13
 231 #define UTF8PROC_CATEGORY_PS 14
 232 #define UTF8PROC_CATEGORY_PE 15
 233 #define UTF8PROC_CATEGORY_PI 16
 234 #define UTF8PROC_CATEGORY_PF 17
 235 #define UTF8PROC_CATEGORY_PO 18
 236 #define UTF8PROC_CATEGORY_SM 19
 237 #define UTF8PROC_CATEGORY_SC 20
 238 #define UTF8PROC_CATEGORY_SK 21
 239 #define UTF8PROC_CATEGORY_SO 22
 240 #define UTF8PROC_CATEGORY_ZS 23
 241 #define UTF8PROC_CATEGORY_ZL 24
 242 #define UTF8PROC_CATEGORY_ZP 25
 243 #define UTF8PROC_CATEGORY_CC 26
 244 #define UTF8PROC_CATEGORY_CF 27
 245 #define UTF8PROC_CATEGORY_CS 28
 246 #define UTF8PROC_CATEGORY_CO 29
 247 #define UTF8PROC_CATEGORY_CN 30
 248 #define UTF8PROC_BIDI_CLASS_L    1 /* bidi class codes 1..19; presumably the values stored in utf8proc_property_t.bidi_class -- TODO confirm */
 249 #define UTF8PROC_BIDI_CLASS_LRE  2
 250 #define UTF8PROC_BIDI_CLASS_LRO  3
 251 #define UTF8PROC_BIDI_CLASS_R    4
 252 #define UTF8PROC_BIDI_CLASS_AL   5
 253 #define UTF8PROC_BIDI_CLASS_RLE  6
 254 #define UTF8PROC_BIDI_CLASS_RLO  7
 255 #define UTF8PROC_BIDI_CLASS_PDF  8
 256 #define UTF8PROC_BIDI_CLASS_EN   9
 257 #define UTF8PROC_BIDI_CLASS_ES  10
 258 #define UTF8PROC_BIDI_CLASS_ET  11
 259 #define UTF8PROC_BIDI_CLASS_AN  12
 260 #define UTF8PROC_BIDI_CLASS_CS  13
 261 #define UTF8PROC_BIDI_CLASS_NSM 14
 262 #define UTF8PROC_BIDI_CLASS_BN  15
 263 #define UTF8PROC_BIDI_CLASS_B   16
 264 #define UTF8PROC_BIDI_CLASS_S   17
 265 #define UTF8PROC_BIDI_CLASS_WS  18
 266 #define UTF8PROC_BIDI_CLASS_ON  19
 267 #define UTF8PROC_DECOMP_TYPE_FONT      1 /* decomposition type codes 1..16; presumably the values stored in utf8proc_property_t.decomp_type -- TODO confirm */
 268 #define UTF8PROC_DECOMP_TYPE_NOBREAK   2
 269 #define UTF8PROC_DECOMP_TYPE_INITIAL   3
 270 #define UTF8PROC_DECOMP_TYPE_MEDIAL    4
 271 #define UTF8PROC_DECOMP_TYPE_FINAL     5
 272 #define UTF8PROC_DECOMP_TYPE_ISOLATED  6
 273 #define UTF8PROC_DECOMP_TYPE_CIRCLE    7
 274 #define UTF8PROC_DECOMP_TYPE_SUPER     8
 275 #define UTF8PROC_DECOMP_TYPE_SUB       9
 276 #define UTF8PROC_DECOMP_TYPE_VERTICAL 10
 277 #define UTF8PROC_DECOMP_TYPE_WIDE     11
 278 #define UTF8PROC_DECOMP_TYPE_NARROW   12
 279 #define UTF8PROC_DECOMP_TYPE_SMALL    13
 280 #define UTF8PROC_DECOMP_TYPE_SQUARE   14
 281 #define UTF8PROC_DECOMP_TYPE_FRACTION 15
 282 #define UTF8PROC_DECOMP_TYPE_COMPAT   16
 283
 284 #ifdef UTF8PROC_DATA_EXPORT /* this header defines it only when UTF8PROC_INLINE is not set */
 285 extern const int8_t utf8proc_utf8class[256]; /* NOTE(review): 256 entries suggests a lookup indexed by byte value; the meaning of the entries is not visible in this header */
 286 #endif
 287
 288 UTF8PROC_API
 289 const char *utf8proc_version(void); /* runtime version string; per the API-version notes in this header it may carry a suffix such as "-dev" */
 290
 291 UTF8PROC_API
 292 const char *utf8proc_errmsg(ssize_t errcode);
 293 /*
 294  *  Returns a static error string for the given error code.
 295  */
 296
 297 UTF8PROC_API
 298 ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
 299 /*
 300  *  Reads a single char from the UTF-8 sequence being pointed to by 'str'.
 301  *  The maximum number of bytes read is 'strlen', unless 'strlen' is
 302  *  negative.
 303  *  If a valid unicode char could be read, it is stored in the variable
 304  *  being pointed to by 'dst', otherwise that variable will be set to -1.
 305  *  In case of success the number of bytes read is returned, otherwise a
 306  *  negative error code is returned.
 307  */
 308
 309 UTF8PROC_API
 310 bool utf8proc_codepoint_valid(int32_t uc);
 311 /*
 312  *  Returns 1, if the given unicode code-point is valid, otherwise 0.
 313  */
 314
 315 UTF8PROC_API
 316 ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
 317 /*
 318  *  Encodes the unicode char with the code point 'uc' as an UTF-8 string in
 319  *  the byte array being pointed to by 'dst'. This array has to be at least
 320  *  4 bytes long.
 321  *  In case of success the number of bytes written is returned,
 322  *  otherwise 0.
 323  *  This function does not check if 'uc' is a valid unicode code point.
 324  */
 325
 326 UTF8PROC_API
 327 const utf8proc_property_t *utf8proc_get_property(int32_t uc);
 328 /*
 329  *  Returns a pointer to a (constant) struct containing information about
 330  *  the unicode char with the given code point 'uc'.
 331  *  If the character does not exist, a pointer to a special struct is
 332  *  returned, where 'category' is 0 (it is an integer, not a pointer).
 333  *  WARNING: The parameter 'uc' has to be in the range of 0x0000 to
 334  *           0x10FFFF, otherwise the program might crash!
 335  */
 336
 337 UTF8PROC_API
 338 ssize_t utf8proc_decompose_char(
 339   int32_t uc, int32_t *dst, ssize_t bufsize,
 340   int options, int *last_boundclass
 341 );
 342 /*
 343  *  Writes a decomposition of the unicode char 'uc' into the array being
 344  *  pointed to by 'dst'.
 345  *  Following flags in the 'options' field are regarded:
 346  *  REJECTNA:  an unassigned unicode code point leads to an error
 347  *  IGNORE:    "default ignorable" chars are stripped
 348  *  CASEFOLD:  unicode casefolding is applied
 349  *  COMPAT:    replace certain characters with their
 350  *             compatibility decomposition
 351  *  CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
 352  *  LUMP:      lumps certain different characters together
 353  *  STRIPMARK: removes all character marks
 354  *  The pointer 'last_boundclass' has to point to an integer variable which
 355  *  is storing the last character boundary class, if the CHARBOUND option
 356  *  is used.
 357  *  In case of success the number of chars written is returned,
 358  *  in case of an error, a negative error code is returned.
 359  *  If the number of written chars would be bigger than 'bufsize',
 360  *  the buffer (up to 'bufsize') has unpredictable data, and the needed
 361  *  buffer size is returned.
 362  *  WARNING: The parameter 'uc' has to be in the range of 0x0000 to
 363  *           0x10FFFF, otherwise the program might crash!
 364  */
 365
 366 UTF8PROC_API
 367 ssize_t utf8proc_decompose(
 368   const uint8_t *str, ssize_t strlen,
 369   int32_t *buffer, ssize_t bufsize, int options
 370 );
 371 /*
 372  *  Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
 373  *  string, and orders the decomposed sequences correctly.
 374  *  If the NULLTERM flag in 'options' is set, processing will be stopped,
 375  *  when a NULL byte is encountered, otherwise 'strlen' bytes are processed.
 376  *  The result in form of unicode code points is written into the buffer
 377  *  being pointed to by 'buffer', having the length of 'bufsize' entries.
 378  *  In case of success the number of chars written is returned,
 379  *  in case of an error, a negative error code is returned.
 380  *  If the number of written chars would be bigger than 'bufsize',
 381  *  the buffer (up to 'bufsize') has unpredictable data, and the needed
 382  *  buffer size is returned.
 383  */
 384
 385 UTF8PROC_API
 386 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
 387 /*
 388  *  Reencodes the sequence of unicode characters given by the pointer
 389  *  'buffer' and 'length' as UTF-8.
 390  *  The result is stored in the same memory area where the data is read.
 391  *  Following flags in the 'options' field are regarded:
 392  *  NLF2LS:  converts LF, CRLF, CR and NEL into LS
 393  *  NLF2PS:  converts LF, CRLF, CR and NEL into PS
 394  *  NLF2LF:  converts LF, CRLF, CR and NEL into LF
 395  *  STRIPCC: strips or converts all non-affected control characters
 396  *  COMPOSE: tries to combine decomposed characters into composite
 397  *           characters
 398  *  STABLE:  prohibits combining characters which would violate
 399  *           the unicode versioning stability
 400  *  In case of success the length of the resulting UTF-8 string is
 401  *  returned, otherwise a negative error code is returned.
 402  *  WARNING: The amount of free space being pointed to by 'buffer', has to
 403  *           exceed the amount of the input data by one byte, and the
 404  *           entries of the array pointed to by 'buffer' have to be in the
 405  *           range of 0x0000 to 0x10FFFF, otherwise the program might
 406  *           crash!
 407  */
 408
 409 UTF8PROC_API
 410 ssize_t utf8proc_map(
 411   const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
 412 );
 413 /*
 414  *  Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
 415  *  string, which is allocated dynamically, and afterwards pointed to by
 416  *  the pointer being pointed to by 'dstptr'.
 417  *  If the NULLTERM flag in the 'options' field is set, the length is
 418  *  determined by a NULL terminator, otherwise the parameter 'strlen' is
 419  *  evaluated to determine the string length, but in any case the result
 420  *  will be NULL terminated (though it might contain NULL characters
 421  *  before). Other flags in the 'options' field are passed to the functions
 422  *  defined above, and regarded as described.
 423  *  In case of success the length of the new string is returned,
 424  *  otherwise a negative error code is returned.
 425  *  NOTICE: The memory of the new UTF-8 string will have been allocated with
 426  *          'malloc', and therefore has to be freed with 'free'.
 427  */
 428
 429 UTF8PROC_API
 430 uint8_t *utf8proc_NFD(const uint8_t *str);
 431 UTF8PROC_API
 432 uint8_t *utf8proc_NFC(const uint8_t *str);
 433 UTF8PROC_API
 434 uint8_t *utf8proc_NFKD(const uint8_t *str);
 435 UTF8PROC_API
 436 uint8_t *utf8proc_NFKC(const uint8_t *str);
 437 /*
 438  *  Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
 439  *  normalized version of the null-terminated string 'str'.
 440  */
 441
 442 #ifdef __cplusplus
 443 }
 444 #endif
 445
 446 #endif
 447