contrib/subversion/subversion/include/private/svn_utf_private.h

   1 /**
   2  * @copyright
   3  * ====================================================================
   4  *    Licensed to the Apache Software Foundation (ASF) under one
   5  *    or more contributor license agreements.  See the NOTICE file
   6  *    distributed with this work for additional information
   7  *    regarding copyright ownership.  The ASF licenses this file
   8  *    to you under the Apache License, Version 2.0 (the
   9  *    "License"); you may not use this file except in compliance
  10  *    with the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *    Unless required by applicable law or agreed to in writing,
  15  *    software distributed under the License is distributed on an
  16  *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  17  *    KIND, either express or implied.  See the License for the
  18  *    specific language governing permissions and limitations
  19  *    under the License.
  20  * ====================================================================
  21  * @endcopyright
  22  *
  23  * @file svn_utf_private.h
  24  * @brief UTF validation and normalization routines
  25  */
  26
  27 #ifndef SVN_UTF_PRIVATE_H
  28 #define SVN_UTF_PRIVATE_H
  29
  30 #include <apr.h>
  31 #include <apr_pools.h>
  32
  33 #include "svn_types.h"
  34 #include "svn_string.h"
  35 #include "svn_string_private.h"
  36
  37 #ifdef __cplusplus
  38 extern "C" {
  39 #endif /* __cplusplus */
  40
  41
  42 /* Return TRUE if the string SRC of length LEN is a valid UTF-8 encoding
  43  * according to the rules laid down by the Unicode 4.0 standard, FALSE
  44  * otherwise.  Faster than svn_utf__last_valid(), but only reports validity.
  45  */
  46 svn_boolean_t
  47 svn_utf__is_valid(const char *src, apr_size_t len);
  48
  49 /* As for svn_utf__is_valid(), but SRC is a NUL-terminated C string. */
  50 svn_boolean_t
  51 svn_utf__cstring_is_valid(const char *src);
  52
  53 /* Return a pointer to the first character after the last valid UTF-8
  54  * potentially multi-byte character in the string SRC of length LEN.
  55  * Validity of bytes from SRC to SRC+LEN-1, inclusive, is checked.
  56  * If SRC is valid UTF-8, the return value will point to the byte SRC+LEN,
  57  * otherwise it will point to the start of the first invalid character.
  58  * In either case all the characters between SRC and the return pointer - 1,
  59  * inclusive, are valid UTF-8.
  60  *
  61  * See also svn_utf__is_valid().
  62  */
  63 const char *
  64 svn_utf__last_valid(const char *src, apr_size_t len);
  65
  66 /* As for svn_utf__last_valid(), but uses a different implementation without
  67    lookup tables.  It avoids the table memory use (about 400 bytes) but the
  68    function is longer (about 200 bytes extra) and likely to be slower when
  69    the string is valid.  If the string is invalid this function may be
  70    faster since it returns immediately rather than continuing to the end of
  71    the string.  The main reason this function exists is to test the
  72    table-driven implementation.  */
  73 const char *
  74 svn_utf__last_valid2(const char *src, apr_size_t len);
  75
  76 /* Copy LENGTH bytes of SRC, converting characters as follows:
  77     - Pass characters from the ASCII subset to the result
  78     - Strip all combining marks from the string
  79     - Represent other valid Unicode chars as {U+XXXX}
  80     - Replace invalid Unicode chars with {U?XXXX}
  81     - Represent chars that are not valid UTF-8 as ?\XX
  82     - Replace codes outside the Unicode range with a sequence of ?\XX
  83     - Represent the NUL byte as \0
  84    Return the converted copy, allocated in POOL. */
  85 const char *
  86 svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool);
  87
  88 const char * /* NOTE(review): undocumented -- presumably a fuzzy-escaping wrapper around CONVERT_FROM_UTF8; confirm. */
  89 svn_utf__cstring_from_utf8_fuzzy(const char *src,
  90                                  apr_pool_t *pool,
  91                                  svn_error_t *(*convert_from_utf8)
  92                                               (const char **,
  93                                                const char *,
  94                                                apr_pool_t *));
  95
  96
  97 #if defined(WIN32)
  98 /* On Windows: Convert the UTF-8 string SRC to UTF-16.
  99    If PREFIX is not NULL, prepend it to the converted result.
 100    The result, if not empty, will be allocated in RESULT_POOL. */
 101 svn_error_t *
 102 svn_utf__win32_utf8_to_utf16(const WCHAR **result,
 103                              const char *src,
 104                              const WCHAR *prefix,
 105                              apr_pool_t *result_pool);
 106
 107 /* On Windows: Convert the UTF-16 string SRC to UTF-8.
 108    If PREFIX is not NULL, prepend it to the converted result.
 109    The result, if not empty, will be allocated in RESULT_POOL. */
 110 svn_error_t *
 111 svn_utf__win32_utf16_to_utf8(const char **result,
 112                              const WCHAR *src,
 113                              const char *prefix,
 114                              apr_pool_t *result_pool);
 115 #endif /* WIN32 */
 116
 117
 118 /* A constant used for many length parameters in the utf8proc wrappers
 119  * to indicate that the length of a string is unknown. */
 120 #define SVN_UTF__UNKNOWN_LENGTH ((apr_size_t) -1)
 121
 122
 123 /* Compare two UTF-8 strings, ignoring normalization, using buffers
 124  * BUF1 and BUF2 for temporary storage. If either of LEN1 or LEN2 is
 125  * SVN_UTF__UNKNOWN_LENGTH, assume the associated string is
 126  * null-terminated; otherwise, consider the string only up to the
 127  * given length.
 128  *
 129  * Return the comparison result in *RESULT.
 130  */
 131 svn_error_t *
 132 svn_utf__normcmp(int *result,
 133                  const char *str1, apr_size_t len1,
 134                  const char *str2, apr_size_t len2,
 135                  svn_membuf_t *buf1, svn_membuf_t *buf2);
 136
 137 /* Normalize the UTF-8 string STR to form C, using BUF for temporary
 138  * storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR is
 139  * null-terminated; otherwise, consider the string only up to the
 140  * given length.
 141  *
 142  * Return the normalized string in *RESULT, which shares storage with
 143  * BUF and is valid only until the next time BUF is modified.
 144  *
 145  * A returned error may indicate that STR contains invalid UTF-8 or
 146  * invalid Unicode codepoints.
 147  */
 148 svn_error_t*
 149 svn_utf__normalize(const char **result,
 150                    const char *str, apr_size_t len,
 151                    svn_membuf_t *buf);
 152
 153 /* Return TRUE if STRING is a valid, NFC-normalized UTF-8 string and
 154  * FALSE otherwise.  Note that a FALSE return value may also indicate
 155  * that STRING is not valid UTF-8 at all.
 156  *
 157  * Use SCRATCH_POOL for temporary allocations.
 158  */
 159 svn_boolean_t
 160 svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool);
 161
 162 /* Encode a UCS-4 string to UTF-8, placing the result into BUFFER.
 163  * While utf8proc does have a similar function, it does more checking
 164  * and processing than we want here; this function does not attempt
 165  * any normalizations but just encodes the individual code points.
 166  * The encoded string will always be NUL-terminated.
 167  *
 168  * Return the length of the result (excluding the NUL terminator) in
 169  * *RESULT_LENGTH.
 170  *
 171  * A returned error indicates that a codepoint is invalid.
 172  */
 173 svn_error_t *
 174 svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
 175                             const apr_int32_t *ucs4str,
 176                             apr_size_t length,
 177                             apr_size_t *result_length);
 178
 179 /* Pattern matching similar to the SQLite LIKE and GLOB
 180  * operators. PATTERN, STRING and ESCAPE must all point to UTF-8
 181  * strings. Furthermore, ESCAPE, if provided, must be a character from
 182  * the ASCII subset.
 183  *
 184  * If any of PATTERN_LEN, STRING_LEN or ESCAPE_LEN are
 185  * SVN_UTF__UNKNOWN_LENGTH, assume the associated string is
 186  * null-terminated; otherwise, consider the string only up to the
 187  * given length.
 188  *
 189  * Use buffers PATTERN_BUF, STRING_BUF and TEMP_BUF for temporary storage.
 190  *
 191  * If SQL_LIKE is true, interpret PATTERN as a pattern used by the SQL
 192  * LIKE operator and notice ESCAPE. Otherwise it's a Unix fileglob
 193  * pattern, and ESCAPE must be NULL.
 194  *
 195  * Set *MATCH to the result of the comparison.
 196  */
 197 svn_error_t *
 198 svn_utf__glob(svn_boolean_t *match,
 199               const char *pattern, apr_size_t pattern_len,
 200               const char *string, apr_size_t string_len,
 201               const char *escape, apr_size_t escape_len,
 202               svn_boolean_t sql_like,
 203               svn_membuf_t *pattern_buf,
 204               svn_membuf_t *string_buf,
 205               svn_membuf_t *temp_buf);
 206
 207 /* Return the compile-time version of the wrapped utf8proc library. */
 208 const char *
 209 svn_utf__utf8proc_compiled_version(void);
 210
 211 /* Return the run-time version of the wrapped utf8proc library. */
 212 const char *
 213 svn_utf__utf8proc_runtime_version(void);
 214
 215 /* Convert a UTF-16 (or UCS-2) string to UTF-8, returning the pointer
 216  * in *RESULT. If BIG_ENDIAN is set, then UTF16STR is big-endian;
 217  * otherwise, it's little-endian.
 218  *
 219  * If UTF16LEN is SVN_UTF__UNKNOWN_LENGTH, then UTF16STR must be
 220  * terminated with a zero; otherwise, it is the number of 16-bit codes
 221  * to convert, and the source string may contain NUL values.
 222  *
 223  * Allocate *RESULT in RESULT_POOL and use SCRATCH_POOL for
 224  * intermediate allocation.
 225  *
 226  * This function combines UTF-16 surrogate pairs into single code
 227  * points, but will leave single lead or trail surrogates unchanged.
 228  */
 229 svn_error_t *
 230 svn_utf__utf16_to_utf8(const svn_string_t **result,
 231                        const apr_uint16_t *utf16str,
 232                        apr_size_t utf16len,
 233                        svn_boolean_t big_endian,
 234                        apr_pool_t *result_pool,
 235                        apr_pool_t *scratch_pool);
 236
 237 /* Convert a UTF-32 string to UTF-8, returning the pointer in
 238  * *RESULT. If BIG_ENDIAN is set, then UTF32STR is big-endian;
 239  * otherwise, it's little-endian.
 240  *
 241  * If UTF32LEN is SVN_UTF__UNKNOWN_LENGTH, then UTF32STR must be
 242  * terminated with a zero; otherwise, it is the number of 32-bit codes
 243  * to convert, and the source string may contain NUL values.
 244  *
 245  * Allocate *RESULT in RESULT_POOL and use SCRATCH_POOL for
 246  * intermediate allocation.
 247  */
 248 svn_error_t *
 249 svn_utf__utf32_to_utf8(const svn_string_t **result,
 250                        const apr_int32_t *utf32str,
 251                        apr_size_t utf32len,
 252                        svn_boolean_t big_endian,
 253                        apr_pool_t *result_pool,
 254                        apr_pool_t *scratch_pool);
 255
 256
 257 #ifdef __cplusplus
 258 }
 259 #endif /* __cplusplus */
 260
 261 #endif /* SVN_UTF_PRIVATE_H */