/*
 * utf8proc.c:  Wrappers for the utf8proc library
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */
#include <apr_fnmatch.h>

#include "private/svn_string_private.h"
#include "private/svn_utf_private.h"
#include "svn_private_config.h"

#define UTF8PROC_INLINE
/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
   while this function is already defined via apr.h */
#define strlen svn__strlen_var
#include "utf8proc/utf8proc.c"
/* Restore the real strlen(); code in this file calls it as a function. */
#undef strlen
42 svn_utf__utf8proc_compiled_version(void)
44 static const char utf8proc_version[] =
45 APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
46 APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
47 APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
48 return utf8proc_version;
52 svn_utf__utf8proc_runtime_version(void)
54 /* Unused static function warning removal hack. */
55 SVN_UNUSED(utf8proc_NFD);
56 SVN_UNUSED(utf8proc_NFC);
57 SVN_UNUSED(utf8proc_NFKD);
58 SVN_UNUSED(utf8proc_NFKC);
60 return utf8proc_version();
65 /* Fill the given BUFFER with decomposed UCS-4 representation of the
66 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
67 * is NUL-terminated; otherwise look only at the first LENGTH bytes in
68 * STRING. Upon return, BUFFER->data points at an array of UCS-4
69 * characters, and return the length of the array. TRANSFORM_FLAGS
70 * define exactly how the decomposition is performed.
72 * A negative return value is an utf8proc error code and may indicate
73 * that STRING contains invalid UTF-8 or was so long that an overflow
77 unicode_decomposition(int transform_flags,
78 const char *string, apr_size_t length,
81 const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
82 ? UTF8PROC_NULLTERM : 0);
86 apr_int32_t *const ucs4buf = buffer->data;
87 const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
88 const ssize_t result =
89 utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
90 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
91 | transform_flags | nullterm);
93 if (result < 0 || result <= ucs4len)
96 /* Increase the decomposition buffer size and retry */
97 svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
101 /* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
102 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
103 * NUL-terminated; otherwise look only at the first LENGTH bytes in
104 * STRING. Upon return, BUFFER->data points at an array of UCS-4
105 * characters and *RESULT_LENGTH contains the length of the array.
107 * A returned error may indicate that STRING contains invalid UTF-8 or
108 * invalid Unicode codepoints. Any error message comes from utf8proc.
111 decompose_normalized(apr_size_t *result_length,
112 const char *string, apr_size_t length,
113 svn_membuf_t *buffer)
115 ssize_t result = unicode_decomposition(0, string, length, buffer);
117 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
118 gettext(utf8proc_errmsg(result)));
119 *result_length = result;
123 /* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
124 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
125 * NUL-terminated; otherwise look only at the first LENGTH bytes in
126 * STRING. Upon return, BUFFER->data points at a NUL-terminated string
127 * of UTF-8 characters.
129 * A returned error may indicate that STRING contains invalid UTF-8 or
130 * invalid Unicode codepoints. Any error message comes from utf8proc.
133 normalize_cstring(apr_size_t *result_length,
134 const char *string, apr_size_t length,
135 svn_membuf_t *buffer)
137 ssize_t result = unicode_decomposition(0, string, length, buffer);
140 svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
141 result = utf8proc_reencode(buffer->data, result,
142 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
145 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
146 gettext(utf8proc_errmsg(result)));
147 *result_length = result;
151 /* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
152 * length LENB. Return 0 if they're equal, a negative value if BUFA is
153 * less than BUFB, otherwise a positive value.
155 * Yes, this is strcmp for known-length UCS-4 strings.
158 ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
159 const apr_int32_t *bufb, apr_size_t lenb)
161 const apr_size_t len = (lena < lenb ? lena : lenb);
164 for (i = 0; i < len; ++i)
166 const int diff = bufa[i] - bufb[i];
170 return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
174 svn_utf__normcmp(int *result,
175 const char *str1, apr_size_t len1,
176 const char *str2, apr_size_t len2,
177 svn_membuf_t *buf1, svn_membuf_t *buf2)
182 /* Shortcut-circuit the decision if at least one of the strings is empty. */
183 const svn_boolean_t empty1 =
184 (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
185 const svn_boolean_t empty2 =
186 (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
187 if (empty1 || empty2)
189 *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
193 SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
194 SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
195 *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
200 svn_utf__normalize(const char **result,
201 const char *str, apr_size_t len,
204 apr_size_t result_length;
205 SVN_ERR(normalize_cstring(&result_length, str, len, buf));
206 *result = (const char*)(buf->data);
210 /* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
211 * Assume BUFFER is already filled to *LENGTH and return the new size there.
212 * This function does *not* nul-terminate the stringbuf!
214 * A returned error indicates that the codepoint is invalid.
217 encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
221 if (buffer->size - *length < 4)
222 svn_membuf__resize(buffer, buffer->size + 4);
224 utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length));
226 return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
227 _("Invalid Unicode character U+%04lX"),
234 svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
235 const apr_int32_t *ucs4str,
237 apr_size_t *result_length)
241 SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
242 svn_membuf__resize(buffer, *result_length + 1);
243 ((char*)buffer->data)[*result_length] = '\0';
249 svn_utf__glob(svn_boolean_t *match,
250 const char *pattern, apr_size_t pattern_len,
251 const char *string, apr_size_t string_len,
252 const char *escape, apr_size_t escape_len,
253 svn_boolean_t sql_like,
254 svn_membuf_t *pattern_buf,
255 svn_membuf_t *string_buf,
256 svn_membuf_t *temp_buf)
258 apr_size_t patternbuf_len;
259 apr_size_t tempbuf_len;
261 /* If we're in GLOB mode, we don't do custom escape chars. */
262 if (escape && !sql_like)
263 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
264 _("Cannot use a custom escape token"
265 " in glob matching mode"));
267 /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
268 because apr_fnmatch can't handle it.*/
269 SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
271 SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
272 tempbuf_len, &patternbuf_len));
275 /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
276 const apr_int32_t *like = temp_buf->data;
278 svn_boolean_t escaped;
282 ucs4esc = -1; /* Definitely an invalid UCS-4 character. */
285 const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
286 ? UTF8PROC_NULLTERM : 0);
288 utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
289 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
291 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
292 gettext(utf8proc_errmsg(result)));
293 if (result == 0 || result > 1)
294 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
295 _("Escape token must be one character"));
296 if ((ucs4esc & 0xFF) != ucs4esc)
297 return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
298 _("Invalid escape character U+%04lX"),
303 svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
304 for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
306 if (*like == ucs4esc && !escaped)
308 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
309 ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
314 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
319 if ((*like == '[' || *like == '\\') && !escaped)
321 /* Escape brackets and backslashes which are always
322 literals in LIKE patterns. */
323 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
324 ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
330 /* Replace LIKE wildcards with their GLOB equivalents. */
331 if (*like == '%' || *like == '_')
333 const char wildcard = (*like == '%' ? '*' : '?');
334 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
335 ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
338 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
341 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
342 ((char*)pattern_buf->data)[patternbuf_len] = '\0';
345 /* Now normalize the string */
346 SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
347 SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
348 tempbuf_len, &tempbuf_len));
350 *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
355 svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
359 apr_size_t result_length;
360 const apr_size_t length = strlen(string);
361 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
362 err = normalize_cstring(&result_length, string, length, &buffer);
365 svn_error_clear(err);
368 return (length == result_length && 0 == strcmp(string, buffer.data));
372 svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
374 /* Hexadecimal digits for code conversion. */
375 static const char digits[] = "0123456789ABCDEF";
377 /* Flags used for Unicode decomposition. */
378 static const int decomp_flags = (
379 UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
380 | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
382 svn_stringbuf_t *result;
384 ssize_t decomp_length;
387 /* Decompose to a non-reversible compatibility format. */
388 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
389 decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
390 if (decomp_length < 0)
393 apr_size_t done, prev;
395 /* The only other error we can receive here indicates an integer
396 overflow due to the length of the input string. Not very
397 likely, but we certainly shouldn't continue in that case. */
398 SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
400 /* Break the decomposition into parts that are valid UTF-8, and
401 bytes that are not. Represent the invalid bytes in the target
402 erray by their negative value. This works because utf8proc
403 will not generate Unicode code points with values larger than
405 svn_membuf__create(&part, sizeof(apr_int32_t), pool);
408 while (done < length)
412 while (done < length)
414 len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
420 /* Decompose the valid part */
423 len = unicode_decomposition(
424 decomp_flags, src + prev, done - prev, &part);
425 SVN_ERR_ASSERT_NO_RETURN(len > 0);
427 &buffer, (decomp_length + len) * sizeof(apr_int32_t));
428 memcpy((apr_int32_t*)buffer.data + decomp_length,
429 part.data, len * sizeof(apr_int32_t));
430 decomp_length += len;
434 /* What follows could be a valid UTF-8 sequence, but not
435 a valid Unicode character. */
440 /* Determine the length of the UTF-8 sequence */
441 const char *const p = src + done;
442 len = utf8proc_utf8class[(uint8_t)*p];
444 /* Check if the multi-byte sequence is valid UTF-8. */
445 if (len > 1 && len <= (apr_ssize_t)(length - done))
446 last = svn_utf__last_valid(p, len);
450 /* Might not be a valid UTF-8 sequence at all */
451 if (!last || (last && last - p < len))
453 uc = -((apr_int32_t)(*p & 0xff));
460 /* Decode the UTF-8 sequence without validation. */
462 uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
465 uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
469 uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
470 + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
473 SVN_ERR_ASSERT_NO_RETURN(
474 !"Unexpected invalid UTF-8 byte");
480 &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
481 ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
488 /* Scan the result and deleting any combining diacriticals and
489 inserting placeholders where any non-ascii characters remain. */
490 result = svn_stringbuf_create_ensure(decomp_length, pool);
491 for (len = 0; len < decomp_length; ++len)
493 const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
494 if (cp > 0 && cp < 127)
495 svn_stringbuf_appendbyte(result, (char)cp);
497 svn_stringbuf_appendcstr(result, "\\0");
500 const apr_int32_t rcp = ((-cp) & 0xff);
501 svn_stringbuf_appendcstr(result, "?\\");
502 svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
503 svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
507 if (utf8proc_codepoint_valid(cp))
509 const utf8proc_property_t *prop = utf8proc_get_property(cp);
510 if (prop->combining_class != 0)
511 continue; /* Combining mark; ignore */
512 svn_stringbuf_appendcstr(result, "{U+");
515 svn_stringbuf_appendcstr(result, "{U?");
518 svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
519 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
521 svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
522 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
523 svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
524 svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
525 svn_stringbuf_appendbyte(result, '}');