2 * utf_validate.c: Validate a UTF-8 string
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
21 * ====================================================================
24 /* Validate a UTF-8 string according to the rules in
26 * Table 3-6. Well-Formed UTF-8 Bytes Sequences
30 * The Unicode Standard, Version 4.0
32 * which is available at
34 * http://www.unicode.org/
36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
37 * is a subset of that enconding. The Unicode enconding prohibits things
38 * like non-shortest encodings (some characters can be represented by more
39 * than one multi-byte encoding) and the encodings for the surrogate code
40 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed
41 * rules as Unicode. This is the ABNF in RFC-3629 that describes
42 * well-formed UTF-8 rules:
44 * UTF8-octets = *( UTF8-char )
45 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
47 * UTF8-2 = %xC2-DF UTF8-tail
48 * UTF8-3 = %xE0 %xA0-BF UTF8-tail /
49 * %xE1-EC 2( UTF8-tail ) /
50 * %xED %x80-9F UTF8-tail /
51 * %xEE-EF 2( UTF8-tail )
52 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
53 * %xF1-F3 3( UTF8-tail ) /
54 * %xF4 %x80-8F 2( UTF8-tail )
59 #include "private/svn_utf_private.h"
60 #include "private/svn_eol_private.h"
61 #include "private/svn_dep_compat.h"
63 /* Lookup table to categorise each octet in the string. */
64 static const char octet_category[256] = {
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */
75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */
76 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
78 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */
79 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
81 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */
85 11, 11, 11, /* 0xf1-0xf3 */
87 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
94 #define FSM_80BF80BF 3
97 #define FSM_80BF80BF80BF 6
101 /* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
102 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if
103 there is any great benefit in combining categories? It would reduce the
104 memory footprint of the transition table by 16 bytes, but might it be
105 harder to understand? */
107 /* Machine transition table */
108 static const char machine [9][14] = {
110 {FSM_START, /* 0x00-0x7f */
111 FSM_ERROR, /* 0x80-0x8f */
112 FSM_ERROR, /* 0x90-0x9f */
113 FSM_ERROR, /* 0xa0-0xbf */
114 FSM_ERROR, /* 0xc0-0xc1 */
115 FSM_80BF, /* 0xc2-0xdf */
117 FSM_80BF80BF, /* 0xe1-0xec */
119 FSM_80BF80BF, /* 0xee-0xef */
121 FSM_80BF80BF80BF, /* 0xf1-0xf3 */
123 FSM_ERROR}, /* 0xf5-0xff */
126 {FSM_ERROR, /* 0x00-0x7f */
127 FSM_START, /* 0x80-0x8f */
128 FSM_START, /* 0x90-0x9f */
129 FSM_START, /* 0xa0-0xbf */
130 FSM_ERROR, /* 0xc0-0xc1 */
131 FSM_ERROR, /* 0xc2-0xdf */
132 FSM_ERROR, /* 0xe0 */
133 FSM_ERROR, /* 0xe1-0xec */
134 FSM_ERROR, /* 0xed */
135 FSM_ERROR, /* 0xee-0xef */
136 FSM_ERROR, /* 0xf0 */
137 FSM_ERROR, /* 0xf1-0xf3 */
138 FSM_ERROR, /* 0xf4 */
139 FSM_ERROR}, /* 0xf5-0xff */
142 {FSM_ERROR, /* 0x00-0x7f */
143 FSM_ERROR, /* 0x80-0x8f */
144 FSM_ERROR, /* 0x90-0x9f */
145 FSM_80BF, /* 0xa0-0xbf */
146 FSM_ERROR, /* 0xc0-0xc1 */
147 FSM_ERROR, /* 0xc2-0xdf */
148 FSM_ERROR, /* 0xe0 */
149 FSM_ERROR, /* 0xe1-0xec */
150 FSM_ERROR, /* 0xed */
151 FSM_ERROR, /* 0xee-0xef */
152 FSM_ERROR, /* 0xf0 */
153 FSM_ERROR, /* 0xf1-0xf3 */
154 FSM_ERROR, /* 0xf4 */
155 FSM_ERROR}, /* 0xf5-0xff */
158 {FSM_ERROR, /* 0x00-0x7f */
159 FSM_80BF, /* 0x80-0x8f */
160 FSM_80BF, /* 0x90-0x9f */
161 FSM_80BF, /* 0xa0-0xbf */
162 FSM_ERROR, /* 0xc0-0xc1 */
163 FSM_ERROR, /* 0xc2-0xdf */
164 FSM_ERROR, /* 0xe0 */
165 FSM_ERROR, /* 0xe1-0xec */
166 FSM_ERROR, /* 0xed */
167 FSM_ERROR, /* 0xee-0xef */
168 FSM_ERROR, /* 0xf0 */
169 FSM_ERROR, /* 0xf1-0xf3 */
170 FSM_ERROR, /* 0xf4 */
171 FSM_ERROR}, /* 0xf5-0xff */
174 {FSM_ERROR, /* 0x00-0x7f */
175 FSM_80BF, /* 0x80-0x8f */
176 FSM_80BF, /* 0x90-0x9f */
177 FSM_ERROR, /* 0xa0-0xbf */
178 FSM_ERROR, /* 0xc0-0xc1 */
179 FSM_ERROR, /* 0xc2-0xdf */
180 FSM_ERROR, /* 0xe0 */
181 FSM_ERROR, /* 0xe1-0xec */
182 FSM_ERROR, /* 0xed */
183 FSM_ERROR, /* 0xee-0xef */
184 FSM_ERROR, /* 0xf0 */
185 FSM_ERROR, /* 0xf1-0xf3 */
186 FSM_ERROR, /* 0xf4 */
187 FSM_ERROR}, /* 0xf5-0xff */
190 {FSM_ERROR, /* 0x00-0x7f */
191 FSM_ERROR, /* 0x80-0x8f */
192 FSM_80BF80BF, /* 0x90-0x9f */
193 FSM_80BF80BF, /* 0xa0-0xbf */
194 FSM_ERROR, /* 0xc0-0xc1 */
195 FSM_ERROR, /* 0xc2-0xdf */
196 FSM_ERROR, /* 0xe0 */
197 FSM_ERROR, /* 0xe1-0xec */
198 FSM_ERROR, /* 0xed */
199 FSM_ERROR, /* 0xee-0xef */
200 FSM_ERROR, /* 0xf0 */
201 FSM_ERROR, /* 0xf1-0xf3 */
202 FSM_ERROR, /* 0xf4 */
203 FSM_ERROR}, /* 0xf5-0xff */
205 /* FSM_80BF80BF80BF */
206 {FSM_ERROR, /* 0x00-0x7f */
207 FSM_80BF80BF, /* 0x80-0x8f */
208 FSM_80BF80BF, /* 0x90-0x9f */
209 FSM_80BF80BF, /* 0xa0-0xbf */
210 FSM_ERROR, /* 0xc0-0xc1 */
211 FSM_ERROR, /* 0xc2-0xdf */
212 FSM_ERROR, /* 0xe0 */
213 FSM_ERROR, /* 0xe1-0xec */
214 FSM_ERROR, /* 0xed */
215 FSM_ERROR, /* 0xee-0xef */
216 FSM_ERROR, /* 0xf0 */
217 FSM_ERROR, /* 0xf1-0xf3 */
218 FSM_ERROR, /* 0xf4 */
219 FSM_ERROR}, /* 0xf5-0xff */
222 {FSM_ERROR, /* 0x00-0x7f */
223 FSM_80BF80BF, /* 0x80-0x8f */
224 FSM_ERROR, /* 0x90-0x9f */
225 FSM_ERROR, /* 0xa0-0xbf */
226 FSM_ERROR, /* 0xc0-0xc1 */
227 FSM_ERROR, /* 0xc2-0xdf */
228 FSM_ERROR, /* 0xe0 */
229 FSM_ERROR, /* 0xe1-0xec */
230 FSM_ERROR, /* 0xed */
231 FSM_ERROR, /* 0xee-0xef */
232 FSM_ERROR, /* 0xf0 */
233 FSM_ERROR, /* 0xf1-0xf3 */
234 FSM_ERROR, /* 0xf4 */
235 FSM_ERROR}, /* 0xf5-0xff */
238 {FSM_ERROR, /* 0x00-0x7f */
239 FSM_ERROR, /* 0x80-0x8f */
240 FSM_ERROR, /* 0x90-0x9f */
241 FSM_ERROR, /* 0xa0-0xbf */
242 FSM_ERROR, /* 0xc0-0xc1 */
243 FSM_ERROR, /* 0xc2-0xdf */
244 FSM_ERROR, /* 0xe0 */
245 FSM_ERROR, /* 0xe1-0xec */
246 FSM_ERROR, /* 0xed */
247 FSM_ERROR, /* 0xee-0xef */
248 FSM_ERROR, /* 0xf0 */
249 FSM_ERROR, /* 0xf1-0xf3 */
250 FSM_ERROR, /* 0xf4 */
251 FSM_ERROR}, /* 0xf5-0xff */
254 /* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
255 * category 0 (FSM_START). Return the position of the first such char
256 * or DATA + MAX_LEN if all were cat 0.
259 first_non_fsm_start_char(const char *data, apr_size_t max_len)
261 #if SVN_UNALIGNED_ACCESS_IS_OK
263 /* Scan the input one machine word at a time. */
264 for (; max_len > sizeof(apr_uintptr_t)
265 ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
266 if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
271 /* The remaining odd bytes will be examined the naive way: */
272 for (; max_len > 0; ++data, --max_len)
273 if ((unsigned char)*data >= 0x80)
280 svn_utf__last_valid(const char *data, apr_size_t len)
282 const char *start = first_non_fsm_start_char(data, len);
283 const char *end = data + len;
284 int state = FSM_START;
289 unsigned char octet = *data++;
290 int category = octet_category[octet];
291 state = machine[state][category];
292 if (state == FSM_START)
299 svn_utf__cstring_is_valid(const char *data)
304 return svn_utf__is_valid(data, strlen(data));
308 svn_utf__is_valid(const char *data, apr_size_t len)
310 const char *end = data + len;
311 int state = FSM_START;
316 data = first_non_fsm_start_char(data, len);
320 unsigned char octet = *data++;
321 int category = octet_category[octet];
322 state = machine[state][category];
324 return state == FSM_START;
328 svn_utf__last_valid2(const char *data, apr_size_t len)
330 const char *start = first_non_fsm_start_char(data, len);
331 const char *end = data + len;
332 int state = FSM_START;
337 unsigned char octet = *data++;
343 else if (octet <= 0xC1)
345 else if (octet <= 0xDF)
347 else if (octet == 0xE0)
349 else if (octet <= 0xEC)
350 state = FSM_80BF80BF;
351 else if (octet == 0xED)
353 else if (octet <= 0xEF)
354 state = FSM_80BF80BF;
355 else if (octet == 0xF0)
357 else if (octet <= 0xF3)
358 state = FSM_80BF80BF80BF;
359 else if (octet <= 0xF4)
365 if (octet >= 0x80 && octet <= 0xBF)
371 if (octet >= 0xA0 && octet <= 0xBF)
377 if (octet >= 0x80 && octet <= 0xBF)
383 if (octet >= 0x80 && octet <= 0x9F)
389 if (octet >= 0x90 && octet <= 0xBF)
390 state = FSM_80BF80BF;
394 case FSM_80BF80BF80BF:
395 if (octet >= 0x80 && octet <= 0xBF)
396 state = FSM_80BF80BF;
401 if (octet >= 0x80 && octet <= 0x8F)
402 state = FSM_80BF80BF;
410 if (state == FSM_START)