2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * LC_CTYPE database generation routines for localedef.
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
44 #include <sys/types.h>
49 #include "localedef.h"
54 /* Needed for bootstrapping, _CTYPE_N */
56 #define _CTYPE_N 0x00400000L
/*
 * Short _IS* aliases used throughout this file; each one expands to the
 * matching _CTYPE_* class mask (note _ISPRINT -> _CTYPE_R and
 * _ISBLANK -> _CTYPE_B).
 */
59 #define _ISUPPER _CTYPE_U
60 #define _ISLOWER _CTYPE_L
61 #define _ISDIGIT _CTYPE_D
62 #define _ISXDIGIT _CTYPE_X
63 #define _ISSPACE _CTYPE_S
64 #define _ISBLANK _CTYPE_B
65 #define _ISALPHA _CTYPE_A
66 #define _ISPUNCT _CTYPE_P
67 #define _ISGRAPH _CTYPE_G
68 #define _ISPRINT _CTYPE_R
69 #define _ISCNTRL _CTYPE_C
/*
 * Character of the most recently added ctype node (set from ctn->wc);
 * add_ctype_range() resumes at last_ctype + 1.
 */
76 static wchar_t last_ctype;
/* Declared ahead of its definition so RB_GENERATE_STATIC can reference it. */
77 static int ctype_compare(const void *n1, const void *n2);
79 typedef struct ctype_node {
84 RB_ENTRY(ctype_node) entry;
/*
 * Tree of all ctype_node entries, linked through their `entry` field and
 * ordered by ctype_compare(); searched and filled via RB_FIND/RB_INSERT
 * and walked with RB_FOREACH when the category is written out.
 */
87 static RB_HEAD(ctypes, ctype_node) ctypes;
88 RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
91 ctype_compare(const void *n1, const void *n2)
93 const ctype_node_t *c1 = n1;
94 const ctype_node_t *c2 = n2;
96 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
107 add_ctype_impl(ctype_node_t *ctn)
111 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
114 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
117 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
120 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
123 ctn->ctype |= _ISSPACE;
126 ctn->ctype |= _ISCNTRL;
129 ctn->ctype |= (_ISGRAPH | _ISPRINT);
132 ctn->ctype |= _ISPRINT;
135 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
138 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
141 ctn->ctype |= (_ISBLANK | _ISSPACE);
144 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
147 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
150 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
153 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
156 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
160 * We can't do anything with this. The character
161 * should already be specified as a digit or alpha.
165 errf("not a valid character class");
169 static ctype_node_t *
170 get_ctype(wchar_t wc)
176 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
177 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
178 errf("out of memory");
183 RB_INSERT(ctypes, &ctypes, ctn);
193 if ((ctn = get_ctype(val)) == NULL) {
198 last_ctype = ctn->wc;
202 add_ctype_range(wchar_t end)
207 if (end < last_ctype) {
208 errf("malformed character range (%u ... %u))",
212 for (cur = last_ctype + 1; cur <= end; cur++) {
213 if ((ctn = get_ctype(cur)) == NULL) {
224 * A word about widths: if the width mask is specified, then libc
225 * unconditionally honors it. Otherwise, it assumes printable
226 * characters have width 1, and non-printable characters have width
227 * -1 (except for NUL, which is special with width 0). Hence, we have
228 * no need to inject defaults here -- the "default" unset value of 0
229 * indicates that libc should use its own logic in wcwidth as described.
232 add_width(int wc, int width)
236 if ((ctn = get_ctype(wc)) == NULL) {
240 ctn->ctype &= ~(_CTYPE_SWM);
243 ctn->ctype |= _CTYPE_SW0;
246 ctn->ctype |= _CTYPE_SW1;
249 ctn->ctype |= _CTYPE_SW2;
252 ctn->ctype |= _CTYPE_SW3;
258 add_width_range(int start, int end, int width)
260 for (; start <= end; start++) {
261 add_width(start, width);
266 add_caseconv(int val, int wc)
270 ctn = get_ctype(val);
294 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
295 _FileRuneEntry *ct = NULL;
296 _FileRuneEntry *lo = NULL;
297 _FileRuneEntry *up = NULL;
300 (void) memset(&rl, 0, sizeof (rl));
305 if ((f = open_category()) == NULL)
308 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
309 (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
312 * Initialize the identity map.
314 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
315 rl.maplower[wc] = wc;
316 rl.mapupper[wc] = wc;
319 RB_FOREACH(ctn, ctypes, &ctypes) {
325 * POSIX requires certain portable characters have
326 * certain types. Add them if they are missing.
328 if ((wc >= 1) && (wc <= 127)) {
329 if ((wc >= 'A') && (wc <= 'Z'))
330 ctn->ctype |= _ISUPPER;
331 if ((wc >= 'a') && (wc <= 'z'))
332 ctn->ctype |= _ISLOWER;
333 if ((wc >= '0') && (wc <= '9'))
334 ctn->ctype |= _ISDIGIT;
336 ctn->ctype |= _ISPRINT;
337 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
338 ctn->ctype |= _ISSPACE;
339 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
340 ctn->ctype |= _ISXDIGIT;
341 if (strchr(" \t", (char)wc))
342 ctn->ctype |= _ISBLANK;
345 * Technically these settings are only
346 * required for the C locale. However, it
347 * turns out that because of the historical
348 * version of isprint(), we need them for all
349 * locales as well. Note that these are not
350 * necessarily valid punctuation characters in
351 * the current language, but ispunct() needs
352 * to return TRUE for them.
354 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
356 ctn->ctype |= _ISPUNCT;
360 * POSIX also requires that certain types imply
361 * others. Add any inferred types here.
363 if (ctn->ctype & (_ISUPPER |_ISLOWER))
364 ctn->ctype |= _ISALPHA;
365 if (ctn->ctype & _ISDIGIT)
366 ctn->ctype |= _ISXDIGIT;
367 if (ctn->ctype & _ISBLANK)
368 ctn->ctype |= _ISSPACE;
369 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
370 ctn->ctype |= _ISGRAPH;
371 if (ctn->ctype & _ISGRAPH)
372 ctn->ctype |= _ISPRINT;
375 * Finally, POSIX requires that certain combinations
376 * are invalid. We don't flag this as a fatal error,
377 * but we will warn about it.
379 if ((ctn->ctype & _ISALPHA) &&
380 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
382 if ((ctn->ctype & _ISPUNCT) &&
383 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
385 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
387 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
389 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
393 warn("conflicting classes for character 0x%x (%x)",
397 * Handle the lower 256 characters using the simple
398 * optimization. Note that if we have not defined the
399 * upper/lower case, then we identity map it.
401 if ((unsigned)wc < _CACHED_RUNES) {
402 rl.runetype[wc] = ctn->ctype;
404 rl.maplower[wc] = ctn->tolower;
406 rl.mapupper[wc] = ctn->toupper;
410 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
411 (last_ct->wc + 1 == wc)) {
412 ct[rl.runetype_ext_nranges-1].max = wc;
414 rl.runetype_ext_nranges++;
416 sizeof (*ct) * rl.runetype_ext_nranges);
417 ct[rl.runetype_ext_nranges - 1].min = wc;
418 ct[rl.runetype_ext_nranges - 1].max = wc;
419 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
422 if (ctn->tolower == 0) {
424 } else if ((last_lo != NULL) &&
425 (last_lo->tolower + 1 == ctn->tolower)) {
426 lo[rl.maplower_ext_nranges-1].max = wc;
429 rl.maplower_ext_nranges++;
431 sizeof (*lo) * rl.maplower_ext_nranges);
432 lo[rl.maplower_ext_nranges - 1].min = wc;
433 lo[rl.maplower_ext_nranges - 1].max = wc;
434 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
438 if (ctn->toupper == 0) {
440 } else if ((last_up != NULL) &&
441 (last_up->toupper + 1 == ctn->toupper)) {
442 up[rl.mapupper_ext_nranges-1].max = wc;
445 rl.mapupper_ext_nranges++;
447 sizeof (*up) * rl.mapupper_ext_nranges);
448 up[rl.mapupper_ext_nranges - 1].min = wc;
449 up[rl.mapupper_ext_nranges - 1].max = wc;
450 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
455 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
456 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
457 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
458 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {