]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - usr.bin/localedef/ctype.c
Make linux_ptrace() use linux_msg() instead of printf().
[FreeBSD/FreeBSD.git] / usr.bin / localedef / ctype.c
1 /*-
2  * Copyright 2018 Nexenta Systems, Inc.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 /*
33  * LC_CTYPE database generation routines for localedef.
34  */
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37
38 #include <sys/tree.h>
39
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <stddef.h>
43 #include <string.h>
44 #include <sys/types.h>
45 #include <wchar.h>
46 #include <ctype.h>
47 #include <wctype.h>
48 #include <unistd.h>
49 #include "localedef.h"
50 #include "parser.h"
51 #include "runefile.h"
52
53
/*
 * Needed for bootstrapping: provide _CTYPE_N ourselves when the headers
 * in use have not already defined it.
 */
#ifndef _CTYPE_N
#define _CTYPE_N       0x00400000L
#endif

/*
 * Local short names for the _CTYPE_* character class bits.  These are
 * the masks that add_ctype_impl() and dump_ctype() OR into a node's
 * ctype field.
 */
#define _ISUPPER        _CTYPE_U
#define _ISLOWER        _CTYPE_L
#define _ISDIGIT        _CTYPE_D
#define _ISXDIGIT       _CTYPE_X
#define _ISSPACE        _CTYPE_S
#define _ISBLANK        _CTYPE_B
#define _ISALPHA        _CTYPE_A
#define _ISPUNCT        _CTYPE_P
#define _ISGRAPH        _CTYPE_G
#define _ISPRINT        _CTYPE_R
#define _ISCNTRL        _CTYPE_C
/*
 * Extended classes as consumed by add_ctype_impl(): _E1 is applied for
 * T_ISPHONOGRAM, _E2 for T_ISIDEOGRAM, _E3 for T_ISENGLISH (no bit of
 * its own, hence 0), _E4 for T_ISNUMBER (and digits), _E5 for T_ISSPECIAL.
 */
#define _E1             _CTYPE_Q
#define _E2             _CTYPE_I
#define _E3             0
#define _E4             _CTYPE_N
#define _E5             _CTYPE_T
75
/*
 * Code point most recently recorded by add_ctype() or add_ctype_range();
 * add_ctype_range() starts its next range just after this value.
 */
static wchar_t          last_ctype;
/* Ordering callback for the ctypes tree; defined further down. */
static int ctype_compare(const void *n1, const void *n2);
78
/*
 * Per-character record collected while parsing; one node per wide
 * character, kept in the ctypes tree and keyed by wc.
 */
typedef struct ctype_node {
        wchar_t wc;             /* character code; the tree key */
        int32_t ctype;          /* class bits plus width bits (see add_width()) */
        int32_t toupper;        /* upper-case mapping; 0 means none was given */
        int32_t tolower;        /* lower-case mapping; 0 means none was given */
        RB_ENTRY(ctype_node) entry;     /* tree linkage */
} ctype_node_t;
86
/*
 * Tree of every ctype_node seen so far, ordered by ctype_compare().
 * RB_GENERATE_STATIC emits the file-local tree routines used via
 * RB_FIND, RB_INSERT and RB_FOREACH below.
 */
static RB_HEAD(ctypes, ctype_node) ctypes;
RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
89
90 static int
91 ctype_compare(const void *n1, const void *n2)
92 {
93         const ctype_node_t *c1 = n1;
94         const ctype_node_t *c2 = n2;
95
96         return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
97 }
98
/*
 * Reset the ctypes tree to the empty state.  Call before any of the
 * add_*() routines in this file are used.
 */
void
init_ctype(void)
{
        RB_INIT(&ctypes);
}
104
105
106 static void
107 add_ctype_impl(ctype_node_t *ctn)
108 {
109         switch (last_kw) {
110         case T_ISUPPER:
111                 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
112                 break;
113         case T_ISLOWER:
114                 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
115                 break;
116         case T_ISALPHA:
117                 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
118                 break;
119         case T_ISDIGIT:
120                 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
121                 break;
122         case T_ISSPACE:
123                 /*
124                  * This can be troublesome as <form-feed>, <newline>,
125                  * <carriage-return>, <tab>, and <vertical-tab> are defined both
126                  * as space and cntrl, and POSIX doesn't allow cntrl/print
127                  * combination.  We will take care of this in dump_ctype().
128                  */
129                 ctn->ctype |= (_ISSPACE | _ISPRINT);
130                 break;
131         case T_ISCNTRL:
132                 ctn->ctype |= _ISCNTRL;
133                 break;
134         case T_ISGRAPH:
135                 ctn->ctype |= (_ISGRAPH | _ISPRINT);
136                 break;
137         case T_ISPRINT:
138                 ctn->ctype |= _ISPRINT;
139                 break;
140         case T_ISPUNCT:
141                 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
142                 break;
143         case T_ISXDIGIT:
144                 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
145                 break;
146         case T_ISBLANK:
147                 ctn->ctype |= (_ISBLANK | _ISSPACE);
148                 break;
149         case T_ISPHONOGRAM:
150                 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
151                 break;
152         case T_ISIDEOGRAM:
153                 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
154                 break;
155         case T_ISENGLISH:
156                 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
157                 break;
158         case T_ISNUMBER:
159                 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
160                 break;
161         case T_ISSPECIAL:
162                 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
163                 break;
164         case T_ISALNUM:
165                 /*
166                  * We can't do anything with this.  The character
167                  * should already be specified as a digit or alpha.
168                  */
169                 break;
170         default:
171                 errf("not a valid character class");
172         }
173 }
174
175 static ctype_node_t *
176 get_ctype(wchar_t wc)
177 {
178         ctype_node_t    srch;
179         ctype_node_t    *ctn;
180
181         srch.wc = wc;
182         if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
183                 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
184                         errf("out of memory");
185                         return (NULL);
186                 }
187                 ctn->wc = wc;
188
189                 RB_INSERT(ctypes, &ctypes, ctn);
190         }
191         return (ctn);
192 }
193
194 void
195 add_ctype(int val)
196 {
197         ctype_node_t    *ctn;
198
199         if ((ctn = get_ctype(val)) == NULL) {
200                 INTERR;
201                 return;
202         }
203         add_ctype_impl(ctn);
204         last_ctype = ctn->wc;
205 }
206
207 void
208 add_ctype_range(wchar_t end)
209 {
210         ctype_node_t    *ctn;
211         wchar_t         cur;
212
213         if (end < last_ctype) {
214                 errf("malformed character range (%u ... %u))",
215                     last_ctype, end);
216                 return;
217         }
218         for (cur = last_ctype + 1; cur <= end; cur++) {
219                 if ((ctn = get_ctype(cur)) == NULL) {
220                         INTERR;
221                         return;
222                 }
223                 add_ctype_impl(ctn);
224         }
225         last_ctype = end;
226
227 }
228
229 /*
230  * A word about widths: if the width mask is specified, then libc
231  * unconditionally honors it.  Otherwise, it assumes printable
232  * characters have width 1, and non-printable characters have width
233  * -1 (except for NULL which is special with width 0).  Hence, we have
234  * no need to inject defaults here -- the "default" unset value of 0
235  * indicates that libc should use its own logic in wcwidth as described.
236  */
237 void
238 add_width(int wc, int width)
239 {
240         ctype_node_t    *ctn;
241
242         if ((ctn = get_ctype(wc)) == NULL) {
243                 INTERR;
244                 return;
245         }
246         ctn->ctype &= ~(_CTYPE_SWM);
247         switch (width) {
248         case 0:
249                 ctn->ctype |= _CTYPE_SW0;
250                 break;
251         case 1:
252                 ctn->ctype |= _CTYPE_SW1;
253                 break;
254         case 2:
255                 ctn->ctype |= _CTYPE_SW2;
256                 break;
257         case 3:
258                 ctn->ctype |= _CTYPE_SW3;
259                 break;
260         }
261 }
262
/*
 * Give every character from start through end (inclusive) the display
 * width `width`, one add_width() call per character.  Nothing happens
 * when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
        int     wc = start;

        while (wc <= end) {
                add_width(wc, width);
                wc++;
        }
}
270
271 void
272 add_caseconv(int val, int wc)
273 {
274         ctype_node_t    *ctn;
275
276         ctn = get_ctype(val);
277         if (ctn == NULL) {
278                 INTERR;
279                 return;
280         }
281
282         switch (last_kw) {
283         case T_TOUPPER:
284                 ctn->toupper = wc;
285                 break;
286         case T_TOLOWER:
287                 ctn->tolower = wc;
288                 break;
289         default:
290                 INTERR;
291                 break;
292         }
293 }
294
295 void
296 dump_ctype(void)
297 {
298         FILE            *f;
299         _FileRuneLocale rl;
300         ctype_node_t    *ctn, *last_ct, *last_lo, *last_up;
301         _FileRuneEntry  *ct = NULL;
302         _FileRuneEntry  *lo = NULL;
303         _FileRuneEntry  *up = NULL;
304         wchar_t         wc;
305         uint32_t        runetype_ext_nranges;
306         uint32_t        maplower_ext_nranges;
307         uint32_t        mapupper_ext_nranges;
308
309         (void) memset(&rl, 0, sizeof (rl));
310         runetype_ext_nranges = 0;
311         last_ct = NULL;
312         maplower_ext_nranges = 0;
313         last_lo = NULL;
314         mapupper_ext_nranges = 0;
315         last_up = NULL;
316
317         if ((f = open_category()) == NULL)
318                 return;
319
320         (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
321         (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
322
323         /*
324          * Initialize the identity map.
325          */
326         for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
327                 rl.maplower[wc] = htote(wc);
328                 rl.mapupper[wc] = htote(wc);
329         }
330
331         RB_FOREACH(ctn, ctypes, &ctypes) {
332                 int conflict = 0;
333
334                 wc = ctn->wc;
335
336                 /*
337                  * POSIX requires certain portable characters have
338                  * certain types.  Add them if they are missing.
339                  */
340                 if ((wc >= 1) && (wc <= 127)) {
341                         if ((wc >= 'A') && (wc <= 'Z'))
342                                 ctn->ctype |= _ISUPPER;
343                         if ((wc >= 'a') && (wc <= 'z'))
344                                 ctn->ctype |= _ISLOWER;
345                         if ((wc >= '0') && (wc <= '9'))
346                                 ctn->ctype |= _ISDIGIT;
347                         if (wc == ' ')
348                                 ctn->ctype |= _ISPRINT;
349                         if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
350                                 ctn->ctype |= _ISSPACE;
351                         if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
352                                 ctn->ctype |= _ISXDIGIT;
353                         if (strchr(" \t", (char)wc))
354                                 ctn->ctype |= _ISBLANK;
355
356                         /*
357                          * Technically these settings are only
358                          * required for the C locale.  However, it
359                          * turns out that because of the historical
360                          * version of isprint(), we need them for all
361                          * locales as well.  Note that these are not
362                          * necessarily valid punctation characters in
363                          * the current language, but ispunct() needs
364                          * to return TRUE for them.
365                          */
366                         if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
367                             (char)wc))
368                                 ctn->ctype |= _ISPUNCT;
369                 }
370
371                 /*
372                  * POSIX also requires that certain types imply
373                  * others.  Add any inferred types here.
374                  */
375                 if (ctn->ctype & (_ISUPPER |_ISLOWER))
376                         ctn->ctype |= _ISALPHA;
377                 if (ctn->ctype & _ISDIGIT)
378                         ctn->ctype |= _ISXDIGIT;
379                 if (ctn->ctype & _ISBLANK)
380                         ctn->ctype |= _ISSPACE;
381                 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
382                         ctn->ctype |= _ISGRAPH;
383                 if (ctn->ctype & _ISGRAPH)
384                         ctn->ctype |= _ISPRINT;
385
386                 /*
387                  * POSIX requires that certain combinations are invalid.
388                  * Try fixing the cases we know about (see add_ctype_impl()).
389                  */
390                 if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL))
391                         ctn->ctype &= ~_ISPRINT;
392
393                 /*
394                  * Finally, don't flag remaining cases as a fatal error,
395                  * and just warn about them.
396                  */
397                 if ((ctn->ctype & _ISALPHA) &&
398                     (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
399                         conflict++;
400                 if ((ctn->ctype & _ISPUNCT) &&
401                     (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
402                         conflict++;
403                 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
404                         conflict++;
405                 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
406                         conflict++;
407                 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
408                         conflict++;
409
410                 if (conflict) {
411                         warn("conflicting classes for character 0x%x (%x)",
412                             wc, ctn->ctype);
413                 }
414                 /*
415                  * Handle the lower 256 characters using the simple
416                  * optimization.  Note that if we have not defined the
417                  * upper/lower case, then we identity map it.
418                  */
419                 if ((unsigned)wc < _CACHED_RUNES) {
420                         rl.runetype[wc] = htote(ctn->ctype);
421                         if (ctn->tolower)
422                                 rl.maplower[wc] = htote(ctn->tolower);
423                         if (ctn->toupper)
424                                 rl.mapupper[wc] = htote(ctn->toupper);
425                         continue;
426                 }
427
428                 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
429                     (last_ct->wc + 1 == wc)) {
430                         ct[runetype_ext_nranges - 1].max = htote(wc);
431                 } else {
432                         runetype_ext_nranges++;
433                         ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges);
434                         ct[runetype_ext_nranges - 1].min = htote(wc);
435                         ct[runetype_ext_nranges - 1].max = htote(wc);
436                         ct[runetype_ext_nranges - 1].map =
437                             htote(ctn->ctype);
438                 }
439                 last_ct = ctn;
440                 if (ctn->tolower == 0) {
441                         last_lo = NULL;
442                 } else if ((last_lo != NULL) &&
443                     (last_lo->tolower + 1 == ctn->tolower)) {
444                         lo[maplower_ext_nranges - 1].max = htote(wc);
445                         last_lo = ctn;
446                 } else {
447                         maplower_ext_nranges++;
448                         lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges);
449                         lo[maplower_ext_nranges - 1].min = htote(wc);
450                         lo[maplower_ext_nranges - 1].max = htote(wc);
451                         lo[maplower_ext_nranges - 1].map =
452                             htote(ctn->tolower);
453                         last_lo = ctn;
454                 }
455
456                 if (ctn->toupper == 0) {
457                         last_up = NULL;
458                 } else if ((last_up != NULL) &&
459                     (last_up->toupper + 1 == ctn->toupper)) {
460                         up[mapupper_ext_nranges-1].max = htote(wc);
461                         last_up = ctn;
462                 } else {
463                         mapupper_ext_nranges++;
464                         up = realloc(up, sizeof (*up) * mapupper_ext_nranges);
465                         up[mapupper_ext_nranges - 1].min = htote(wc);
466                         up[mapupper_ext_nranges - 1].max = htote(wc);
467                         up[mapupper_ext_nranges - 1].map =
468                             htote(ctn->toupper);
469                         last_up = ctn;
470                 }
471         }
472
473         rl.runetype_ext_nranges = htote(runetype_ext_nranges);
474         rl.maplower_ext_nranges = htote(maplower_ext_nranges);
475         rl.mapupper_ext_nranges = htote(mapupper_ext_nranges);
476         if ((wr_category(&rl, sizeof (rl), f) < 0) ||
477             (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) ||
478             (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) ||
479             (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) {
480                 return;
481         }
482
483         close_category(f);
484 }