]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/libkern/iconv_ucs.c
kernel: Add options for MAC_DEBUG and MAC_VERIEXEC_DEBUG
[FreeBSD/FreeBSD.git] / sys / libkern / iconv_ucs.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2003, 2005 Ryuichiro Imura
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/systm.h>
32 #include <sys/malloc.h>
33 #include <sys/iconv.h>
34
35 #include "iconv_converter_if.h"
36
37 /*
38  * "UCS" converter
39  */
40
/*
 * Conversion-type flags kept in iconv_ucs.convtype.
 */
#define KICONV_UCS_COMBINE	0x1	/* a second charset pair (cspf) was supplied */
#define KICONV_UCS_FROM_UTF8	0x2	/* source encoding is UTF-8 */
#define KICONV_UCS_TO_UTF8	0x4	/* destination encoding is UTF-8 */
#define KICONV_UCS_FROM_LE	0x8	/* source code units are little-endian */
#define KICONV_UCS_TO_LE	0x10	/* destination code units are little-endian */
#define KICONV_UCS_FROM_UTF16	0x20	/* source is UTF-16 (may carry surrogates) */
#define KICONV_UCS_TO_UTF16	0x40	/* destination is UTF-16 */
#define KICONV_UCS_UCS4		0x80	/* set when ENCODING_UNICODE is UTF-16BE */

#define ENCODING_UTF16	"UTF-16BE"
#define ENCODING_UTF8	"UTF-8"

/*
 * Encodings the converter handles itself, with the flags to set when the
 * encoding appears as the source (from_flag) or destination (to_flag).
 * The table is terminated by an entry whose name is NULL and is never
 * modified, hence const.
 */
static const struct {
	const char *name;
	int from_flag, to_flag;
} unicode_family[] = {
	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16 | KICONV_UCS_FROM_LE,
	    KICONV_UCS_TO_UTF16 | KICONV_UCS_TO_LE },
	{ NULL,		0,	0 }
};
64
65 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
66 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
67 static uint32_t encode_surrogate(uint32_t code);
68 static uint32_t decode_surrogate(const u_char *ucs);
69
70 #ifdef MODULE_DEPEND
71 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
72 #endif
73
74 /*
75  * UCS converter instance
76  */
77 struct iconv_ucs {
78         KOBJ_FIELDS;
79         int                     convtype;
80         struct iconv_cspair *   d_csp;
81         struct iconv_cspair *   d_cspf;
82         void *                  f_ctp;
83         void *                  t_ctp;
84         void *                  ctype;
85 };
86
87 static int
88 iconv_ucs_open(struct iconv_converter_class *dcp,
89         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
90 {
91         struct iconv_ucs *dp;
92         int i;
93         const char *from, *to;
94
95         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
96         to = csp->cp_to;
97         from = cspf ? cspf->cp_from : csp->cp_from;
98
99         dp->convtype = 0;
100
101         if (cspf)
102                 dp->convtype |= KICONV_UCS_COMBINE;
103         for (i = 0; unicode_family[i].name; i++) {
104                 if (strcasecmp(from, unicode_family[i].name) == 0)
105                         dp->convtype |= unicode_family[i].from_flag;
106                 if (strcasecmp(to, unicode_family[i].name) == 0)
107                         dp->convtype |= unicode_family[i].to_flag;
108         }
109         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
110                 dp->convtype |= KICONV_UCS_UCS4;
111         else
112                 dp->convtype &= ~KICONV_UCS_UCS4;
113
114         dp->f_ctp = dp->t_ctp = NULL;
115         if (dp->convtype & KICONV_UCS_COMBINE) {
116                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
117                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
118                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
119                 }
120                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
121                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
122                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
123                 }
124         }
125
126         dp->ctype = NULL;
127         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
128                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
129
130         dp->d_csp = csp;
131         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
132                 if (cspf) {
133                         dp->d_cspf = cspf;
134                         cspf->cp_refcount++;
135                 } else
136                         csp->cp_refcount++;
137         }
138         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
139                 csp->cp_refcount++;
140         *dpp = (void*)dp;
141         return 0;
142 }
143
144 static int
145 iconv_ucs_close(void *data)
146 {
147         struct iconv_ucs *dp = data;
148
149         if (dp->f_ctp)
150                 iconv_close(dp->f_ctp);
151         if (dp->t_ctp)
152                 iconv_close(dp->t_ctp);
153         if (dp->ctype)
154                 iconv_close(dp->ctype);
155         if (dp->d_cspf)
156                 dp->d_cspf->cp_refcount--;
157         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
158                 dp->d_csp->cp_refcount--;
159         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
160                 dp->d_csp->cp_refcount--;
161         kobj_delete((struct kobj*)data, M_ICONV);
162         return 0;
163 }
164
165 static int
166 iconv_ucs_conv(void *d2p, const char **inbuf,
167         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
168         int convchar, int casetype)
169 {
170         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
171         int ret = 0, i;
172         size_t in, on, ir, or, inlen, outlen, ucslen;
173         const char *src, *p;
174         char *dst;
175         u_char ucs[4], *q;
176         uint32_t code;
177
178         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
179                 return 0;
180         ir = in = *inbytesleft;
181         or = on = *outbytesleft;
182         src = *inbuf;
183         dst = *outbuf;
184
185         while (ir > 0 && or > 0) {
186                 /*
187                  * The first half of conversion.
188                  * (convert any code into ENCODING_UNICODE)
189                  */
190                 code = 0;
191                 p = src;
192                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
193                         /* convert UTF-8 to ENCODING_UNICODE */
194                         inlen = 0;
195                         code = utf8_to_ucs4(p, &inlen, ir);
196                         if (code == 0) {
197                                 ret = -1;
198                                 break;
199                         }
200
201                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
202                                 code = towlower(code, dp->ctype);
203                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
204                                 code = towupper(code, dp->ctype);
205                         }
206
207                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
208                                 /* reserved for utf-16 surrogate pair */
209                                 /* invalid unicode */
210                                 ret = -1;
211                                 break;
212                         }
213
214                         if (inlen == 4) {
215                                 if (dp->convtype & KICONV_UCS_UCS4) {
216                                         ucslen = 4;
217                                         code = encode_surrogate(code);
218                                 } else {
219                                         /* can't handle with ucs-2 */
220                                         ret = -1;
221                                         break;
222                                 }
223                         } else {
224                                 ucslen = 2;
225                         }
226
227                         /* save UCS-4 into ucs[] */
228                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
229                                 *q++ = (code >> (i << 3)) & 0xff;
230
231                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
232                         /* convert local code to ENCODING_UNICODE */
233                         ucslen = 4;
234                         inlen = ir;
235                         q = ucs;
236                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
237                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
238                         if (ret)
239                                 break;
240                         inlen = ir - inlen;
241                         ucslen = 4 - ucslen;
242
243                 } else {
244                         /* src code is a proper subset of ENCODING_UNICODE */
245                         q = ucs;
246                         if (dp->convtype & KICONV_UCS_FROM_LE) {
247                                 *q = *(p + 1);
248                                 *(q + 1) = *p;
249                                 p += 2;
250                         } else {
251                                 *q = *p++;
252                                 *(q + 1) = *p++;
253                         }
254                         if ((*q & 0xfc) == 0xd8) {
255                                 if (dp->convtype & KICONV_UCS_UCS4 &&
256                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
257                                         inlen = ucslen = 4;
258                                 } else {
259                                         /* invalid unicode */
260                                         ret = -1;
261                                         break;
262                                 }
263                         } else {
264                                 inlen = ucslen = 2;
265                         }
266                         if (ir < inlen) {
267                                 ret = -1;
268                                 break;
269                         }
270                         if (ucslen == 4) {
271                                 q += 2;
272                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
273                                         *q = *(p + 1);
274                                         *(q + 1) = *p;
275                                 } else {
276                                         *q = *p++;
277                                         *(q + 1) = *p;
278                                 }
279                                 if ((*q & 0xfc) != 0xdc) {
280                                         /* invalid unicode */
281                                         ret = -1;
282                                         break;
283                                 }
284                         }
285                 }
286
287                 /*
288                  * The second half of conversion.
289                  * (convert ENCODING_UNICODE into any code)
290                  */
291                 p = ucs;
292                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
293                         q = (u_char *)dst;
294                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
295                                 /* decode surrogate pair */
296                                 code = decode_surrogate(p);
297                         } else {
298                                 code = (ucs[0] << 8) | ucs[1];
299                         }
300
301                         if (casetype == KICONV_LOWER && dp->ctype) {
302                                 code = towlower(code, dp->ctype);
303                         } else if (casetype == KICONV_UPPER && dp->ctype) {
304                                 code = towupper(code, dp->ctype);
305                         }
306
307                         outlen = 0;
308                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
309                                 ret = -1;
310                                 break;
311                         }
312
313                         src += inlen;
314                         ir -= inlen;
315                         dst += outlen;
316                         or -= outlen;
317
318                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
319                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
320                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
321                         if (ret)
322                                 break;
323
324                         src += inlen;
325                         ir -= inlen;
326
327                 } else {
328                         /* dst code is a proper subset of ENCODING_UNICODE */
329                         if (or < ucslen) {
330                                 ret = -1;
331                                 break;
332                         }
333                         src += inlen;
334                         ir -= inlen;
335                         or -= ucslen;
336                         if (dp->convtype & KICONV_UCS_TO_LE) {
337                                 *dst++ = *(p + 1);
338                                 *dst++ = *p;
339                                 p += 2;
340                         } else {
341                                 *dst++ = *p++;
342                                 *dst++ = *p++;
343                         }
344                         if (ucslen == 4) {
345                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
346                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
347                                         ret = -1;
348                                         break;
349                                 }
350                                 if (dp->convtype & KICONV_UCS_TO_LE) {
351                                         *dst++ = *(p + 1);
352                                         *dst++ = *p;
353                                 } else {
354                                         *dst++ = *p++;
355                                         *dst++ = *p;
356                                 }
357                         }
358                 }
359
360                 if (convchar == 1)
361                         break;
362         }
363
364         *inbuf += in - ir;
365         *outbuf += on - or;
366         *inbytesleft -= in - ir;
367         *outbytesleft -= on - or;
368         return (ret);
369 }
370
371 static int
372 iconv_ucs_init(struct iconv_converter_class *dcp)
373 {
374         int error;
375
376         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
377         if (error)
378                 return (error);
379         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
380         if (error)
381                 return (error);
382         return (0);
383 }
384
/*
 * Converter class teardown hook.  Nothing to release; always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}
390
391 static const char *
392 iconv_ucs_name(struct iconv_converter_class *dcp)
393 {
394         return (ENCODING_UNICODE);
395 }
396
397 static kobj_method_t iconv_ucs_methods[] = {
398         KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
399         KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
400         KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
401         KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
402         KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
403         KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
404         {0, 0}
405 };
406
407 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
408
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes.
 *
 * src       - start of the sequence.
 * utf8width - receives the number of bytes consumed; written only on
 *             success.
 * srclen    - number of readable bytes at src.
 *
 * Returns the decoded code point, or 0 when the sequence is invalid,
 * truncated or overlong (encoded in more bytes than the value needs).
 * Because 0 is the error value, an encoded NUL is also reported as 0.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* work on unsigned bytes so the masks never see sign extension */
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w;
	uint32_t ucs4, min;

	if (srclen == 0)
		return (0);

	/*
	 * The lead byte fixes the sequence width, contributes the top bits
	 * and determines the smallest value that legitimately needs that
	 * width.
	 */
	if ((s[0] & 0x80) == 0) {
		/* 0xxxxxxx */
		w = 1;
		ucs4 = s[0] & 0x7f;
		min = 0;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/* 110xxxxx 10yyyyyy */
		w = 2;
		ucs4 = s[0] & 0x1f;
		min = 0x80;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/* 1110xxxx 10yyyyyy 10zzzzzz */
		w = 3;
		ucs4 = s[0] & 0x0f;
		min = 0x800;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/* 11110www 10xxxxxx 10yyyyyy 10zzzzzz */
		w = 4;
		ucs4 = s[0] & 0x07;
		min = 0x10000;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/* fold in the continuation bytes, six bits each */
	for (i = 1; i < w; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
	}

	/*
	 * Reject overlong forms: they alias shorter encodings, and a 4-byte
	 * one would hand the surrogate encoder a value below 0x10000.
	 */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
478
/*
 * Encode one code point as UTF-8.
 *
 * ucs4      - value to encode; values above 0x10ffff are rejected, matching
 *             the upper bound the UTF-8 decoding path enforces.
 * dst       - output buffer.
 * utf8width - receives the number of bytes written; written only on
 *             success.
 * dstlen    - space available at dst.
 *
 * Returns dst (as u_char *) on success, or NULL when the value is out of
 * range or dst is too small.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "11" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "111" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "1111" */
	} else {
		/* beyond the last Unicode code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8: fill continuation bytes from the end, then the
	 * lead byte with whatever bits remain
	 */
	p = (u_char *)dst;
	for (i = w - 1; i >= 1; i--) {
		/* get trailing 6 bits and put it with leading bit as "1" */
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
522
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair held
 * in one 32-bit value: high surrogate in the upper 16 bits, low surrogate
 * in the lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* upper ten bits of the offset, moved into bits 16..25 */
	const uint32_t high_bits = (offset << 6) & 0x3ff0000;
	/* lower ten bits of the offset */
	const uint32_t low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
529
/*
 * Rebuild a code point from a UTF-16 surrogate pair stored as four
 * big-endian bytes: ucs[0..1] is the high surrogate, ucs[2..3] the low one.
 * Only the ten payload bits of each half are used.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	const int high_payload = ((ucs[0] & 0x3) << 8) | ucs[1];
	const int low_payload = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((high_payload << 10) | low_payload) + 0x10000);
}