]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/libkern/iconv_ucs.c
sys/{x86,amd64}: remove one of doubled ;s
[FreeBSD/FreeBSD.git] / sys / libkern / iconv_ucs.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2003, 2005 Ryuichiro Imura
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/iconv.h>
37
38 #include "iconv_converter_if.h"
39
40 /*
41  * "UCS" converter
42  */
43
/*
 * Bits kept in iconv_ucs.convtype.  They are assembled by iconv_ucs_open()
 * from the names of the source and destination encodings and are consulted
 * by iconv_ucs_conv() for every converted character.
 */
#define KICONV_UCS_COMBINE      0x1     /* a second charset pair (cspf) was given */
#define KICONV_UCS_FROM_UTF8    0x2     /* source is "UTF-8" */
#define KICONV_UCS_TO_UTF8      0x4     /* destination is "UTF-8" */
#define KICONV_UCS_FROM_LE      0x8     /* source 16-bit units are little-endian */
#define KICONV_UCS_TO_LE        0x10    /* destination 16-bit units are little-endian */
#define KICONV_UCS_FROM_UTF16   0x20    /* source is UTF-16 (surrogate pairs allowed) */
#define KICONV_UCS_TO_UTF16     0x40    /* destination is UTF-16 (surrogate pairs allowed) */
#define KICONV_UCS_UCS4         0x80    /* ENCODING_UNICODE equals ENCODING_UTF16 */

/* Encoding names compared against ENCODING_UNICODE or passed to iconv_open()/iconv_add(). */
#define ENCODING_UTF16  "UTF-16BE"
#define ENCODING_UTF8   "UTF-8"
55
56 static struct {
57         const char *name;
58         int from_flag, to_flag;
59 } unicode_family[] = {
60         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
61         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
62         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
63         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
64             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
65         { NULL,         0,      0 }
66 };
67
/*
 * File-local helpers defined at the end of this file:
 *  utf8_to_ucs4/ucs4_to_utf8 - decode/encode a single UTF-8 sequence
 *  encode_surrogate/decode_surrogate - pack/unpack a UTF-16 surrogate pair
 */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);
72
/*
 * Declare that iconv_ucs depends on libiconv, but only when the
 * MODULE_DEPEND macro is available.
 * NOTE(review): the three numbers are presumably the minimum, preferred
 * and maximum libiconv versions -- confirm against MODULE_DEPEND(9).
 */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
76
/*
 * UCS converter instance
 *
 * One of these is created by iconv_ucs_open() through kobj_create() and
 * released by iconv_ucs_close() through kobj_delete().
 */
struct iconv_ucs {
        KOBJ_FIELDS;
        /* KICONV_UCS_* bits describing the conversion */
        int                     convtype;
        /* the csp argument of iconv_ucs_open() */
        struct iconv_cspair *   d_csp;
        /* the cspf argument, recorded only when its refcount was taken */
        struct iconv_cspair *   d_cspf;
        /* handle converting the source encoding into ENCODING_UNICODE */
        void *                  f_ctp;
        /* handle converting ENCODING_UNICODE into the destination */
        void *                  t_ctp;
        /* KICONV_WCTYPE_NAME handle passed to towlower()/towupper() */
        void *                  ctype;
};
89
90 static int
91 iconv_ucs_open(struct iconv_converter_class *dcp,
92         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
93 {
94         struct iconv_ucs *dp;
95         int i;
96         const char *from, *to;
97
98         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
99         to = csp->cp_to;
100         from = cspf ? cspf->cp_from : csp->cp_from;
101
102         dp->convtype = 0;
103
104         if (cspf)
105                 dp->convtype |= KICONV_UCS_COMBINE;
106         for (i = 0; unicode_family[i].name; i++) {
107                 if (strcasecmp(from, unicode_family[i].name) == 0)
108                         dp->convtype |= unicode_family[i].from_flag;
109                 if (strcasecmp(to, unicode_family[i].name) == 0)
110                         dp->convtype |= unicode_family[i].to_flag;
111         }
112         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
113                 dp->convtype |= KICONV_UCS_UCS4;
114         else
115                 dp->convtype &= ~KICONV_UCS_UCS4;
116
117         dp->f_ctp = dp->t_ctp = NULL;
118         if (dp->convtype & KICONV_UCS_COMBINE) {
119                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
120                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
121                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
122                 }
123                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
124                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
125                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
126                 }
127         }
128
129         dp->ctype = NULL;
130         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
131                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
132
133         dp->d_csp = csp;
134         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
135                 if (cspf) {
136                         dp->d_cspf = cspf;
137                         cspf->cp_refcount++;
138                 } else
139                         csp->cp_refcount++;
140         }
141         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
142                 csp->cp_refcount++;
143         *dpp = (void*)dp;
144         return 0;
145 }
146
147 static int
148 iconv_ucs_close(void *data)
149 {
150         struct iconv_ucs *dp = data;
151
152         if (dp->f_ctp)
153                 iconv_close(dp->f_ctp);
154         if (dp->t_ctp)
155                 iconv_close(dp->t_ctp);
156         if (dp->ctype)
157                 iconv_close(dp->ctype);
158         if (dp->d_cspf)
159                 dp->d_cspf->cp_refcount--;
160         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
161                 dp->d_csp->cp_refcount--;
162         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
163                 dp->d_csp->cp_refcount--;
164         kobj_delete((struct kobj*)data, M_ICONV);
165         return 0;
166 }
167
168 static int
169 iconv_ucs_conv(void *d2p, const char **inbuf,
170         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
171         int convchar, int casetype)
172 {
173         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
174         int ret = 0, i;
175         size_t in, on, ir, or, inlen, outlen, ucslen;
176         const char *src, *p;
177         char *dst;
178         u_char ucs[4], *q;
179         uint32_t code;
180
181         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
182                 return 0;
183         ir = in = *inbytesleft;
184         or = on = *outbytesleft;
185         src = *inbuf;
186         dst = *outbuf;
187
188         while (ir > 0 && or > 0) {
189
190                 /*
191                  * The first half of conversion.
192                  * (convert any code into ENCODING_UNICODE)
193                  */
194                 code = 0;
195                 p = src;
196                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
197                         /* convert UTF-8 to ENCODING_UNICODE */
198                         inlen = 0;
199                         code = utf8_to_ucs4(p, &inlen, ir);
200                         if (code == 0) {
201                                 ret = -1;
202                                 break;
203                         }
204
205                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
206                                 code = towlower(code, dp->ctype);
207                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
208                                 code = towupper(code, dp->ctype);
209                         }
210
211                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
212                                 /* reserved for utf-16 surrogate pair */
213                                 /* invalid unicode */
214                                 ret = -1;
215                                 break;
216                         }
217
218                         if (inlen == 4) {
219                                 if (dp->convtype & KICONV_UCS_UCS4) {
220                                         ucslen = 4;
221                                         code = encode_surrogate(code);
222                                 } else {
223                                         /* can't handle with ucs-2 */
224                                         ret = -1;
225                                         break;
226                                 }
227                         } else {
228                                 ucslen = 2;
229                         }
230
231                         /* save UCS-4 into ucs[] */
232                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
233                                 *q++ = (code >> (i << 3)) & 0xff;
234
235                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
236                         /* convert local code to ENCODING_UNICODE */
237                         ucslen = 4;
238                         inlen = ir;
239                         q = ucs;
240                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
241                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
242                         if (ret)
243                                 break;
244                         inlen = ir - inlen;
245                         ucslen = 4 - ucslen;
246
247                 } else {
248                         /* src code is a proper subset of ENCODING_UNICODE */
249                         q = ucs;
250                         if (dp->convtype & KICONV_UCS_FROM_LE) {
251                                 *q = *(p + 1);
252                                 *(q + 1) = *p;
253                                 p += 2;
254                         } else {
255                                 *q = *p++;
256                                 *(q + 1) = *p++;
257                         }
258                         if ((*q & 0xfc) == 0xd8) {
259                                 if (dp->convtype & KICONV_UCS_UCS4 &&
260                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
261                                         inlen = ucslen = 4;
262                                 } else {
263                                         /* invalid unicode */
264                                         ret = -1;
265                                         break;
266                                 }
267                         } else {
268                                 inlen = ucslen = 2;
269                         }
270                         if (ir < inlen) {
271                                 ret = -1;
272                                 break;
273                         }
274                         if (ucslen == 4) {
275                                 q += 2;
276                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
277                                         *q = *(p + 1);
278                                         *(q + 1) = *p;
279                                 } else {
280                                         *q = *p++;
281                                         *(q + 1) = *p;
282                                 }
283                                 if ((*q & 0xfc) != 0xdc) {
284                                         /* invalid unicode */
285                                         ret = -1;
286                                         break;
287                                 }
288                         }
289                 }
290
291                 /*
292                  * The second half of conversion.
293                  * (convert ENCODING_UNICODE into any code)
294                  */
295                 p = ucs;
296                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
297                         q = (u_char *)dst;
298                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
299                                 /* decode surrogate pair */
300                                 code = decode_surrogate(p);
301                         } else {
302                                 code = (ucs[0] << 8) | ucs[1];
303                         }
304
305                         if (casetype == KICONV_LOWER && dp->ctype) {
306                                 code = towlower(code, dp->ctype);
307                         } else if (casetype == KICONV_UPPER && dp->ctype) {
308                                 code = towupper(code, dp->ctype);
309                         }
310
311                         outlen = 0;
312                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
313                                 ret = -1;
314                                 break;
315                         }
316
317                         src += inlen;
318                         ir -= inlen;
319                         dst += outlen;
320                         or -= outlen;
321
322                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
323                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
324                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
325                         if (ret)
326                                 break;
327
328                         src += inlen;
329                         ir -= inlen;
330
331                 } else {
332                         /* dst code is a proper subset of ENCODING_UNICODE */
333                         if (or < ucslen) {
334                                 ret = -1;
335                                 break;
336                         }
337                         src += inlen;
338                         ir -= inlen;
339                         or -= ucslen;
340                         if (dp->convtype & KICONV_UCS_TO_LE) {
341                                 *dst++ = *(p + 1);
342                                 *dst++ = *p;
343                                 p += 2;
344                         } else {
345                                 *dst++ = *p++;
346                                 *dst++ = *p++;
347                         }
348                         if (ucslen == 4) {
349                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
350                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
351                                         ret = -1;
352                                         break;
353                                 }
354                                 if (dp->convtype & KICONV_UCS_TO_LE) {
355                                         *dst++ = *(p + 1);
356                                         *dst++ = *p;
357                                 } else {
358                                         *dst++ = *p++;
359                                         *dst++ = *p;
360                                 }
361                         }
362                 }
363
364                 if (convchar == 1)
365                         break;
366         }
367
368         *inbuf += in - ir;
369         *outbuf += on - or;
370         *inbytesleft -= in - ir;
371         *outbytesleft -= on - or;
372         return (ret);
373 }
374
375 static int
376 iconv_ucs_init(struct iconv_converter_class *dcp)
377 {
378         int error;
379
380         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
381         if (error)
382                 return (error);
383         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
384         if (error)
385                 return (error);
386         return (0);
387 }
388
/*
 * Converter-class "done" method.  There is nothing to undo here, so the
 * class argument is ignored and 0 is returned.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
        (void)dcp;
        return (0);
}
394
395 static const char *
396 iconv_ucs_name(struct iconv_converter_class *dcp)
397 {
398         return (ENCODING_UNICODE);
399 }
400
/*
 * Method table binding the iconv_converter interface to the
 * implementations in this file; the all-zero entry ends the table.
 */
static kobj_method_t iconv_ucs_methods[] = {
        KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
        KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
        KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
        KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
        KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
        KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
        {0, 0}
};

/*
 * Define the "ucs" converter class with instances of
 * sizeof(struct iconv_ucs) bytes.
 * NOTE(review): the macro presumably picks up iconv_ucs_methods by
 * name -- confirm against its definition in <sys/iconv.h>.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
412
/*
 * Decode the UTF-8 sequence that starts at src.
 *
 * src       - input bytes
 * utf8width - on success, receives the length of the sequence (1..4)
 * srclen    - number of bytes available at src
 *
 * Returns the decoded value.  0 is returned, with *utf8width left
 * untouched, when the input is empty, truncated, has an invalid lead or
 * continuation byte, or is an overlong (non-shortest) encoding.  A NUL
 * byte also decodes to 0, so callers cannot tell it from an error.
 * Surrogates and values above 0x10ffff are not filtered here; that is
 * left to the caller.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
        /* Work on unsigned bytes so that masking never sees sign bits. */
        const unsigned char *s = (const unsigned char *)src;
        size_t i, w;
        uint32_t ucs4, min;

        /* nothing may be read from an empty buffer */
        if (srclen == 0)
                return (0);

        /*
         * The lead byte gives the sequence width, the first payload bits
         * and the smallest value that legitimately needs that width.
         */
        if ((s[0] & 0x80) == 0) {
                /*
                 *  utf-8: 0xxxxxxx
                 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
                 */
                w = 1;
                ucs4 = s[0] & 0x7f;
                min = 0;
        } else if ((s[0] & 0xe0) == 0xc0) {
                /*
                 *  utf-8: 110xxxxx 10yyyyyy
                 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
                 */
                w = 2;
                ucs4 = s[0] & 0x1f;
                min = 0x80;
        } else if ((s[0] & 0xf0) == 0xe0) {
                /*
                 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
                 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
                 */
                w = 3;
                ucs4 = s[0] & 0x0f;
                min = 0x800;
        } else if ((s[0] & 0xf8) == 0xf0) {
                /*
                 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
                 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
                 */
                w = 4;
                ucs4 = s[0] & 0x07;
                min = 0x10000;
        } else {
                /* out of utf-16 range or having illegal bits */
                return (0);
        }

        if (srclen < w)
                return (0);

        /*
         * Fold in the continuation bytes, six payload bits each.
         */
        for (i = 1 ; i < w ; i++) {
                if ((s[i] & 0xc0) != 0x80) {
                        /* invalid: leading 2 bits are not "10" */
                        return (0);
                }
                ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
        }

        /*
         * Reject overlong forms: a value that fits a shorter sequence
         * must not be accepted in a longer one, otherwise characters
         * such as '/' could be smuggled past byte-level checks.
         */
        if (ucs4 < min)
                return (0);

        *utf8width = w;
        return (ucs4);
}
482
/*
 * Encode one code point as UTF-8.
 *
 * ucs4      - value to encode
 * dst       - output buffer
 * utf8width - on success, receives the number of bytes written (1..4)
 * dstlen    - space available at dst
 *
 * Returns dst (as u_char *) on success.  Returns NULL, writing nothing,
 * when ucs4 is above 0x10ffff (the largest Unicode code point) or when
 * dstlen is too small for the sequence.  Values in the surrogate range
 * are encoded as given.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
        u_char lead, *p;
        size_t i, w;

        /*
         * determine utf-8 width and the marker bits of the lead byte
         */
        if (ucs4 < 0x80) {
                w = 1;
                lead = 0;       /* 0xxxxxxx */
        } else if (ucs4 < 0x800) {
                w = 2;
                lead = 0xc0;    /* 110xxxxx */
        } else if (ucs4 < 0x10000) {
                w = 3;
                lead = 0xe0;    /* 1110xxxx */
        } else if (ucs4 < 0x110000) {
                w = 4;
                lead = 0xf0;    /* 11110xxx */
        } else {
                /* beyond the Unicode code space */
                return (NULL);
        }

        if (dstlen < w)
                return (NULL);

        /*
         * construct utf-8, filling continuation bytes from the end
         */
        p = (u_char *)dst;
        for (i = w - 1 ; i >= 1 ; i--) {
                /* low 6 bits, tagged with the "10" continuation prefix */
                *(p + i) = (ucs4 & 0x3f) | 0x80;
                ucs4 >>= 6;
        }
        /* whatever bits remain belong in the lead byte */
        *p = (u_char)(ucs4 | lead);

        *utf8width = w;

        return (p);
}
526
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair.
 * The result holds the high surrogate in its upper 16 bits and the low
 * surrogate in its lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
        uint32_t offset = code - 0x10000;
        uint32_t high = 0xd800 | ((offset >> 10) & 0x3ff);
        uint32_t low = 0xdc00 | (offset & 0x3ff);

        return ((high << 16) | low);
}
533
/*
 * Rebuild a code point from a surrogate pair stored as four big-endian
 * bytes: ucs[0..1] is the high surrogate, ucs[2..3] the low surrogate.
 * Only the ten payload bits of each half are used.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
        uint32_t high = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
        uint32_t low = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

        return (((high << 10) | low) + 0x10000);
}
540