]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - sys/libkern/iconv_ucs.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / sys / libkern / iconv_ucs.c
1 /*-
2  * Copyright (c) 2003, 2005 Ryuichiro Imura
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/kernel.h>
32 #include <sys/systm.h>
33 #include <sys/malloc.h>
34 #include <sys/iconv.h>
35
36 #include "iconv_converter_if.h"
37
/*
 * "UCS" converter
 *
 * Bits stored in iconv_ucs.convtype.  They are computed once, when the
 * converter instance is opened, and steer every subsequent conversion.
 */

/* a second charset pair (cspf) was supplied when the instance was opened */
#define KICONV_UCS_COMBINE	(1 << 0)
/* source / target encoding is UTF-8 */
#define KICONV_UCS_FROM_UTF8	(1 << 1)
#define KICONV_UCS_TO_UTF8	(1 << 2)
/* source / target encoding stores 16-bit units in little-endian order */
#define KICONV_UCS_FROM_LE	(1 << 3)
#define KICONV_UCS_TO_LE	(1 << 4)
/* source / target encoding is UTF-16, so surrogate pairs may appear */
#define KICONV_UCS_FROM_UTF16	(1 << 5)
#define KICONV_UCS_TO_UTF16	(1 << 6)
/* the intermediate form may hold a 4-byte surrogate pair */
#define KICONV_UCS_UCS4		(1 << 7)

/* charset names compared against ENCODING_UNICODE and passed to iconv_open() */
#define ENCODING_UTF16	"UTF-16BE"
#define ENCODING_UTF8	"UTF-8"
53
54 static struct {
55         const char *name;
56         int from_flag, to_flag;
57 } unicode_family[] = {
58         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
59         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
60         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
61         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
62             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
63         { NULL,         0,      0 }
64 };
65
/*
 * File-local helpers, defined after the converter methods.
 */

/* UTF-8 <-> UCS-4 code point transcoding of a single character */
static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);

/* UCS-4 code point <-> UTF-16 surrogate pair */
static uint32_t	 encode_surrogate(uint32_t code);
static uint32_t	 decode_surrogate(const u_char *ucs);

#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
74
75 /*
76  * UCS converter instance
77  */
78 struct iconv_ucs {
79         KOBJ_FIELDS;
80         int                     convtype;
81         struct iconv_cspair *   d_csp;
82         struct iconv_cspair *   d_cspf;
83         void *                  f_ctp;
84         void *                  t_ctp;
85         void *                  ctype;
86 };
87
88 static int
89 iconv_ucs_open(struct iconv_converter_class *dcp,
90         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
91 {
92         struct iconv_ucs *dp;
93         int i;
94         const char *from, *to;
95
96         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97         to = csp->cp_to;
98         from = cspf ? cspf->cp_from : csp->cp_from;
99
100         dp->convtype = 0;
101
102         if (cspf)
103                 dp->convtype |= KICONV_UCS_COMBINE;
104         for (i = 0; unicode_family[i].name; i++) {
105                 if (strcmp(from, unicode_family[i].name) == 0)
106                         dp->convtype |= unicode_family[i].from_flag;
107                 if (strcmp(to, unicode_family[i].name) == 0)
108                         dp->convtype |= unicode_family[i].to_flag;
109         }
110         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
111                 dp->convtype |= KICONV_UCS_UCS4;
112         else
113                 dp->convtype &= ~KICONV_UCS_UCS4;
114
115         dp->f_ctp = dp->t_ctp = NULL;
116         if (dp->convtype & KICONV_UCS_COMBINE) {
117                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
118                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
119                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120                 }
121                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
122                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
123                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
124                 }
125         }
126
127         dp->ctype = NULL;
128         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
129                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
130
131         dp->d_csp = csp;
132         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
133                 if (cspf) {
134                         dp->d_cspf = cspf;
135                         cspf->cp_refcount++;
136                 } else
137                         csp->cp_refcount++;
138         }
139         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
140                 csp->cp_refcount++;
141         *dpp = (void*)dp;
142         return 0;
143 }
144
145 static int
146 iconv_ucs_close(void *data)
147 {
148         struct iconv_ucs *dp = data;
149
150         if (dp->f_ctp)
151                 iconv_close(dp->f_ctp);
152         if (dp->t_ctp)
153                 iconv_close(dp->t_ctp);
154         if (dp->ctype)
155                 iconv_close(dp->ctype);
156         if (dp->d_cspf)
157                 dp->d_cspf->cp_refcount--;
158         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
159                 dp->d_csp->cp_refcount--;
160         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
161                 dp->d_csp->cp_refcount--;
162         kobj_delete((struct kobj*)data, M_ICONV);
163         return 0;
164 }
165
166 static int
167 iconv_ucs_conv(void *d2p, const char **inbuf,
168         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
169         int convchar, int casetype)
170 {
171         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172         int ret = 0, i;
173         size_t in, on, ir, or, inlen, outlen, ucslen;
174         const char *src, *p;
175         char *dst;
176         u_char ucs[4], *q;
177         uint32_t code;
178
179         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
180                 return 0;
181         ir = in = *inbytesleft;
182         or = on = *outbytesleft;
183         src = *inbuf;
184         dst = *outbuf;
185
186         while (ir > 0 && or > 0) {
187
188                 /*
189                  * The first half of conversion.
190                  * (convert any code into ENCODING_UNICODE)
191                  */
192                 code = 0;
193                 p = src;
194                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
195                         /* convert UTF-8 to ENCODING_UNICODE */
196                         inlen = 0;
197                         code = utf8_to_ucs4(p, &inlen, ir);
198                         if (code == 0) {
199                                 ret = -1;
200                                 break;
201                         }
202
203                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
204                                 code = towlower(code, dp->ctype);
205                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
206                                 code = towupper(code, dp->ctype);
207                         }
208
209                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
210                                 /* reserved for utf-16 surrogate pair */
211                                 /* invalid unicode */
212                                 ret = -1;
213                                 break;
214                         }
215
216                         if (inlen == 4) {
217                                 if (dp->convtype & KICONV_UCS_UCS4) {
218                                         ucslen = 4;
219                                         code = encode_surrogate(code);
220                                 } else {
221                                         /* can't handle with ucs-2 */
222                                         ret = -1;
223                                         break;
224                                 }
225                         } else {
226                                 ucslen = 2;
227                         }
228
229                         /* save UCS-4 into ucs[] */
230                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
231                                 *q++ = (code >> (i << 3)) & 0xff;
232
233                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
234                         /* convert local code to ENCODING_UNICODE */
235                         ucslen = 4;
236                         inlen = ir;
237                         q = ucs;
238                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
239                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
240                         if (ret)
241                                 break;
242                         inlen = ir - inlen;
243                         ucslen = 4 - ucslen;
244
245                 } else {
246                         /* src code is a proper subset of ENCODING_UNICODE */
247                         q = ucs;
248                         if (dp->convtype & KICONV_UCS_FROM_LE) {
249                                 *q = *(p + 1);
250                                 *(q + 1) = *p;
251                                 p += 2;
252                         } else {
253                                 *q = *p++;
254                                 *(q + 1) = *p++;
255                         }
256                         if ((*q & 0xfc) == 0xd8) {
257                                 if (dp->convtype & KICONV_UCS_UCS4 &&
258                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
259                                         inlen = ucslen = 4;
260                                 } else {
261                                         /* invalid unicode */
262                                         ret = -1;
263                                         break;
264                                 }
265                         } else {
266                                 inlen = ucslen = 2;
267                         }
268                         if (ir < inlen) {
269                                 ret = -1;
270                                 break;
271                         }
272                         if (ucslen == 4) {
273                                 q += 2;
274                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
275                                         *q = *(p + 1);
276                                         *(q + 1) = *p;
277                                 } else {
278                                         *q = *p++;
279                                         *(q + 1) = *p;
280                                 }
281                                 if ((*q & 0xfc) != 0xdc) {
282                                         /* invalid unicode */
283                                         ret = -1;
284                                         break;
285                                 }
286                         }
287                 }
288
289                 /*
290                  * The second half of conversion.
291                  * (convert ENCODING_UNICODE into any code)
292                  */
293                 p = ucs;
294                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
295                         q = (u_char *)dst;
296                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
297                                 /* decode surrogate pair */
298                                 code = decode_surrogate(p);
299                         } else {
300                                 code = (ucs[0] << 8) | ucs[1];
301                         }
302
303                         if (casetype == KICONV_LOWER && dp->ctype) {
304                                 code = towlower(code, dp->ctype);
305                         } else if (casetype == KICONV_UPPER && dp->ctype) {
306                                 code = towupper(code, dp->ctype);
307                         }
308
309                         outlen = 0;
310                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
311                                 ret = -1;
312                                 break;
313                         }
314
315                         src += inlen;
316                         ir -= inlen;
317                         dst += outlen;
318                         or -= outlen;
319
320                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
321                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
322                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
323                         if (ret)
324                                 break;
325
326                         src += inlen;
327                         ir -= inlen;
328
329                 } else {
330                         /* dst code is a proper subset of ENCODING_UNICODE */
331                         if (or < ucslen) {
332                                 ret = -1;
333                                 break;
334                         }
335                         src += inlen;
336                         ir -= inlen;
337                         or -= ucslen;
338                         if (dp->convtype & KICONV_UCS_TO_LE) {
339                                 *dst++ = *(p + 1);
340                                 *dst++ = *p;
341                                 p += 2;
342                         } else {
343                                 *dst++ = *p++;
344                                 *dst++ = *p++;
345                         }
346                         if (ucslen == 4) {
347                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
348                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
349                                         ret = -1;
350                                         break;
351                                 }
352                                 if (dp->convtype & KICONV_UCS_TO_LE) {
353                                         *dst++ = *(p + 1);
354                                         *dst++ = *p;
355                                 } else {
356                                         *dst++ = *p++;
357                                         *dst++ = *p;
358                                 }
359                         }
360                 }
361
362                 if (convchar == 1)
363                         break;
364         }
365
366         *inbuf += in - ir;
367         *outbuf += on - or;
368         *inbytesleft -= in - ir;
369         *outbytesleft -= on - or;
370         return (ret);
371 }
372
373 static int
374 iconv_ucs_init(struct iconv_converter_class *dcp)
375 {
376         int error;
377
378         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
379         if (error)
380                 return (error);
381         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
382         if (error)
383                 return (error);
384         return (0);
385 }
386
/*
 * Class teardown hook.  Nothing to undo here; always reports success.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}
392
393 static const char *
394 iconv_ucs_name(struct iconv_converter_class *dcp)
395 {
396         return (ENCODING_UNICODE);
397 }
398
399 static kobj_method_t iconv_ucs_methods[] = {
400         KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
401         KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
402         KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
403         KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
404         KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
405         KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
406         {0, 0}
407 };
408
409 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
410
/*
 * Decode the single UTF-8 sequence at src into a UCS-4 code point.
 *
 * src       - input bytes; no more than srclen of them are examined.
 * utf8width - on success, set to the number of bytes consumed (1..4);
 *             left untouched on failure.
 * srclen    - number of readable bytes at src.
 *
 * Returns the code point, or 0 when the input is empty, truncated, has an
 * illegal lead or continuation byte, would need more than four bytes, or
 * is an overlong encoding (a value stored in more bytes than it requires).
 * A NUL byte also decodes to 0, so U+0000 is not distinguishable from an
 * error by the return value alone.
 *
 * Surrogate values and values above 0x10ffff are NOT rejected here; the
 * caller is expected to range-check the result.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs w bytes, indexed by w */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w = 0;
	uint32_t ucs4 = 0;

	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		/* get trailing 7 bits */
		ucs4 = s[0] & 0x7f;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		/* get trailing 5 bits */
		ucs4 = s[0] & 0x1f;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		/* get trailing 4 bits */
		ucs4 = s[0] & 0x0f;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		/* get trailing 3 bits */
		ucs4 = s[0] & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 <<= 6;
		ucs4 |= s[i] & 0x3f;
	}

	/*
	 * Reject overlong forms: accepting them would give one character
	 * several byte spellings and break callers that infer the size of
	 * the code point from the sequence width.
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
480
/*
 * Encode one UCS-4 code point as UTF-8.
 *
 * ucs4      - code point to encode.
 * dst       - output buffer.
 * utf8width - on success, set to the number of bytes written (1..4);
 *             left untouched on failure.
 * dstlen    - space available at dst.
 *
 * Returns dst (as u_char *) on success.  Returns NULL, writing nothing,
 * when ucs4 lies beyond the Unicode range (above 0x10ffff) or when dstlen
 * is too small for the encoded form.
 *
 * Values in the surrogate range 0xd800..0xdfff are not filtered here and
 * are encoded like any other 3-byte value.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0x00;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "110" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "1110" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "11110" */
	} else {
		/* not a Unicode code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, filling the continuation bytes from the last
	 * one backwards so that ucs4 is left holding the lead-byte payload
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* get trailing 6 bits and put it with leading bits "10" */
		*(p + i) = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	*p = (u_char)(ucs4 | lead);

	*utf8width = w;

	return (p);
}
524
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair.
 *
 * The result carries the high surrogate in its upper 16 bits and the low
 * surrogate in its lower 16 bits.  The expression is only meaningful for
 * code >= 0x10000; smaller values wrap around in the subtraction.
 */
static uint32_t
encode_surrogate(register uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* top ten bits of the offset, moved up to bits 16..25 */
	const uint32_t high_bits = (offset << 6) & 0x3ff0000;
	/* bottom ten bits of the offset stay in bits 0..9 */
	const uint32_t low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
531
/*
 * Rebuild a code point from a UTF-16 surrogate pair.
 *
 * ucs points at four bytes: the high surrogate followed by the low
 * surrogate, each stored most significant byte first.  Only the ten
 * payload bits of each half are used; the surrogate marker bits are
 * masked off rather than verified.
 */
static uint32_t
decode_surrogate(register const u_char *ucs)
{
	/* ten payload bits of each 16-bit half */
	const uint32_t high10 = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	const uint32_t low10 = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high10 << 10) | low10) + 0x10000);
}
538