sys/libkern/iconv_ucs.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2003, 2005 Ryuichiro Imura
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 #include <sys/param.h>
  33 #include <sys/kernel.h>
  34 #include <sys/systm.h>
  35 #include <sys/malloc.h>
  36 #include <sys/iconv.h>
  37
  38 #include "iconv_converter_if.h"
  39
  40 /*
  41  * "UCS" converter
  42  */
  43
  44 #define KICONV_UCS_COMBINE      0x1
  45 #define KICONV_UCS_FROM_UTF8    0x2
  46 #define KICONV_UCS_TO_UTF8      0x4
  47 #define KICONV_UCS_FROM_LE      0x8
  48 #define KICONV_UCS_TO_LE        0x10
  49 #define KICONV_UCS_FROM_UTF16   0x20
  50 #define KICONV_UCS_TO_UTF16     0x40
  51 #define KICONV_UCS_UCS4         0x80
  52
  53 #define ENCODING_UTF16  "UTF-16BE"
  54 #define ENCODING_UTF8   "UTF-8"
  55
  56 static struct {
  57         const char *name;
  58         int from_flag, to_flag;
  59 } unicode_family[] = {
  60         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
  61         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
  62         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
  63         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
  64             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
  65         { NULL,         0,      0 }
  66 };
  67
/*
 * Forward declarations of the UTF-8 and surrogate-pair helpers that are
 * defined at the end of this file.
 */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);
  72
/*
 * Record that iconv_ucs depends on libiconv, but only when the
 * MODULE_DEPEND macro is available.
 * NOTE(review): the three numbers are presumably the minimum, preferred
 * and maximum libiconv versions -- confirm against MODULE_DEPEND(9).
 */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
  76
  77 /*
  78  * UCS converter instance
  79  */
struct iconv_ucs {
        KOBJ_FIELDS;
        int                     convtype;       /* KICONV_UCS_* bit mask */
        struct iconv_cspair *   d_csp;          /* csp handed to iconv_ucs_open() */
        struct iconv_cspair *   d_cspf;         /* cspf, kept only when the source
                                                   is UTF-8 or little-endian */
        void *                  f_ctp;          /* source -> ENCODING_UNICODE
                                                   handle, or NULL */
        void *                  t_ctp;          /* ENCODING_UNICODE -> destination
                                                   handle, or NULL */
        void *                  ctype;          /* handle passed to towlower() and
                                                   towupper(), or NULL */
};
  89
  90 static int
  91 iconv_ucs_open(struct iconv_converter_class *dcp,
  92         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
  93 {
  94         struct iconv_ucs *dp;
  95         int i;
  96         const char *from, *to;
  97
  98         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
  99         to = csp->cp_to;
 100         from = cspf ? cspf->cp_from : csp->cp_from;
 101
 102         dp->convtype = 0;
 103
 104         if (cspf)
 105                 dp->convtype |= KICONV_UCS_COMBINE;
 106         for (i = 0; unicode_family[i].name; i++) {
 107                 if (strcasecmp(from, unicode_family[i].name) == 0)
 108                         dp->convtype |= unicode_family[i].from_flag;
 109                 if (strcasecmp(to, unicode_family[i].name) == 0)
 110                         dp->convtype |= unicode_family[i].to_flag;
 111         }
 112         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
 113                 dp->convtype |= KICONV_UCS_UCS4;
 114         else
 115                 dp->convtype &= ~KICONV_UCS_UCS4;
 116
 117         dp->f_ctp = dp->t_ctp = NULL;
 118         if (dp->convtype & KICONV_UCS_COMBINE) {
 119                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
 120                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
 121                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
 122                 }
 123                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
 124                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
 125                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
 126                 }
 127         }
 128
 129         dp->ctype = NULL;
 130         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
 131                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
 132
 133         dp->d_csp = csp;
 134         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
 135                 if (cspf) {
 136                         dp->d_cspf = cspf;
 137                         cspf->cp_refcount++;
 138                 } else
 139                         csp->cp_refcount++;
 140         }
 141         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
 142                 csp->cp_refcount++;
 143         *dpp = (void*)dp;
 144         return 0;
 145 }
 146
 147 static int
 148 iconv_ucs_close(void *data)
 149 {
 150         struct iconv_ucs *dp = data;
 151
 152         if (dp->f_ctp)
 153                 iconv_close(dp->f_ctp);
 154         if (dp->t_ctp)
 155                 iconv_close(dp->t_ctp);
 156         if (dp->ctype)
 157                 iconv_close(dp->ctype);
 158         if (dp->d_cspf)
 159                 dp->d_cspf->cp_refcount--;
 160         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
 161                 dp->d_csp->cp_refcount--;
 162         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
 163                 dp->d_csp->cp_refcount--;
 164         kobj_delete((struct kobj*)data, M_ICONV);
 165         return 0;
 166 }
 167
 168 static int
 169 iconv_ucs_conv(void *d2p, const char **inbuf,
 170         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
 171         int convchar, int casetype)
 172 {
 173         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
 174         int ret = 0, i;
 175         size_t in, on, ir, or, inlen, outlen, ucslen;
 176         const char *src, *p;
 177         char *dst;
 178         u_char ucs[4], *q;
 179         uint32_t code;
 180
 181         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
 182                 return 0;
 183         ir = in = *inbytesleft;
 184         or = on = *outbytesleft;
 185         src = *inbuf;
 186         dst = *outbuf;
 187
 188         while (ir > 0 && or > 0) {
 189
 190                 /*
 191                  * The first half of conversion.
 192                  * (convert any code into ENCODING_UNICODE)
 193                  */
 194                 code = 0;
 195                 p = src;
 196                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
 197                         /* convert UTF-8 to ENCODING_UNICODE */
 198                         inlen = 0;
 199                         code = utf8_to_ucs4(p, &inlen, ir);
 200                         if (code == 0) {
 201                                 ret = -1;
 202                                 break;
 203                         }
 204
 205                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
 206                                 code = towlower(code, dp->ctype);
 207                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
 208                                 code = towupper(code, dp->ctype);
 209                         }
 210
 211                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
 212                                 /* reserved for utf-16 surrogate pair */
 213                                 /* invalid unicode */
 214                                 ret = -1;
 215                                 break;
 216                         }
 217
 218                         if (inlen == 4) {
 219                                 if (dp->convtype & KICONV_UCS_UCS4) {
 220                                         ucslen = 4;
 221                                         code = encode_surrogate(code);
 222                                 } else {
 223                                         /* can't handle with ucs-2 */
 224                                         ret = -1;
 225                                         break;
 226                                 }
 227                         } else {
 228                                 ucslen = 2;
 229                         }
 230
 231                         /* save UCS-4 into ucs[] */
 232                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
 233                                 *q++ = (code >> (i << 3)) & 0xff;
 234
 235                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
 236                         /* convert local code to ENCODING_UNICODE */
 237                         ucslen = 4;
 238                         inlen = ir;
 239                         q = ucs;
 240                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
 241                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
 242                         if (ret)
 243                                 break;
 244                         inlen = ir - inlen;
 245                         ucslen = 4 - ucslen;
 246
 247                 } else {
 248                         /* src code is a proper subset of ENCODING_UNICODE */
 249                         q = ucs;
 250                         if (dp->convtype & KICONV_UCS_FROM_LE) {
 251                                 *q = *(p + 1);
 252                                 *(q + 1) = *p;
 253                                 p += 2;
 254                         } else {
 255                                 *q = *p++;
 256                                 *(q + 1) = *p++;
 257                         }
 258                         if ((*q & 0xfc) == 0xd8) {
 259                                 if (dp->convtype & KICONV_UCS_UCS4 &&
 260                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
 261                                         inlen = ucslen = 4;
 262                                 } else {
 263                                         /* invalid unicode */
 264                                         ret = -1;
 265                                         break;
 266                                 }
 267                         } else {
 268                                 inlen = ucslen = 2;
 269                         }
 270                         if (ir < inlen) {
 271                                 ret = -1;
 272                                 break;
 273                         }
 274                         if (ucslen == 4) {
 275                                 q += 2;
 276                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
 277                                         *q = *(p + 1);
 278                                         *(q + 1) = *p;
 279                                 } else {
 280                                         *q = *p++;
 281                                         *(q + 1) = *p;
 282                                 }
 283                                 if ((*q & 0xfc) != 0xdc) {
 284                                         /* invalid unicode */
 285                                         ret = -1;
 286                                         break;
 287                                 }
 288                         }
 289                 }
 290
 291                 /*
 292                  * The second half of conversion.
 293                  * (convert ENCODING_UNICODE into any code)
 294                  */
 295                 p = ucs;
 296                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
 297                         q = (u_char *)dst;
 298                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
 299                                 /* decode surrogate pair */
 300                                 code = decode_surrogate(p);
 301                         } else {
 302                                 code = (ucs[0] << 8) | ucs[1];
 303                         }
 304
 305                         if (casetype == KICONV_LOWER && dp->ctype) {
 306                                 code = towlower(code, dp->ctype);
 307                         } else if (casetype == KICONV_UPPER && dp->ctype) {
 308                                 code = towupper(code, dp->ctype);
 309                         }
 310
 311                         outlen = 0;
 312                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
 313                                 ret = -1;
 314                                 break;
 315                         }
 316
 317                         src += inlen;
 318                         ir -= inlen;
 319                         dst += outlen;
 320                         or -= outlen;
 321
 322                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
 323                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
 324                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
 325                         if (ret)
 326                                 break;
 327
 328                         src += inlen;
 329                         ir -= inlen;
 330
 331                 } else {
 332                         /* dst code is a proper subset of ENCODING_UNICODE */
 333                         if (or < ucslen) {
 334                                 ret = -1;
 335                                 break;
 336                         }
 337                         src += inlen;
 338                         ir -= inlen;
 339                         or -= ucslen;
 340                         if (dp->convtype & KICONV_UCS_TO_LE) {
 341                                 *dst++ = *(p + 1);
 342                                 *dst++ = *p;
 343                                 p += 2;
 344                         } else {
 345                                 *dst++ = *p++;
 346                                 *dst++ = *p++;
 347                         }
 348                         if (ucslen == 4) {
 349                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
 350                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
 351                                         ret = -1;
 352                                         break;
 353                                 }
 354                                 if (dp->convtype & KICONV_UCS_TO_LE) {
 355                                         *dst++ = *(p + 1);
 356                                         *dst++ = *p;
 357                                 } else {
 358                                         *dst++ = *p++;
 359                                         *dst++ = *p;
 360                                 }
 361                         }
 362                 }
 363
 364                 if (convchar == 1)
 365                         break;
 366         }
 367
 368         *inbuf += in - ir;
 369         *outbuf += on - or;
 370         *inbytesleft -= in - ir;
 371         *outbytesleft -= on - or;
 372         return (ret);
 373 }
 374
 375 static int
 376 iconv_ucs_init(struct iconv_converter_class *dcp)
 377 {
 378         int error;
 379
 380         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
 381         if (error)
 382                 return (error);
 383         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
 384         if (error)
 385                 return (error);
 386         return (0);
 387 }
 388
 389 static int
 390 iconv_ucs_done(struct iconv_converter_class *dcp)
 391 {
 392         return (0);
 393 }
 394
 395 static const char *
 396 iconv_ucs_name(struct iconv_converter_class *dcp)
 397 {
 398         return (ENCODING_UNICODE);
 399 }
 400
/*
 * Method table that binds each iconv_converter interface method to the
 * implementation in this file.
 */
static kobj_method_t iconv_ucs_methods[] = {
        KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
        KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
        KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
        KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
        KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
        KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
        {0, 0}          /* end of table */
};

/*
 * NOTE(review): presumably declares and registers the "ucs" converter class
 * with the method table above and the given instance size -- confirm
 * against the KICONV_CONVERTER definition in <sys/iconv.h>.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
 412
 413 static uint32_t
 414 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
 415 {
 416         size_t i, w = 0;
 417         uint32_t ucs4 = 0;
 418
 419         /*
 420          * get leading 1 byte from utf-8
 421          */
 422         if ((*src & 0x80) == 0) {
 423                 /*
 424                  * leading 1 bit is "0"
 425                  *  utf-8: 0xxxxxxx
 426                  *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
 427                  */
 428                 w = 1;
 429                 /* get trailing 7 bits */
 430                 ucs4 = *src & 0x7f;
 431         } else if ((*src & 0xe0) == 0xc0) {
 432                 /*
 433                  * leading 3 bits are "110"
 434                  *  utf-8: 110xxxxx 10yyyyyy
 435                  *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
 436                  */
 437                 w = 2;
 438                 /* get trailing 5 bits */
 439                 ucs4 = *src & 0x1f;
 440         } else if ((*src & 0xf0) == 0xe0) {
 441                 /*
 442                  * leading 4 bits are "1110"
 443                  *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
 444                  *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
 445                  */
 446                 w = 3;
 447                 /* get trailing 4 bits */
 448                 ucs4 = *src & 0x0f;
 449         } else if ((*src & 0xf8) == 0xf0) {
 450                 /*
 451                  * leading 5 bits are "11110"
 452                  *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
 453                  *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
 454                  */
 455                 w = 4;
 456                 /* get trailing 3 bits */
 457                 ucs4 = *src & 0x07;
 458         } else {
 459                 /* out of utf-16 range or having illegal bits */
 460                 return (0);
 461         }
 462
 463         if (srclen < w)
 464                 return (0);
 465
 466         /*
 467          * get left parts from utf-8
 468          */
 469         for (i = 1 ; i < w ; i++) {
 470                 if ((*(src + i) & 0xc0) != 0x80) {
 471                         /* invalid: leading 2 bits are not "10" */
 472                         return (0);
 473                 }
 474                 /* concatenate trailing 6 bits into ucs4 */
 475                 ucs4 <<= 6;
 476                 ucs4 |= *(src + i) & 0x3f;
 477         }
 478
 479         *utf8width = w;
 480         return (ucs4);
 481 }
 482
 483 static u_char *
 484 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
 485 {
 486         u_char lead, *p;
 487         size_t i, w;
 488
 489         /*
 490          * determine utf-8 width and leading bits
 491          */
 492         if (ucs4 < 0x80) {
 493                 w = 1;
 494                 lead = 0;       /* "0" */
 495         } else if (ucs4 < 0x800) {
 496                 w = 2;
 497                 lead = 0xc0;    /* "11" */
 498         } else if (ucs4 < 0x10000) {
 499                 w = 3;
 500                 lead = 0xe0;    /* "111" */
 501         } else if (ucs4 < 0x200000) {
 502                 w = 4;
 503                 lead = 0xf0;    /* "1111" */
 504         } else {
 505                 return (NULL);
 506         }
 507
 508         if (dstlen < w)
 509                 return (NULL);
 510
 511         /*
 512          * construct utf-8
 513          */
 514         p = dst;
 515         for (i = w - 1 ; i >= 1 ; i--) {
 516                 /* get trailing 6 bits and put it with leading bit as "1" */
 517                 *(p + i) = (ucs4 & 0x3f) | 0x80;
 518                 ucs4 >>= 6;
 519         }
 520         *p = ucs4 | lead;
 521
 522         *utf8width = w;
 523
 524         return (p);
 525 }
 526
 527 static uint32_t
 528 encode_surrogate(uint32_t code)
 529 {
 530         return ((((code - 0x10000) << 6) & 0x3ff0000) |
 531             ((code - 0x10000) & 0x3ff) | 0xd800dc00);
 532 }
 533
 534 static uint32_t
 535 decode_surrogate(const u_char *ucs)
 536 {
 537         return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
 538             ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
 539 }
 540