contrib/libarchive/libarchive/archive_string.c

   1 /*-
   2  * Copyright (c) 2003-2011 Tim Kientzle
   3  * Copyright (c) 2011-2012 Michihiro NAKAJIMA
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  18  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "archive_platform.h"
  28 __FBSDID("$FreeBSD$");
  29
  30 /*
  31  * Basic resizable string support, to simplify manipulating arbitrary-sized
  32  * strings while minimizing heap activity.
  33  *
  34  * In particular, the buffer used by a string object is only grown, it
  35  * never shrinks, so you can clear and reuse the same string object
  36  * without incurring additional memory allocations.
  37  */
  38
  39 #ifdef HAVE_ERRNO_H
  40 #include <errno.h>
  41 #endif
  42 #ifdef HAVE_ICONV_H
  43 #include <iconv.h>
  44 #endif
  45 #ifdef HAVE_LANGINFO_H
  46 #include <langinfo.h>
  47 #endif
  48 #ifdef HAVE_LOCALCHARSET_H
  49 #include <localcharset.h>
  50 #endif
  51 #ifdef HAVE_STDLIB_H
  52 #include <stdlib.h>
  53 #endif
  54 #ifdef HAVE_STRING_H
  55 #include <string.h>
  56 #endif
  57 #ifdef HAVE_WCHAR_H
  58 #include <wchar.h>
  59 #endif
  60 #if defined(_WIN32) && !defined(__CYGWIN__)
  61 #include <windows.h>
  62 #include <locale.h>
  63 #endif
  64
  65 #include "archive_endian.h"
  66 #include "archive_private.h"
  67 #include "archive_string.h"
  68 #include "archive_string_composition.h"
  69
/*
 * Fallback for platforms that provide neither a wmemcpy() function nor a
 * wmemcpy macro: copy `i` wide characters using memcpy().
 */
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
#define wmemcpy(a,b,i)  (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
#endif

/*
 * Fallback for platforms that provide neither a wmemmove() function nor a
 * wmemmove macro: move `i` wide characters using memmove().
 */
#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
#define wmemmove(a,b,i)  (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
#endif
  77
/*
 * State describing one character-set conversion ("from" charset to "to"
 * charset), together with the converter functions that implement it.
 */
struct archive_string_conv {
        /* Link to another conversion object (presumably a per-archive
         * list; see find_sconv_object()/add_sconv_object() -- TODO confirm). */
        struct archive_string_conv      *next;
        /* Names of the source and destination character sets. */
        char                            *from_charset;
        char                            *to_charset;
        /* Code pages of the source and destination sides; the Windows
         * code passes these to MultiByteToWideChar()/WideCharToMultiByte(). */
        unsigned                         from_cp;
        unsigned                         to_cp;
        /* Set to 1 if from_charset and to_charset are the same. */
        int                              same;
        /* Bitwise OR of the SCONV_* flags defined below. */
        int                              flag;
#define SCONV_TO_CHARSET        1       /* MBS is being converted to the
                                         * specified charset. */
#define SCONV_FROM_CHARSET      (1<<1)  /* MBS is being converted from the
                                         * specified charset. */
#define SCONV_BEST_EFFORT       (1<<2)  /* Copy at least the ASCII codes. */
#define SCONV_WIN_CP            (1<<3)  /* Use the Windows API for converting
                                         * MBS. */
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4)  /* Incorrect UTF-8 produced by
                                         * libarchive 2.x under a wrong
                                         * assumption. */
#define SCONV_NORMALIZATION_C   (1<<6)  /* Normalize to Form C before the
                                         * UTF-8 characters are actually
                                         * processed. */
#define SCONV_NORMALIZATION_D   (1<<7)  /* Normalize to Form D before the
                                         * UTF-8 characters are actually
                                         * processed.
                                         * Currently this is only used on
                                         * Mac OS X. */
#define SCONV_TO_UTF8           (1<<8)  /* "to charset" side is UTF-8. */
#define SCONV_FROM_UTF8         (1<<9)  /* "from charset" side is UTF-8. */
#define SCONV_TO_UTF16BE        (1<<10) /* "to charset" side is UTF-16BE. */
#define SCONV_FROM_UTF16BE      (1<<11) /* "from charset" side is UTF-16BE. */
#define SCONV_TO_UTF16LE        (1<<12) /* "to charset" side is UTF-16LE. */
#define SCONV_FROM_UTF16LE      (1<<13) /* "from charset" side is UTF-16LE. */
/* Masks matching either byte order of UTF-16 on the given side. */
#define SCONV_TO_UTF16          (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
#define SCONV_FROM_UTF16        (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)

#if HAVE_ICONV
        /* iconv conversion descriptors. */
        iconv_t                          cd;
        iconv_t                          cd_w;/* Used by archive_mstring on
                                               * Windows. */
#endif
        /* A temporary buffer for normalization. */
        struct archive_string            utftmp;
        /* Converter functions; each takes (destination string, source
         * bytes, source length, this conversion object). */
        int (*converter[2])(struct archive_string *, const void *, size_t,
            struct archive_string_conv *);
        /* Presumably the number of valid entries in converter[] --
         * TODO confirm against the code that fills the array. */
        int                              nconverter;
};
 123
/* Code page numbers used by this file. */
#define CP_C_LOCALE     0       /* "C" locale only for this file. */
#define CP_UTF16LE      1200
#define CP_UTF16BE      1201

/* Classification of UTF-16 surrogate code units. */
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
#define IS_LOW_SURROGATE_LA(uc)  ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
/* Despite its name, this is true for any single value in the whole
 * surrogate range (high or low), not for a pair of values. */
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
#define UNICODE_MAX             0x10FFFF
#define UNICODE_R_CHAR          0xFFFD  /* Replacement character. */
/* U+FFFD (replacement character) encoded in UTF-8.  The array holds
 * exactly three bytes and is NOT NUL-terminated. */
static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
 135
 136 static struct archive_string_conv *find_sconv_object(struct archive *,
 137         const char *, const char *);
 138 static void add_sconv_object(struct archive *, struct archive_string_conv *);
 139 static struct archive_string_conv *create_sconv_object(const char *,
 140         const char *, unsigned, int);
 141 static void free_sconv_object(struct archive_string_conv *);
 142 static struct archive_string_conv *get_sconv_object(struct archive *,
 143         const char *, const char *, int);
 144 static unsigned make_codepage_from_charset(const char *);
 145 static unsigned get_current_codepage(void);
 146 static unsigned get_current_oemcp(void);
 147 static size_t mbsnbytes(const void *, size_t);
 148 static size_t utf16nbytes(const void *, size_t);
 149 #if defined(_WIN32) && !defined(__CYGWIN__)
 150 static int archive_wstring_append_from_mbs_in_codepage(
 151     struct archive_wstring *, const char *, size_t,
 152     struct archive_string_conv *);
 153 static int archive_string_append_from_wcs_in_codepage(struct archive_string *,
 154     const wchar_t *, size_t, struct archive_string_conv *);
 155 static int is_big_endian(void);
 156 static int strncat_in_codepage(struct archive_string *, const void *,
 157     size_t, struct archive_string_conv *);
 158 static int win_strncat_from_utf16be(struct archive_string *, const void *,
 159     size_t, struct archive_string_conv *);
 160 static int win_strncat_from_utf16le(struct archive_string *, const void *,
 161     size_t, struct archive_string_conv *);
 162 static int win_strncat_to_utf16be(struct archive_string *, const void *,
 163     size_t, struct archive_string_conv *);
 164 static int win_strncat_to_utf16le(struct archive_string *, const void *,
 165     size_t, struct archive_string_conv *);
 166 #endif
 167 static int best_effort_strncat_from_utf16be(struct archive_string *,
 168     const void *, size_t, struct archive_string_conv *);
 169 static int best_effort_strncat_from_utf16le(struct archive_string *,
 170     const void *, size_t, struct archive_string_conv *);
 171 static int best_effort_strncat_to_utf16be(struct archive_string *,
 172     const void *, size_t, struct archive_string_conv *);
 173 static int best_effort_strncat_to_utf16le(struct archive_string *,
 174     const void *, size_t, struct archive_string_conv *);
 175 #if defined(HAVE_ICONV)
 176 static int iconv_strncat_in_locale(struct archive_string *, const void *,
 177     size_t, struct archive_string_conv *);
 178 #endif
 179 static int best_effort_strncat_in_locale(struct archive_string *,
 180     const void *, size_t, struct archive_string_conv *);
 181 static int _utf8_to_unicode(uint32_t *, const char *, size_t);
 182 static int utf8_to_unicode(uint32_t *, const char *, size_t);
 183 static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t);
 184 static int cesu8_to_unicode(uint32_t *, const char *, size_t);
 185 static size_t unicode_to_utf8(char *, size_t, uint32_t);
 186 static int utf16_to_unicode(uint32_t *, const char *, size_t, int);
 187 static size_t unicode_to_utf16be(char *, size_t, uint32_t);
 188 static size_t unicode_to_utf16le(char *, size_t, uint32_t);
 189 static int strncat_from_utf8_libarchive2(struct archive_string *,
 190     const void *, size_t, struct archive_string_conv *);
 191 static int strncat_from_utf8_to_utf8(struct archive_string *, const void *,
 192     size_t, struct archive_string_conv *);
 193 static int archive_string_normalize_C(struct archive_string *, const void *,
 194     size_t, struct archive_string_conv *);
 195 static int archive_string_normalize_D(struct archive_string *, const void *,
 196     size_t, struct archive_string_conv *);
 197 static int archive_string_append_unicode(struct archive_string *,
 198     const void *, size_t, struct archive_string_conv *);
 199
 200 static struct archive_string *
 201 archive_string_append(struct archive_string *as, const char *p, size_t s)
 202 {
 203         if (archive_string_ensure(as, as->length + s + 1) == NULL)
 204                 return (NULL);
 205         memmove(as->s + as->length, p, s);
 206         as->length += s;
 207         as->s[as->length] = 0;
 208         return (as);
 209 }
 210
 211 static struct archive_wstring *
 212 archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s)
 213 {
 214         if (archive_wstring_ensure(as, as->length + s + 1) == NULL)
 215                 return (NULL);
 216         wmemmove(as->s + as->length, p, s);
 217         as->length += s;
 218         as->s[as->length] = 0;
 219         return (as);
 220 }
 221
 222 struct archive_string *
 223 archive_array_append(struct archive_string *as, const char *p, size_t s)
 224 {
 225         return archive_string_append(as, p, s);
 226 }
 227
 228 void
 229 archive_string_concat(struct archive_string *dest, struct archive_string *src)
 230 {
 231         if (archive_string_append(dest, src->s, src->length) == NULL)
 232                 __archive_errx(1, "Out of memory");
 233 }
 234
 235 void
 236 archive_wstring_concat(struct archive_wstring *dest,
 237     struct archive_wstring *src)
 238 {
 239         if (archive_wstring_append(dest, src->s, src->length) == NULL)
 240                 __archive_errx(1, "Out of memory");
 241 }
 242
 243 void
 244 archive_string_free(struct archive_string *as)
 245 {
 246         as->length = 0;
 247         as->buffer_length = 0;
 248         free(as->s);
 249         as->s = NULL;
 250 }
 251
 252 void
 253 archive_wstring_free(struct archive_wstring *as)
 254 {
 255         as->length = 0;
 256         as->buffer_length = 0;
 257         free(as->s);
 258         as->s = NULL;
 259 }
 260
 261 struct archive_wstring *
 262 archive_wstring_ensure(struct archive_wstring *as, size_t s)
 263 {
 264         return (struct archive_wstring *)
 265                 archive_string_ensure((struct archive_string *)as,
 266                                         s * sizeof(wchar_t));
 267 }
 268
 269 /* Returns NULL on any allocation failure. */
 270 struct archive_string *
 271 archive_string_ensure(struct archive_string *as, size_t s)
 272 {
 273         char *p;
 274         size_t new_length;
 275
 276         /* If buffer is already big enough, don't reallocate. */
 277         if (as->s && (s <= as->buffer_length))
 278                 return (as);
 279
 280         /*
 281          * Growing the buffer at least exponentially ensures that
 282          * append operations are always linear in the number of
 283          * characters appended.  Using a smaller growth rate for
 284          * larger buffers reduces memory waste somewhat at the cost of
 285          * a larger constant factor.
 286          */
 287         if (as->buffer_length < 32)
 288                 /* Start with a minimum 32-character buffer. */
 289                 new_length = 32;
 290         else if (as->buffer_length < 8192)
 291                 /* Buffers under 8k are doubled for speed. */
 292                 new_length = as->buffer_length + as->buffer_length;
 293         else {
 294                 /* Buffers 8k and over grow by at least 25% each time. */
 295                 new_length = as->buffer_length + as->buffer_length / 4;
 296                 /* Be safe: If size wraps, fail. */
 297                 if (new_length < as->buffer_length) {
 298                         /* On failure, wipe the string and return NULL. */
 299                         archive_string_free(as);
 300                         errno = ENOMEM;/* Make sure errno has ENOMEM. */
 301                         return (NULL);
 302                 }
 303         }
 304         /*
 305          * The computation above is a lower limit to how much we'll
 306          * grow the buffer.  In any case, we have to grow it enough to
 307          * hold the request.
 308          */
 309         if (new_length < s)
 310                 new_length = s;
 311         /* Now we can reallocate the buffer. */
 312         p = (char *)realloc(as->s, new_length);
 313         if (p == NULL) {
 314                 /* On failure, wipe the string and return NULL. */
 315                 archive_string_free(as);
 316                 errno = ENOMEM;/* Make sure errno has ENOMEM. */
 317                 return (NULL);
 318         }
 319
 320         as->s = p;
 321         as->buffer_length = new_length;
 322         return (as);
 323 }
 324
 325 /*
 326  * TODO: See if there's a way to avoid scanning
 327  * the source string twice.  Then test to see
 328  * if it actually helps (remember that we're almost
 329  * always called with pretty short arguments, so
 330  * such an optimization might not help).
 331  */
 332 struct archive_string *
 333 archive_strncat(struct archive_string *as, const void *_p, size_t n)
 334 {
 335         size_t s;
 336         const char *p, *pp;
 337
 338         p = (const char *)_p;
 339
 340         /* Like strlen(p), except won't examine positions beyond p[n]. */
 341         s = 0;
 342         pp = p;
 343         while (s < n && *pp) {
 344                 pp++;
 345                 s++;
 346         }
 347         if ((as = archive_string_append(as, p, s)) == NULL)
 348                 __archive_errx(1, "Out of memory");
 349         return (as);
 350 }
 351
 352 struct archive_wstring *
 353 archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)
 354 {
 355         size_t s;
 356         const wchar_t *pp;
 357
 358         /* Like strlen(p), except won't examine positions beyond p[n]. */
 359         s = 0;
 360         pp = p;
 361         while (s < n && *pp) {
 362                 pp++;
 363                 s++;
 364         }
 365         if ((as = archive_wstring_append(as, p, s)) == NULL)
 366                 __archive_errx(1, "Out of memory");
 367         return (as);
 368 }
 369
 370 struct archive_string *
 371 archive_strcat(struct archive_string *as, const void *p)
 372 {
 373         /* strcat is just strncat without an effective limit.
 374          * Assert that we'll never get called with a source
 375          * string over 16MB.
 376          * TODO: Review all uses of strcat in the source
 377          * and try to replace them with strncat().
 378          */
 379         return archive_strncat(as, p, 0x1000000);
 380 }
 381
 382 struct archive_wstring *
 383 archive_wstrcat(struct archive_wstring *as, const wchar_t *p)
 384 {
 385         /* Ditto. */
 386         return archive_wstrncat(as, p, 0x1000000);
 387 }
 388
 389 struct archive_string *
 390 archive_strappend_char(struct archive_string *as, char c)
 391 {
 392         if ((as = archive_string_append(as, &c, 1)) == NULL)
 393                 __archive_errx(1, "Out of memory");
 394         return (as);
 395 }
 396
 397 struct archive_wstring *
 398 archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)
 399 {
 400         if ((as = archive_wstring_append(as, &c, 1)) == NULL)
 401                 __archive_errx(1, "Out of memory");
 402         return (as);
 403 }
 404
 405 /*
 406  * Get the "current character set" name to use with iconv.
 407  * On FreeBSD, the empty character set name "" chooses
 408  * the correct character encoding for the current locale,
 409  * so this isn't necessary.
 410  * But iconv on Mac OS 10.6 doesn't seem to handle this correctly;
 411  * on that system, we have to explicitly call nl_langinfo()
 412  * to get the right name.  Not sure about other platforms.
 413  *
 414  * NOTE: GNU libiconv does not recognize the character-set name
 415  * which some platform nl_langinfo(CODESET) returns, so we should
 416  * use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv.
 417  */
 418 static const char *
 419 default_iconv_charset(const char *charset) {
 420         if (charset != NULL && charset[0] != '\0')
 421                 return charset;
 422 #if HAVE_LOCALE_CHARSET && !defined(__APPLE__)
 423         /* locale_charset() is broken on Mac OS */
 424         return locale_charset();
 425 #elif HAVE_NL_LANGINFO
 426         return nl_langinfo(CODESET);
 427 #else
 428         return "";
 429 #endif
 430 }
 431
 432 #if defined(_WIN32) && !defined(__CYGWIN__)
 433
 434 /*
 435  * Convert MBS to WCS.
 436  * Note: returns -1 if conversion fails.
 437  */
 438 int
 439 archive_wstring_append_from_mbs(struct archive_wstring *dest,
 440     const char *p, size_t len)
 441 {
 442         return archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL);
 443 }
 444
 445 static int
 446 archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest,
 447     const char *s, size_t length, struct archive_string_conv *sc)
 448 {
 449         int count, ret = 0;
 450         UINT from_cp;
 451
 452         if (sc != NULL)
 453                 from_cp = sc->from_cp;
 454         else
 455                 from_cp = get_current_codepage();
 456
 457         if (from_cp == CP_C_LOCALE) {
 458                 /*
 459                  * "C" locale special process.
 460                  */
 461                 wchar_t *ws;
 462                 const unsigned char *mp;
 463
 464                 if (NULL == archive_wstring_ensure(dest,
 465                     dest->length + length + 1))
 466                         return (-1);
 467
 468                 ws = dest->s + dest->length;
 469                 mp = (const unsigned char *)s;
 470                 count = 0;
 471                 while (count < (int)length && *mp) {
 472                         *ws++ = (wchar_t)*mp++;
 473                         count++;
 474                 }
 475         } else if (sc != NULL &&
 476             (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) {
 477                 /*
 478                  * Normalize UTF-8 and UTF-16BE and convert it directly
 479                  * to UTF-16 as wchar_t.
 480                  */
 481                 struct archive_string u16;
 482                 int saved_flag = sc->flag;/* save current flag. */
 483
 484                 if (is_big_endian())
 485                         sc->flag |= SCONV_TO_UTF16BE;
 486                 else
 487                         sc->flag |= SCONV_TO_UTF16LE;
 488
 489                 if (sc->flag & SCONV_FROM_UTF16) {
 490                         /*
 491                          *  UTF-16BE/LE NFD ===> UTF-16 NFC
 492                          *  UTF-16BE/LE NFC ===> UTF-16 NFD
 493                          */
 494                         count = (int)utf16nbytes(s, length);
 495                 } else {
 496                         /*
 497                          *  UTF-8 NFD ===> UTF-16 NFC
 498                          *  UTF-8 NFC ===> UTF-16 NFD
 499                          */
 500                         count = (int)mbsnbytes(s, length);
 501                 }
 502                 u16.s = (char *)dest->s;
 503                 u16.length = dest->length << 1;;
 504                 u16.buffer_length = dest->buffer_length;
 505                 if (sc->flag & SCONV_NORMALIZATION_C)
 506                         ret = archive_string_normalize_C(&u16, s, count, sc);
 507                 else
 508                         ret = archive_string_normalize_D(&u16, s, count, sc);
 509                 dest->s = (wchar_t *)u16.s;
 510                 dest->length = u16.length >> 1;
 511                 dest->buffer_length = u16.buffer_length;
 512                 sc->flag = saved_flag;/* restore the saved flag. */
 513                 return (ret);
 514         } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) {
 515                 count = (int)utf16nbytes(s, length);
 516                 count >>= 1; /* to be WCS length */
 517                 /* Allocate memory for WCS. */
 518                 if (NULL == archive_wstring_ensure(dest,
 519                     dest->length + count + 1))
 520                         return (-1);
 521                 wmemcpy(dest->s + dest->length, (const wchar_t *)s, count);
 522                 if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) {
 523                         uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
 524                         int b;
 525                         for (b = 0; b < count; b++) {
 526                                 uint16_t val = archive_le16dec(u16+b);
 527                                 archive_be16enc(u16+b, val);
 528                         }
 529                 } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) {
 530                         uint16_t *u16 = (uint16_t *)(dest->s + dest->length);
 531                         int b;
 532                         for (b = 0; b < count; b++) {
 533                                 uint16_t val = archive_be16dec(u16+b);
 534                                 archive_le16enc(u16+b, val);
 535                         }
 536                 }
 537         } else {
 538                 DWORD mbflag;
 539                 size_t buffsize;
 540
 541                 if (sc == NULL)
 542                         mbflag = 0;
 543                 else if (sc->flag & SCONV_FROM_CHARSET) {
 544                         /* Do not trust the length which comes from
 545                          * an archive file. */
 546                         length = mbsnbytes(s, length);
 547                         mbflag = 0;
 548                 } else
 549                         mbflag = MB_PRECOMPOSED;
 550
 551                 buffsize = dest->length + length + 1;
 552                 do {
 553                         /* Allocate memory for WCS. */
 554                         if (NULL == archive_wstring_ensure(dest, buffsize))
 555                                 return (-1);
 556                         /* Convert MBS to WCS. */
 557                         count = MultiByteToWideChar(from_cp,
 558                             mbflag, s, (int)length, dest->s + dest->length,
 559                             (int)(dest->buffer_length >> 1) -1);
 560                         if (count == 0 &&
 561                             GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 562                                 /* Expand the WCS buffer. */
 563                                 buffsize = dest->buffer_length << 1;
 564                                 continue;
 565                         }
 566                         if (count == 0 && length != 0)
 567                                 ret = -1;
 568                         break;
 569                 } while (1);
 570         }
 571         dest->length += count;
 572         dest->s[dest->length] = L'\0';
 573         return (ret);
 574 }
 575
 576 #else
 577
 578 /*
 579  * Convert MBS to WCS.
 580  * Note: returns -1 if conversion fails.
 581  */
 582 int
 583 archive_wstring_append_from_mbs(struct archive_wstring *dest,
 584     const char *p, size_t len)
 585 {
 586         size_t r;
 587         int ret_val = 0;
 588         /*
 589          * No single byte will be more than one wide character,
 590          * so this length estimate will always be big enough.
 591          */
 592         size_t wcs_length = len;
 593         size_t mbs_length = len;
 594         const char *mbs = p;
 595         wchar_t *wcs;
 596 #if HAVE_MBRTOWC
 597         mbstate_t shift_state;
 598
 599         memset(&shift_state, 0, sizeof(shift_state));
 600 #endif
 601         if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1))
 602                 return (-1);
 603         wcs = dest->s + dest->length;
 604         /*
 605          * We cannot use mbsrtowcs/mbstowcs here because those may convert
 606          * extra MBS when strlen(p) > len and one wide character consists of
 607          * multi bytes.
 608          */
 609         while (*mbs && mbs_length > 0) {
 610                 if (wcs_length == 0) {
 611                         dest->length = wcs - dest->s;
 612                         dest->s[dest->length] = L'\0';
 613                         wcs_length = mbs_length;
 614                         if (NULL == archive_wstring_ensure(dest,
 615                             dest->length + wcs_length + 1))
 616                                 return (-1);
 617                         wcs = dest->s + dest->length;
 618                 }
 619 #if HAVE_MBRTOWC
 620                 r = mbrtowc(wcs, mbs, wcs_length, &shift_state);
 621 #else
 622                 r = mbtowc(wcs, mbs, wcs_length);
 623 #endif
 624                 if (r == (size_t)-1 || r == (size_t)-2) {
 625                         ret_val = -1;
 626                         if (errno == EILSEQ) {
 627                                 ++mbs;
 628                                 --mbs_length;
 629                                 continue;
 630                         } else
 631                                 break;
 632                 }
 633                 if (r == 0 || r > mbs_length)
 634                         break;
 635                 wcs++;
 636                 wcs_length--;
 637                 mbs += r;
 638                 mbs_length -= r;
 639         }
 640         dest->length = wcs - dest->s;
 641         dest->s[dest->length] = L'\0';
 642         return (ret_val);
 643 }
 644
 645 #endif
 646
 647 #if defined(_WIN32) && !defined(__CYGWIN__)
 648
 649 /*
 650  * WCS ==> MBS.
 651  * Note: returns -1 if conversion fails.
 652  *
 653  * Win32 builds use WideCharToMultiByte from the Windows API.
 654  * (Maybe Cygwin should too?  WideCharToMultiByte will know a
 655  * lot more about local character encodings than the wcrtomb()
 656  * wrapper is going to know.)
 657  */
 658 int
 659 archive_string_append_from_wcs(struct archive_string *as,
 660     const wchar_t *w, size_t len)
 661 {
 662         return archive_string_append_from_wcs_in_codepage(as, w, len, NULL);
 663 }
 664
 665 static int
 666 archive_string_append_from_wcs_in_codepage(struct archive_string *as,
 667     const wchar_t *ws, size_t len, struct archive_string_conv *sc)
 668 {
 669         BOOL defchar_used, *dp;
 670         int count, ret = 0;
 671         UINT to_cp;
 672         int wslen = (int)len;
 673
 674         if (sc != NULL)
 675                 to_cp = sc->to_cp;
 676         else
 677                 to_cp = get_current_codepage();
 678
 679         if (to_cp == CP_C_LOCALE) {
 680                 /*
 681                  * "C" locale special process.
 682                  */
 683                 const wchar_t *wp = ws;
 684                 char *p;
 685
 686                 if (NULL == archive_string_ensure(as,
 687                     as->length + wslen +1))
 688                         return (-1);
 689                 p = as->s + as->length;
 690                 count = 0;
 691                 defchar_used = 0;
 692                 while (count < wslen && *wp) {
 693                         if (*wp > 255) {
 694                                 *p++ = '?';
 695                                 wp++;
 696                                 defchar_used = 1;
 697                         } else
 698                                 *p++ = (char)*wp++;
 699                         count++;
 700                 }
 701         } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) {
 702                 uint16_t *u16;
 703
 704                 if (NULL ==
 705                     archive_string_ensure(as, as->length + len * 2 + 2))
 706                         return (-1);
 707                 u16 = (uint16_t *)(as->s + as->length);
 708                 count = 0;
 709                 defchar_used = 0;
 710                 if (sc->flag & SCONV_TO_UTF16BE) {
 711                         while (count < (int)len && *ws) {
 712                                 archive_be16enc(u16+count, *ws);
 713                                 ws++;
 714                                 count++;
 715                         }
 716                 } else {
 717                         while (count < (int)len && *ws) {
 718                                 archive_le16enc(u16+count, *ws);
 719                                 ws++;
 720                                 count++;
 721                         }
 722                 }
 723                 count <<= 1; /* to be byte size */
 724         } else {
 725                 /* Make sure the MBS buffer has plenty to set. */
 726                 if (NULL ==
 727                     archive_string_ensure(as, as->length + len * 2 + 1))
 728                         return (-1);
 729                 do {
 730                         defchar_used = 0;
 731                         if (to_cp == CP_UTF8 || sc == NULL)
 732                                 dp = NULL;
 733                         else
 734                                 dp = &defchar_used;
 735                         count = WideCharToMultiByte(to_cp, 0, ws, wslen,
 736                             as->s + as->length, (int)as->buffer_length-1, NULL, dp);
 737                         if (count == 0 &&
 738                             GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 739                                 /* Expand the MBS buffer and retry. */
 740                                 if (NULL == archive_string_ensure(as,
 741                                         as->buffer_length + len))
 742                                         return (-1);
 743                                 continue;
 744                         }
 745                         if (count == 0)
 746                                 ret = -1;
 747                         break;
 748                 } while (1);
 749         }
 750         as->length += count;
 751         as->s[as->length] = '\0';
 752         return (defchar_used?-1:ret);
 753 }
 754
 755 #elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB)
 756
 757 /*
 758  * Translates a wide character string into current locale character set
 759  * and appends to the archive_string.  Note: returns -1 if conversion
 760  * fails.
 761  */
 762 int
 763 archive_string_append_from_wcs(struct archive_string *as,
 764     const wchar_t *w, size_t len)
 765 {
 766         /* We cannot use the standard wcstombs() here because it
 767          * cannot tell us how big the output buffer should be.  So
 768          * I've built a loop around wcrtomb() or wctomb() that
 769          * converts a character at a time and resizes the string as
 770          * needed.  We prefer wcrtomb() when it's available because
 771          * it's thread-safe. */
 772         int n, ret_val = 0;
 773         char *p;
 774         char *end;
 775 #if HAVE_WCRTOMB
 776         mbstate_t shift_state;
 777
 778         memset(&shift_state, 0, sizeof(shift_state));
 779 #else
 780         /* Clear the shift state before starting. */
 781         wctomb(NULL, L'\0');
 782 #endif
 783         /*
 784          * Allocate buffer for MBS.
 785          * We need this allocation here since it is possible that
 786          * as->s is still NULL.
 787          */
 788         if (archive_string_ensure(as, as->length + len + 1) == NULL)
 789                 return (-1);
 790
 791         p = as->s + as->length;
 792         end = as->s + as->buffer_length - MB_CUR_MAX -1;
 793         while (*w != L'\0' && len > 0) {
 794                 if (p >= end) {
 795                         as->length = p - as->s;
 796                         as->s[as->length] = '\0';
 797                         /* Re-allocate buffer for MBS. */
 798                         if (archive_string_ensure(as,
 799                             as->length + len * 2 + 1) == NULL)
 800                                 return (-1);
 801                         p = as->s + as->length;
 802                         end = as->s + as->buffer_length - MB_CUR_MAX -1;
 803                 }
 804 #if HAVE_WCRTOMB
 805                 n = wcrtomb(p, *w++, &shift_state);
 806 #else
 807                 n = wctomb(p, *w++);
 808 #endif
 809                 if (n == -1) {
 810                         if (errno == EILSEQ) {
 811                                 /* Skip an illegal wide char. */
 812                                 *p++ = '?';
 813                                 ret_val = -1;
 814                         } else {
 815                                 ret_val = -1;
 816                                 break;
 817                         }
 818                 } else
 819                         p += n;
 820                 len--;
 821         }
 822         as->length = p - as->s;
 823         as->s[as->length] = '\0';
 824         return (ret_val);
 825 }
 826
 827 #else /* HAVE_WCTOMB || HAVE_WCRTOMB */
 828
 829 /*
 830  * TODO: Test if __STDC_ISO_10646__ is defined.
 831  * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion
 832  * one character at a time.  If a non-Windows platform doesn't have
 833  * either of these, fall back to the built-in UTF8 conversion.
 834  */
 835 int
 836 archive_string_append_from_wcs(struct archive_string *as,
 837     const wchar_t *w, size_t len)
 838 {
 839         (void)as;/* UNUSED */
 840         (void)w;/* UNUSED */
 841         (void)len;/* UNUSED */
 842         errno = ENOSYS;
 843         return (-1);
 844 }
 845
 846 #endif /* HAVE_WCTOMB || HAVE_WCRTOMB */
 847
 848 /*
 849  * Find a string conversion object by a pair of 'from' charset name
 850  * and 'to' charset name from an archive object.
 851  * Return NULL if not found.
 852  */
 853 static struct archive_string_conv *
 854 find_sconv_object(struct archive *a, const char *fc, const char *tc)
 855 {
 856         struct archive_string_conv *sc;
 857
 858         if (a == NULL)
 859                 return (NULL);
 860
 861         for (sc = a->sconv; sc != NULL; sc = sc->next) {
 862                 if (strcmp(sc->from_charset, fc) == 0 &&
 863                     strcmp(sc->to_charset, tc) == 0)
 864                         break;
 865         }
 866         return (sc);
 867 }
 868
 869 /*
 870  * Register a string object to an archive object.
 871  */
 872 static void
 873 add_sconv_object(struct archive *a, struct archive_string_conv *sc)
 874 {
 875         struct archive_string_conv **psc;
 876
 877         /* Add a new sconv to sconv list. */
 878         psc = &(a->sconv);
 879         while (*psc != NULL)
 880                 psc = &((*psc)->next);
 881         *psc = sc;
 882 }
 883
 884 static void
 885 add_converter(struct archive_string_conv *sc, int (*converter)
 886     (struct archive_string *, const void *, size_t,
 887      struct archive_string_conv *))
 888 {
 889         if (sc == NULL || sc->nconverter >= 2)
 890                 __archive_errx(1, "Programing error");
 891         sc->converter[sc->nconverter++] = converter;
 892 }
 893
 894 static void
 895 setup_converter(struct archive_string_conv *sc)
 896 {
 897
 898         /* Reset. */
 899         sc->nconverter = 0;
 900
 901         /*
 902          * Perform special sequence for the incorrect UTF-8 filenames
 903          * made by libarchive2.x.
 904          */
 905         if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) {
 906                 add_converter(sc, strncat_from_utf8_libarchive2);
 907                 return;
 908         }
 909
 910         /*
 911          * Convert a string to UTF-16BE/LE.
 912          */
 913         if (sc->flag & SCONV_TO_UTF16) {
 914                 /*
 915                  * If the current locale is UTF-8, we can translate
 916                  * a UTF-8 string into a UTF-16BE string.
 917                  */
 918                 if (sc->flag & SCONV_FROM_UTF8) {
 919                         add_converter(sc, archive_string_append_unicode);
 920                         return;
 921                 }
 922
 923 #if defined(_WIN32) && !defined(__CYGWIN__)
 924                 if (sc->flag & SCONV_WIN_CP) {
 925                         if (sc->flag & SCONV_TO_UTF16BE)
 926                                 add_converter(sc, win_strncat_to_utf16be);
 927                         else
 928                                 add_converter(sc, win_strncat_to_utf16le);
 929                         return;
 930                 }
 931 #endif
 932
 933 #if defined(HAVE_ICONV)
 934                 if (sc->cd != (iconv_t)-1) {
 935                         add_converter(sc, iconv_strncat_in_locale);
 936                         return;
 937                 }
 938 #endif
 939
 940                 if (sc->flag & SCONV_BEST_EFFORT) {
 941                         if (sc->flag & SCONV_TO_UTF16BE)
 942                                 add_converter(sc,
 943                                         best_effort_strncat_to_utf16be);
 944                         else
 945                                 add_converter(sc,
 946                                         best_effort_strncat_to_utf16le);
 947                 } else
 948                         /* Make sure we have no converter. */
 949                         sc->nconverter = 0;
 950                 return;
 951         }
 952
 953         /*
 954          * Convert a string from UTF-16BE/LE.
 955          */
 956         if (sc->flag & SCONV_FROM_UTF16) {
 957                 /*
 958                  * At least we should normalize a UTF-16BE string.
 959                  */
 960                 if (sc->flag & SCONV_NORMALIZATION_D)
 961                         add_converter(sc,archive_string_normalize_D);
 962                 else if (sc->flag & SCONV_NORMALIZATION_C)
 963                         add_converter(sc, archive_string_normalize_C);
 964
 965                 if (sc->flag & SCONV_TO_UTF8) {
 966                         /*
 967                          * If the current locale is UTF-8, we can translate
 968                          * a UTF-16BE/LE string into a UTF-8 string directly.
 969                          */
 970                         if (!(sc->flag &
 971                             (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
 972                                 add_converter(sc,
 973                                     archive_string_append_unicode);
 974                         return;
 975                 }
 976
 977 #if defined(_WIN32) && !defined(__CYGWIN__)
 978                 if (sc->flag & SCONV_WIN_CP) {
 979                         if (sc->flag & SCONV_FROM_UTF16BE)
 980                                 add_converter(sc, win_strncat_from_utf16be);
 981                         else
 982                                 add_converter(sc, win_strncat_from_utf16le);
 983                         return;
 984                 }
 985 #endif
 986
 987 #if defined(HAVE_ICONV)
 988                 if (sc->cd != (iconv_t)-1) {
 989                         add_converter(sc, iconv_strncat_in_locale);
 990                         return;
 991                 }
 992 #endif
 993
 994                 if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
 995                     == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))
 996                         add_converter(sc, best_effort_strncat_from_utf16be);
 997                 else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
 998                     == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))
 999                         add_converter(sc, best_effort_strncat_from_utf16le);
1000                 else
1001                         /* Make sure we have no converter. */
1002                         sc->nconverter = 0;
1003                 return;
1004         }
1005
1006         if (sc->flag & SCONV_FROM_UTF8) {
1007                 /*
1008                  * At least we should normalize a UTF-8 string.
1009                  */
1010                 if (sc->flag & SCONV_NORMALIZATION_D)
1011                         add_converter(sc,archive_string_normalize_D);
1012                 else if (sc->flag & SCONV_NORMALIZATION_C)
1013                         add_converter(sc, archive_string_normalize_C);
1014
1015                 /*
1016                  * Copy UTF-8 string with a check of CESU-8.
1017                  * Apparently, iconv does not check surrogate pairs in UTF-8
1018                  * when both from-charset and to-charset are UTF-8, and then
1019                  * we use our UTF-8 copy code.
1020                  */
1021                 if (sc->flag & SCONV_TO_UTF8) {
1022                         /*
1023                          * If the current locale is UTF-8, we can translate
1024                          * a UTF-16BE string into a UTF-8 string directly.
1025                          */
1026                         if (!(sc->flag &
1027                             (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))
1028                                 add_converter(sc, strncat_from_utf8_to_utf8);
1029                         return;
1030                 }
1031         }
1032
1033 #if defined(_WIN32) && !defined(__CYGWIN__)
1034         /*
1035          * On Windows we can use Windows API for a string conversion.
1036          */
1037         if (sc->flag & SCONV_WIN_CP) {
1038                 add_converter(sc, strncat_in_codepage);
1039                 return;
1040         }
1041 #endif
1042
1043 #if HAVE_ICONV
1044         if (sc->cd != (iconv_t)-1) {
1045                 add_converter(sc, iconv_strncat_in_locale);
1046                 /*
1047                  * iconv generally does not support UTF-8-MAC and so
1048                  * we have to the output of iconv from NFC to NFD if
1049                  * need.
1050                  */
1051                 if ((sc->flag & SCONV_FROM_CHARSET) &&
1052                     (sc->flag & SCONV_TO_UTF8)) {
1053                         if (sc->flag & SCONV_NORMALIZATION_D)
1054                                 add_converter(sc, archive_string_normalize_D);
1055                 }
1056                 return;
1057         }
1058 #endif
1059
1060         /*
1061          * Try conversion in the best effort or no conversion.
1062          */
1063         if ((sc->flag & SCONV_BEST_EFFORT) || sc->same)
1064                 add_converter(sc, best_effort_strncat_in_locale);
1065         else
1066                 /* Make sure we have no converter. */
1067                 sc->nconverter = 0;
1068 }
1069
/*
 * Return the canonical spelling of a charset name.  Only the names that
 * create_sconv_object() compares against are recognized: UTF-8,
 * UTF-16BE, UTF-16LE (each with or without the hyphen) and CP932.
 * Matching ignores the case of ASCII letters.  Any other name, a NULL
 * or empty name, or a name longer than 15 characters is returned
 * unchanged.
 */
static const char *
canonical_charset_name(const char *charset)
{
        static const struct {
                const char *alias;
                const char *canonical;
        } known[] = {
                { "UTF-8",    "UTF-8" },
                { "UTF8",     "UTF-8" },
                { "UTF-16BE", "UTF-16BE" },
                { "UTF16BE",  "UTF-16BE" },
                { "UTF-16LE", "UTF-16LE" },
                { "UTF16LE",  "UTF-16LE" },
                { "CP932",    "CP932" }
        };
        char upper[16];
        size_t i, n;

        if (charset == NULL || charset[0] == '\0')
                return (charset);
        n = strlen(charset);
        if (n > 15)
                return (charset);

        /* Build an upper-cased copy (ASCII letters only). */
        for (i = 0; i < n; i++) {
                char ch = charset[i];

                if (ch >= 'a' && ch <= 'z')
                        ch -= 'a' - 'A';
                upper[i] = ch;
        }
        upper[n] = '\0';

        for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
                if (strcmp(upper, known[i].alias) == 0)
                        return (known[i].canonical);
        }
        return (charset);
}
1109
1110 /*
1111  * Create a string conversion object.
1112  */
1113 static struct archive_string_conv *
1114 create_sconv_object(const char *fc, const char *tc,
1115     unsigned current_codepage, int flag)
1116 {
1117         struct archive_string_conv *sc;
1118
1119         sc = calloc(1, sizeof(*sc));
1120         if (sc == NULL)
1121                 return (NULL);
1122         sc->next = NULL;
1123         sc->from_charset = strdup(fc);
1124         if (sc->from_charset == NULL) {
1125                 free(sc);
1126                 return (NULL);
1127         }
1128         sc->to_charset = strdup(tc);
1129         if (sc->to_charset == NULL) {
1130                 free(sc->from_charset);
1131                 free(sc);
1132                 return (NULL);
1133         }
1134         archive_string_init(&sc->utftmp);
1135
1136         if (flag & SCONV_TO_CHARSET) {
1137                 /*
1138                  * Convert characters from the current locale charset to
1139                  * a specified charset.
1140                  */
1141                 sc->from_cp = current_codepage;
1142                 sc->to_cp = make_codepage_from_charset(tc);
1143 #if defined(_WIN32) && !defined(__CYGWIN__)
1144                 if (IsValidCodePage(sc->to_cp))
1145                         flag |= SCONV_WIN_CP;
1146 #endif
1147         } else if (flag & SCONV_FROM_CHARSET) {
1148                 /*
1149                  * Convert characters from a specified charset to
1150                  * the current locale charset.
1151                  */
1152                 sc->to_cp = current_codepage;
1153                 sc->from_cp = make_codepage_from_charset(fc);
1154 #if defined(_WIN32) && !defined(__CYGWIN__)
1155                 if (IsValidCodePage(sc->from_cp))
1156                         flag |= SCONV_WIN_CP;
1157 #endif
1158         }
1159
1160         /*
1161          * Check if "from charset" and "to charset" are the same.
1162          */
1163         if (strcmp(fc, tc) == 0 ||
1164             (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp))
1165                 sc->same = 1;
1166         else
1167                 sc->same = 0;
1168
1169         /*
1170          * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE.
1171          */
1172         if (strcmp(tc, "UTF-8") == 0)
1173                 flag |= SCONV_TO_UTF8;
1174         else if (strcmp(tc, "UTF-16BE") == 0)
1175                 flag |= SCONV_TO_UTF16BE;
1176         else if (strcmp(tc, "UTF-16LE") == 0)
1177                 flag |= SCONV_TO_UTF16LE;
1178         if (strcmp(fc, "UTF-8") == 0)
1179                 flag |= SCONV_FROM_UTF8;
1180         else if (strcmp(fc, "UTF-16BE") == 0)
1181                 flag |= SCONV_FROM_UTF16BE;
1182         else if (strcmp(fc, "UTF-16LE") == 0)
1183                 flag |= SCONV_FROM_UTF16LE;
1184 #if defined(_WIN32) && !defined(__CYGWIN__)
1185         if (sc->to_cp == CP_UTF8)
1186                 flag |= SCONV_TO_UTF8;
1187         else if (sc->to_cp == CP_UTF16BE)
1188                 flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP;
1189         else if (sc->to_cp == CP_UTF16LE)
1190                 flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP;
1191         if (sc->from_cp == CP_UTF8)
1192                 flag |= SCONV_FROM_UTF8;
1193         else if (sc->from_cp == CP_UTF16BE)
1194                 flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP;
1195         else if (sc->from_cp == CP_UTF16LE)
1196                 flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP;
1197 #endif
1198
1199         /*
1200          * Set a flag for Unicode NFD. Usually iconv cannot correctly
1201          * handle it. So we have to translate NFD characters to NFC ones
1202          * ourselves before iconv handles. Another reason is to prevent
1203          * that the same sight of two filenames, one is NFC and other
1204          * is NFD, would be in its directory.
1205          * On Mac OS X, although its filesystem layer automatically
1206          * convert filenames to NFD, it would be useful for filename
1207          * comparing to find out the same filenames that we normalize
1208          * that to be NFD ourselves.
1209          */
1210         if ((flag & SCONV_FROM_CHARSET) &&
1211             (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) {
1212 #if defined(__APPLE__)
1213                 if (flag & SCONV_TO_UTF8)
1214                         flag |= SCONV_NORMALIZATION_D;
1215                 else
1216 #endif
1217                         flag |= SCONV_NORMALIZATION_C;
1218         }
1219 #if defined(__APPLE__)
1220         /*
1221          * In case writing an archive file, make sure that a filename
1222          * going to be passed to iconv is a Unicode NFC string since
1223          * a filename in HFS Plus filesystem is a Unicode NFD one and
1224          * iconv cannot handle it with "UTF-8" charset. It is simpler
1225          * than a use of "UTF-8-MAC" charset.
1226          */
1227         if ((flag & SCONV_TO_CHARSET) &&
1228             (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1229             !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
1230                 flag |= SCONV_NORMALIZATION_C;
1231         /*
1232          * In case reading an archive file. make sure that a filename
1233          * will be passed to users is a Unicode NFD string in order to
1234          * correctly compare the filename with other one which comes
1235          * from HFS Plus filesystem.
1236          */
1237         if ((flag & SCONV_FROM_CHARSET) &&
1238            !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1239             (flag & SCONV_TO_UTF8))
1240                 flag |= SCONV_NORMALIZATION_D;
1241 #endif
1242
1243 #if defined(HAVE_ICONV)
1244         sc->cd_w = (iconv_t)-1;
1245         /*
1246          * Create an iconv object.
1247          */
1248         if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) &&
1249             (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) ||
1250             (flag & SCONV_WIN_CP)) {
1251                 /* This case we won't use iconv. */
1252                 sc->cd = (iconv_t)-1;
1253         } else {
1254                 sc->cd = iconv_open(tc, fc);
1255                 if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) {
1256                         /*
1257                          * Unfortunately, all of iconv implements do support
1258                          * "CP932" character-set, so we should use "SJIS"
1259                          * instead if iconv_open failed.
1260                          */
1261                         if (strcmp(tc, "CP932") == 0)
1262                                 sc->cd = iconv_open("SJIS", fc);
1263                         else if (strcmp(fc, "CP932") == 0)
1264                                 sc->cd = iconv_open(tc, "SJIS");
1265                 }
1266 #if defined(_WIN32) && !defined(__CYGWIN__)
1267                 /*
1268                  * archive_mstring on Windows directly convert multi-bytes
1269                  * into archive_wstring in order not to depend on locale
1270                  * so that you can do a I18N programming. This will be
1271                  * used only in archive_mstring_copy_mbs_len_l so far.
1272                  */
1273                 if (flag & SCONV_FROM_CHARSET) {
1274                         sc->cd_w = iconv_open("UTF-8", fc);
1275                         if (sc->cd_w == (iconv_t)-1 &&
1276                             (sc->flag & SCONV_BEST_EFFORT)) {
1277                                 if (strcmp(fc, "CP932") == 0)
1278                                         sc->cd_w = iconv_open("UTF-8", "SJIS");
1279                         }
1280                 }
1281 #endif /* _WIN32 && !__CYGWIN__ */
1282         }
1283 #endif  /* HAVE_ICONV */
1284
1285         sc->flag = flag;
1286
1287         /*
1288          * Set up converters.
1289          */
1290         setup_converter(sc);
1291
1292         return (sc);
1293 }
1294
1295 /*
1296  * Free a string conversion object.
1297  */
1298 static void
1299 free_sconv_object(struct archive_string_conv *sc)
1300 {
1301         free(sc->from_charset);
1302         free(sc->to_charset);
1303         archive_string_free(&sc->utftmp);
1304 #if HAVE_ICONV
1305         if (sc->cd != (iconv_t)-1)
1306                 iconv_close(sc->cd);
1307         if (sc->cd_w != (iconv_t)-1)
1308                 iconv_close(sc->cd_w);
1309 #endif
1310         free(sc);
1311 }
1312
1313 #if defined(_WIN32) && !defined(__CYGWIN__)
/*
 * Parse a string made only of decimal digits into an unsigned value.
 * An empty string yields 0.  Any non-digit character makes the function
 * return -1 converted to unsigned.
 */
static unsigned
my_atoi(const char *p)
{
        unsigned value = 0;

        for (; *p != '\0'; p++) {
                if (*p < '0' || *p > '9')
                        return (-1);
                value = value * 10 + (*p - '0');
        }
        return (value);
}
1329
1330 /*
1331  * Translate Charset name (as used by iconv) into CodePage (as used by Windows)
1332  * Return -1 if failed.
1333  *
1334  * Note: This translation code may be insufficient.
1335  */
1336 static struct charset {
1337         const char *name;
1338         unsigned cp;
1339 } charsets[] = {
1340         /* MUST BE SORTED! */
1341         {"ASCII", 1252},
1342         {"ASMO-708", 708},
1343         {"BIG5", 950},
1344         {"CHINESE", 936},
1345         {"CP367", 1252},
1346         {"CP819", 1252},
1347         {"CP1025", 21025},
1348         {"DOS-720", 720},
1349         {"DOS-862", 862},
1350         {"EUC-CN", 51936},
1351         {"EUC-JP", 51932},
1352         {"EUC-KR", 949},
1353         {"EUCCN", 51936},
1354         {"EUCJP", 51932},
1355         {"EUCKR", 949},
1356         {"GB18030", 54936},
1357         {"GB2312", 936},
1358         {"HEBREW", 1255},
1359         {"HZ-GB-2312", 52936},
1360         {"IBM273", 20273},
1361         {"IBM277", 20277},
1362         {"IBM278", 20278},
1363         {"IBM280", 20280},
1364         {"IBM284", 20284},
1365         {"IBM285", 20285},
1366         {"IBM290", 20290},
1367         {"IBM297", 20297},
1368         {"IBM367", 1252},
1369         {"IBM420", 20420},
1370         {"IBM423", 20423},
1371         {"IBM424", 20424},
1372         {"IBM819", 1252},
1373         {"IBM871", 20871},
1374         {"IBM880", 20880},
1375         {"IBM905", 20905},
1376         {"IBM924", 20924},
1377         {"ISO-8859-1", 28591},
1378         {"ISO-8859-13", 28603},
1379         {"ISO-8859-15", 28605},
1380         {"ISO-8859-2", 28592},
1381         {"ISO-8859-3", 28593},
1382         {"ISO-8859-4", 28594},
1383         {"ISO-8859-5", 28595},
1384         {"ISO-8859-6", 28596},
1385         {"ISO-8859-7", 28597},
1386         {"ISO-8859-8", 28598},
1387         {"ISO-8859-9", 28599},
1388         {"ISO8859-1", 28591},
1389         {"ISO8859-13", 28603},
1390         {"ISO8859-15", 28605},
1391         {"ISO8859-2", 28592},
1392         {"ISO8859-3", 28593},
1393         {"ISO8859-4", 28594},
1394         {"ISO8859-5", 28595},
1395         {"ISO8859-6", 28596},
1396         {"ISO8859-7", 28597},
1397         {"ISO8859-8", 28598},
1398         {"ISO8859-9", 28599},
1399         {"JOHAB", 1361},
1400         {"KOI8-R", 20866},
1401         {"KOI8-U", 21866},
1402         {"KS_C_5601-1987", 949},
1403         {"LATIN1", 1252},
1404         {"LATIN2", 28592},
1405         {"MACINTOSH", 10000},
1406         {"SHIFT-JIS", 932},
1407         {"SHIFT_JIS", 932},
1408         {"SJIS", 932},
1409         {"US", 1252},
1410         {"US-ASCII", 1252},
1411         {"UTF-16", 1200},
1412         {"UTF-16BE", 1201},
1413         {"UTF-16LE", 1200},
1414         {"UTF-8", CP_UTF8},
1415         {"X-EUROPA", 29001},
1416         {"X-MAC-ARABIC", 10004},
1417         {"X-MAC-CE", 10029},
1418         {"X-MAC-CHINESEIMP", 10008},
1419         {"X-MAC-CHINESETRAD", 10002},
1420         {"X-MAC-CROATIAN", 10082},
1421         {"X-MAC-CYRILLIC", 10007},
1422         {"X-MAC-GREEK", 10006},
1423         {"X-MAC-HEBREW", 10005},
1424         {"X-MAC-ICELANDIC", 10079},
1425         {"X-MAC-JAPANESE", 10001},
1426         {"X-MAC-KOREAN", 10003},
1427         {"X-MAC-ROMANIAN", 10010},
1428         {"X-MAC-THAI", 10021},
1429         {"X-MAC-TURKISH", 10081},
1430         {"X-MAC-UKRAINIAN", 10017},
1431 };
1432 static unsigned
1433 make_codepage_from_charset(const char *charset)
1434 {
1435         char cs[16];
1436         char *p;
1437         unsigned cp;
1438         int a, b;
1439
1440         if (charset == NULL || strlen(charset) > 15)
1441                 return -1;
1442
1443         /* Copy name to uppercase. */
1444         p = cs;
1445         while (*charset) {
1446                 char c = *charset++;
1447                 if (c >= 'a' && c <= 'z')
1448                         c -= 'a' - 'A';
1449                 *p++ = c;
1450         }
1451         *p++ = '\0';
1452         cp = -1;
1453
1454         /* Look it up in the table first, so that we can easily
1455          * override CP367, which we map to 1252 instead of 367. */
1456         a = 0;
1457         b = sizeof(charsets)/sizeof(charsets[0]);
1458         while (b > a) {
1459                 int c = (b + a) / 2;
1460                 int r = strcmp(charsets[c].name, cs);
1461                 if (r < 0)
1462                         a = c + 1;
1463                 else if (r > 0)
1464                         b = c;
1465                 else
1466                         return charsets[c].cp;
1467         }
1468
1469         /* If it's not in the table, try to parse it. */
1470         switch (*cs) {
1471         case 'C':
1472                 if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') {
1473                         cp = my_atoi(cs + 2);
1474                 } else if (strcmp(cs, "CP_ACP") == 0)
1475                         cp = get_current_codepage();
1476                 else if (strcmp(cs, "CP_OEMCP") == 0)
1477                         cp = get_current_oemcp();
1478                 break;
1479         case 'I':
1480                 if (cs[1] == 'B' && cs[2] == 'M' &&
1481                     cs[3] >= '0' && cs[3] <= '9') {
1482                         cp = my_atoi(cs + 3);
1483                 }
1484                 break;
1485         case 'W':
1486                 if (strncmp(cs, "WINDOWS-", 8) == 0) {
1487                         cp = my_atoi(cs + 8);
1488                         if (cp != 874 && (cp < 1250 || cp > 1258))
1489                                 cp = -1;/* This may invalid code. */
1490                 }
1491                 break;
1492         }
1493         return (cp);
1494 }
1495
1496 /*
1497  * Return ANSI Code Page of current locale set by setlocale().
1498  */
1499 static unsigned
1500 get_current_codepage(void)
1501 {
1502         char *locale, *p;
1503         unsigned cp;
1504
1505         locale = setlocale(LC_CTYPE, NULL);
1506         if (locale == NULL)
1507                 return (GetACP());
1508         if (locale[0] == 'C' && locale[1] == '\0')
1509                 return (CP_C_LOCALE);
1510         p = strrchr(locale, '.');
1511         if (p == NULL)
1512                 return (GetACP());
1513         cp = my_atoi(p+1);
1514         if (cp <= 0)
1515                 return (GetACP());
1516         return (cp);
1517 }
1518
/*
 * Translation table between Locale Name and ACP/OEMCP.
 *
 * The table is scanned linearly and the first matching row wins, so
 * each locale name is listed only once.  A row whose `acp' is 0
 * terminates the table.
 */
static struct {
        unsigned acp;
        unsigned ocp;
        const char *locale;
} acp_ocp_map[] = {
        {  950,  950, "Chinese_Taiwan" },
        {  936,  936, "Chinese_People's Republic of China" },
        { 1250,  852, "Czech_Czech Republic" },
        { 1252,  850, "Danish_Denmark" },
        { 1252,  850, "Dutch_Netherlands" },
        { 1252,  850, "Dutch_Belgium" },
        { 1252,  437, "English_United States" },
        { 1252,  850, "English_Australia" },
        { 1252,  850, "English_Canada" },
        { 1252,  850, "English_New Zealand" },
        { 1252,  850, "English_United Kingdom" },
        { 1252,  850, "Finnish_Finland" },
        { 1252,  850, "French_France" },
        { 1252,  850, "French_Belgium" },
        { 1252,  850, "French_Canada" },
        { 1252,  850, "French_Switzerland" },
        { 1252,  850, "German_Germany" },
        { 1252,  850, "German_Austria" },
        { 1252,  850, "German_Switzerland" },
        { 1253,  737, "Greek_Greece" },
        { 1250,  852, "Hungarian_Hungary" },
        { 1252,  850, "Icelandic_Iceland" },
        { 1252,  850, "Italian_Italy" },
        { 1252,  850, "Italian_Switzerland" },
        {  932,  932, "Japanese_Japan" },
        {  949,  949, "Korean_Korea" },
        { 1252,  850, "Norwegian (BokmOl)_Norway" },
        { 1252,  850, "Norwegian-Nynorsk_Norway" },
        { 1250,  852, "Polish_Poland" },
        { 1252,  850, "Portuguese_Portugal" },
        { 1252,  850, "Portuguese_Brazil" },
        { 1251,  866, "Russian_Russia" },
        { 1250,  852, "Slovak_Slovakia" },
        { 1252,  850, "Spanish_Spain" },
        { 1252,  850, "Spanish_Mexico" },
        { 1252,  850, "Swedish_Sweden" },
        { 1254,  857, "Turkish_Turkey" },
        { 0, 0, NULL}
};
1570
1571 /*
1572  * Return OEM Code Page of current locale set by setlocale().
1573  */
1574 static unsigned
1575 get_current_oemcp(void)
1576 {
1577         int i;
1578         char *locale, *p;
1579         size_t len;
1580
1581         locale = setlocale(LC_CTYPE, NULL);
1582         if (locale == NULL)
1583                 return (GetOEMCP());
1584         if (locale[0] == 'C' && locale[1] == '\0')
1585                 return (CP_C_LOCALE);
1586
1587         p = strrchr(locale, '.');
1588         if (p == NULL)
1589                 return (GetOEMCP());
1590         len = p - locale;
1591         for (i = 0; acp_ocp_map[i].acp; i++) {
1592                 if (strncmp(acp_ocp_map[i].locale, locale, len) == 0)
1593                         return (acp_ocp_map[i].ocp);
1594         }
1595         return (GetOEMCP());
1596 }
1597 #else
1598
1599 /*
1600  * POSIX platform does not use CodePage.
1601  */
1602
/* No code pages on this platform: always report "unknown" ((unsigned)-1). */
static unsigned
get_current_codepage(void)
{
        const unsigned unknown_cp = (unsigned)-1;

        return (unknown_cp);
}
/*
 * No code pages on this platform: the charset name is ignored and
 * "unknown" ((unsigned)-1) is always returned.
 */
static unsigned
make_codepage_from_charset(const char *charset)
{
        const unsigned unknown_cp = (unsigned)-1;

        (void)charset; /* UNUSED */
        return (unknown_cp);
}
/* No OEM code page on this platform: always report "unknown" ((unsigned)-1). */
static unsigned
get_current_oemcp(void)
{
        const unsigned unknown_cp = (unsigned)-1;

        return (unknown_cp);
}
1619
1620 #endif /* defined(_WIN32) && !defined(__CYGWIN__) */
1621
1622 /*
1623  * Return a string conversion object.
1624  */
1625 static struct archive_string_conv *
1626 get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag)
1627 {
1628         struct archive_string_conv *sc;
1629         unsigned current_codepage;
1630
1631         /* Check if we have made the sconv object. */
1632         sc = find_sconv_object(a, fc, tc);
1633         if (sc != NULL)
1634                 return (sc);
1635
1636         if (a == NULL)
1637                 current_codepage = get_current_codepage();
1638         else
1639                 current_codepage = a->current_codepage;
1640
1641         sc = create_sconv_object(canonical_charset_name(fc),
1642             canonical_charset_name(tc), current_codepage, flag);
1643         if (sc == NULL) {
1644                 if (a != NULL)
1645                         archive_set_error(a, ENOMEM,
1646                             "Could not allocate memory for "
1647                             "a string conversion object");
1648                 return (NULL);
1649         }
1650
1651         /*
1652          * If there is no converter for current string conversion object,
1653          * we cannot handle this conversion.
1654          */
1655         if (sc->nconverter == 0) {
1656                 if (a != NULL) {
1657 #if HAVE_ICONV
1658                         archive_set_error(a, ARCHIVE_ERRNO_MISC,
1659                             "iconv_open failed : Cannot handle ``%s''",
1660                             (flag & SCONV_TO_CHARSET)?tc:fc);
1661 #else
1662                         archive_set_error(a, ARCHIVE_ERRNO_MISC,
1663                             "A character-set conversion not fully supported "
1664                             "on this platform");
1665 #endif
1666                 }
1667                 /* Failed; free a sconv object. */
1668                 free_sconv_object(sc);
1669                 return (NULL);
1670         }
1671
1672         /*
1673          * Success!
1674          */
1675         if (a != NULL)
1676                 add_sconv_object(a, sc);
1677         return (sc);
1678 }
1679
1680 static const char *
1681 get_current_charset(struct archive *a)
1682 {
1683         const char *cur_charset;
1684
1685         if (a == NULL)
1686                 cur_charset = default_iconv_charset("");
1687         else {
1688                 cur_charset = default_iconv_charset(a->current_code);
1689                 if (a->current_code == NULL) {
1690                         a->current_code = strdup(cur_charset);
1691                         a->current_codepage = get_current_codepage();
1692                         a->current_oemcp = get_current_oemcp();
1693                 }
1694         }
1695         return (cur_charset);
1696 }
1697
1698 /*
1699  * Make and Return a string conversion object.
1700  * Return NULL if the platform does not support the specified conversion
1701  * and best_effort is 0.
1702  * If best_effort is set, A string conversion object must be returned
1703  * unless memory allocation for the object fails, but the conversion
1704  * might fail when non-ASCII code is found.
1705  */
1706 struct archive_string_conv *
1707 archive_string_conversion_to_charset(struct archive *a, const char *charset,
1708     int best_effort)
1709 {
1710         int flag = SCONV_TO_CHARSET;
1711
1712         if (best_effort)
1713                 flag |= SCONV_BEST_EFFORT;
1714         return (get_sconv_object(a, get_current_charset(a), charset, flag));
1715 }
1716
1717 struct archive_string_conv *
1718 archive_string_conversion_from_charset(struct archive *a, const char *charset,
1719     int best_effort)
1720 {
1721         int flag = SCONV_FROM_CHARSET;
1722
1723         if (best_effort)
1724                 flag |= SCONV_BEST_EFFORT;
1725         return (get_sconv_object(a, charset, get_current_charset(a), flag));
1726 }
1727
/*
 * archive_string_default_conversion_for_read() and
 * archive_string_default_conversion_for_write() are provided for the
 * Windows platform because other archiver applications use CP_OEMCP with
 * MultiByteToWideChar() and WideCharToMultiByte() for the filenames
 * in tar or zip files, whereas mbstowcs()/wcstombs() (CRT) usually use
 * CP_ACP unless setlocale(LC_ALL, ".OCP") has been called to select
 * CP_OEMCP.
 * So we should make a string conversion between CP_ACP and CP_OEMCP
 * for compatibility.
 */
1737 #if defined(_WIN32) && !defined(__CYGWIN__)
1738 struct archive_string_conv *
1739 archive_string_default_conversion_for_read(struct archive *a)
1740 {
1741         const char *cur_charset = get_current_charset(a);
1742         char oemcp[16];
1743
1744         /* NOTE: a check of cur_charset is unneeded but we need
1745          * that get_current_charset() has been surely called at
1746          * this time whatever C compiler optimized. */
1747         if (cur_charset != NULL &&
1748             (a->current_codepage == CP_C_LOCALE ||
1749              a->current_codepage == a->current_oemcp))
1750                 return (NULL);/* no conversion. */
1751
1752         _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
1753         /* Make sure a null termination must be set. */
1754         oemcp[sizeof(oemcp)-1] = '\0';
1755         return (get_sconv_object(a, oemcp, cur_charset,
1756             SCONV_FROM_CHARSET));
1757 }
1758
1759 struct archive_string_conv *
1760 archive_string_default_conversion_for_write(struct archive *a)
1761 {
1762         const char *cur_charset = get_current_charset(a);
1763         char oemcp[16];
1764
1765         /* NOTE: a check of cur_charset is unneeded but we need
1766          * that get_current_charset() has been surely called at
1767          * this time whatever C compiler optimized. */
1768         if (cur_charset != NULL &&
1769             (a->current_codepage == CP_C_LOCALE ||
1770              a->current_codepage == a->current_oemcp))
1771                 return (NULL);/* no conversion. */
1772
1773         _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);
1774         /* Make sure a null termination must be set. */
1775         oemcp[sizeof(oemcp)-1] = '\0';
1776         return (get_sconv_object(a, cur_charset, oemcp,
1777             SCONV_TO_CHARSET));
1778 }
1779 #else
/* This build has no default read conversion: always returns NULL. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
        struct archive_string_conv *no_conversion = NULL;

        (void)a; /* UNUSED */
        return (no_conversion);
}
1786
/* This build has no default write conversion: always returns NULL. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
        struct archive_string_conv *no_conversion = NULL;

        (void)a; /* UNUSED */
        return (no_conversion);
}
1793 #endif
1794
1795 /*
1796  * Dispose of all character conversion objects in the archive object.
1797  */
1798 void
1799 archive_string_conversion_free(struct archive *a)
1800 {
1801         struct archive_string_conv *sc;
1802         struct archive_string_conv *sc_next;
1803
1804         for (sc = a->sconv; sc != NULL; sc = sc_next) {
1805                 sc_next = sc->next;
1806                 free_sconv_object(sc);
1807         }
1808         a->sconv = NULL;
1809         free(a->current_code);
1810         a->current_code = NULL;
1811 }
1812
1813 /*
1814  * Return a conversion charset name.
1815  */
1816 const char *
1817 archive_string_conversion_charset_name(struct archive_string_conv *sc)
1818 {
1819         if (sc->flag & SCONV_TO_CHARSET)
1820                 return (sc->to_charset);
1821         else
1822                 return (sc->from_charset);
1823 }
1824
1825 /*
1826  * Change the behavior of a string conversion.
1827  */
1828 void
1829 archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)
1830 {
1831         switch (opt) {
1832         /*
1833          * A filename in UTF-8 was made with libarchive 2.x in a wrong
1834          * assumption that wchar_t was Unicode.
1835          * This option enables simulating the assumption in order to read
1836          * that filename correctly.
1837          */
1838         case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:
1839 #if (defined(_WIN32) && !defined(__CYGWIN__)) \
1840          || defined(__STDC_ISO_10646__) || defined(__APPLE__)
1841                 /*
1842                  * Nothing to do for it since wchar_t on these platforms
1843                  * is really Unicode.
1844                  */
1845                 (void)sc; /* UNUSED */
1846 #else
1847                 if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {
1848                         sc->flag |= SCONV_UTF8_LIBARCHIVE_2;
1849                         /* Set up string converters. */
1850                         setup_converter(sc);
1851                 }
1852 #endif
1853                 break;
1854         case SCONV_SET_OPT_NORMALIZATION_C:
1855                 if ((sc->flag & SCONV_NORMALIZATION_C) == 0) {
1856                         sc->flag |= SCONV_NORMALIZATION_C;
1857                         sc->flag &= ~SCONV_NORMALIZATION_D;
1858                         /* Set up string converters. */
1859                         setup_converter(sc);
1860                 }
1861                 break;
1862         case SCONV_SET_OPT_NORMALIZATION_D:
1863 #if defined(HAVE_ICONV)
1864                 /*
1865                  * If iconv will take the string, do not change the
1866                  * setting of the normalization.
1867                  */
1868                 if (!(sc->flag & SCONV_WIN_CP) &&
1869                      (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
1870                     !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
1871                         break;
1872 #endif
1873                 if ((sc->flag & SCONV_NORMALIZATION_D) == 0) {
1874                         sc->flag |= SCONV_NORMALIZATION_D;
1875                         sc->flag &= ~SCONV_NORMALIZATION_C;
1876                         /* Set up string converters. */
1877                         setup_converter(sc);
1878                 }
1879                 break;
1880         default:
1881                 break;
1882         }
1883 }
1884
1885 /*
1886  *
1887  * Copy one archive_string to another in locale conversion.
1888  *
1889  *      archive_strncat_l();
1890  *      archive_strncpy_l();
1891  *
1892  */
1893
/*
 * Return the number of bytes before the first NUL byte in the buffer at
 * _p, looking at no more than n bytes.  Returns 0 when _p is NULL.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
        const char *bytes = (const char *)_p;
        size_t count;

        if (bytes == NULL)
                return (0);

        /* Bounded strlen(): stop at the terminator or after n bytes. */
        for (count = 0; count < n; count++) {
                if (bytes[count] == '\0')
                        break;
        }
        return (count);
}
1913
/*
 * Return the length in bytes of the 16-bit-unit string at _p, counting
 * the 2-byte units that precede the first all-zero unit and examining no
 * more than n/2 units (an odd trailing byte is never looked at).
 * The result is always even.  Returns 0 when _p is NULL.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
        const char *unit = (const char *)_p;
        const size_t max_units = n >> 1;
        size_t units;

        if (unit == NULL)
                return (0);

        for (units = 0; units < max_units; units++, unit += 2) {
                /* A unit whose two bytes are both zero ends the string. */
                if (unit[0] == 0 && unit[1] == 0)
                        break;
        }
        return (units << 1);
}
1934
1935 int
1936 archive_strncpy_l(struct archive_string *as, const void *_p, size_t n,
1937     struct archive_string_conv *sc)
1938 {
1939         as->length = 0;
1940         return (archive_strncat_l(as, _p, n, sc));
1941 }
1942
1943 int
1944 archive_strncat_l(struct archive_string *as, const void *_p, size_t n,
1945     struct archive_string_conv *sc)
1946 {
1947         const void *s;
1948         size_t length = 0;
1949         int i, r = 0, r2;
1950
1951         if (_p != NULL && n > 0) {
1952                 if (sc != NULL && (sc->flag & SCONV_FROM_UTF16))
1953                         length = utf16nbytes(_p, n);
1954                 else
1955                         length = mbsnbytes(_p, n);
1956         }
1957
1958         /* We must allocate memory even if there is no data for conversion
1959          * or copy. This simulates archive_string_append behavior. */
1960         if (length == 0) {
1961                 int tn = 1;
1962                 if (sc != NULL && (sc->flag & SCONV_TO_UTF16))
1963                         tn = 2;
1964                 if (archive_string_ensure(as, as->length + tn) == NULL)
1965                         return (-1);
1966                 as->s[as->length] = 0;
1967                 if (tn == 2)
1968                         as->s[as->length+1] = 0;
1969                 return (0);
1970         }
1971
1972         /*
1973          * If sc is NULL, we just make a copy.
1974          */
1975         if (sc == NULL) {
1976                 if (archive_string_append(as, _p, length) == NULL)
1977                         return (-1);/* No memory */
1978                 return (0);
1979         }
1980
1981         s = _p;
1982         i = 0;
1983         if (sc->nconverter > 1) {
1984                 sc->utftmp.length = 0;
1985                 r2 = sc->converter[0](&(sc->utftmp), s, length, sc);
1986                 if (r2 != 0 && errno == ENOMEM)
1987                         return (r2);
1988                 if (r > r2)
1989                         r = r2;
1990                 s = sc->utftmp.s;
1991                 length = sc->utftmp.length;
1992                 ++i;
1993         }
1994         r2 = sc->converter[i](as, s, length, sc);
1995         if (r > r2)
1996                 r = r2;
1997         return (r);
1998 }
1999
2000 #if HAVE_ICONV
2001
2002 /*
2003  * Return -1 if conversion fails.
2004  */
2005 static int
2006 iconv_strncat_in_locale(struct archive_string *as, const void *_p,
2007     size_t length, struct archive_string_conv *sc)
2008 {
2009         ICONV_CONST char *itp;
2010         size_t remaining;
2011         iconv_t cd;
2012         char *outp;
2013         size_t avail, bs;
2014         int return_value = 0; /* success */
2015         int to_size, from_size;
2016
2017         if (sc->flag & SCONV_TO_UTF16)
2018                 to_size = 2;
2019         else
2020                 to_size = 1;
2021         if (sc->flag & SCONV_FROM_UTF16)
2022                 from_size = 2;
2023         else
2024                 from_size = 1;
2025
2026         if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)
2027                 return (-1);
2028
2029         cd = sc->cd;
2030         itp = (char *)(uintptr_t)_p;
2031         remaining = length;
2032         outp = as->s + as->length;
2033         avail = as->buffer_length - as->length - to_size;
2034         while (remaining >= (size_t)from_size) {
2035                 size_t result = iconv(cd, &itp, &remaining, &outp, &avail);
2036
2037                 if (result != (size_t)-1)
2038                         break; /* Conversion completed. */
2039
2040                 if (errno == EILSEQ || errno == EINVAL) {
2041                         /*
2042                          * If an output charset is UTF-8 or UTF-16BE/LE,
2043                          * unknown character should be U+FFFD
2044                          * (replacement character).
2045                          */
2046                         if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
2047                                 size_t rbytes;
2048                                 if (sc->flag & SCONV_TO_UTF8)
2049                                         rbytes = sizeof(utf8_replacement_char);
2050                                 else
2051                                         rbytes = 2;
2052
2053                                 if (avail < rbytes) {
2054                                         as->length = outp - as->s;
2055                                         bs = as->buffer_length +
2056                                             (remaining * to_size) + rbytes;
2057                                         if (NULL ==
2058                                             archive_string_ensure(as, bs))
2059                                                 return (-1);
2060                                         outp = as->s + as->length;
2061                                         avail = as->buffer_length
2062                                             - as->length - to_size;
2063                                 }
2064                                 if (sc->flag & SCONV_TO_UTF8)
2065                                         memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
2066                                 else if (sc->flag & SCONV_TO_UTF16BE)
2067                                         archive_be16enc(outp, UNICODE_R_CHAR);
2068                                 else
2069                                         archive_le16enc(outp, UNICODE_R_CHAR);
2070                                 outp += rbytes;
2071                                 avail -= rbytes;
2072                         } else {
2073                                 /* Skip the illegal input bytes. */
2074                                 *outp++ = '?';
2075                                 avail--;
2076                         }
2077                         itp += from_size;
2078                         remaining -= from_size;
2079                         return_value = -1; /* failure */
2080                 } else {
2081                         /* E2BIG no output buffer,
2082                          * Increase an output buffer.  */
2083                         as->length = outp - as->s;
2084                         bs = as->buffer_length + remaining * 2;
2085                         if (NULL == archive_string_ensure(as, bs))
2086                                 return (-1);
2087                         outp = as->s + as->length;
2088                         avail = as->buffer_length - as->length - to_size;
2089                 }
2090         }
2091         as->length = outp - as->s;
2092         as->s[as->length] = 0;
2093         if (to_size == 2)
2094                 as->s[as->length+1] = 0;
2095         return (return_value);
2096 }
2097
2098 #endif /* HAVE_ICONV */
2099
2100
2101 #if defined(_WIN32) && !defined(__CYGWIN__)
2102
2103 /*
2104  * Translate a string from a some CodePage to an another CodePage by
2105  * Windows APIs, and copy the result. Return -1 if conversion fails.
2106  */
2107 static int
2108 strncat_in_codepage(struct archive_string *as,
2109     const void *_p, size_t length, struct archive_string_conv *sc)
2110 {
2111         const char *s = (const char *)_p;
2112         struct archive_wstring aws;
2113         size_t l;
2114         int r, saved_flag;
2115
2116         archive_string_init(&aws);
2117         saved_flag = sc->flag;
2118         sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C);
2119         r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc);
2120         sc->flag = saved_flag;
2121         if (r != 0) {
2122                 archive_wstring_free(&aws);
2123                 if (errno != ENOMEM)
2124                         archive_string_append(as, s, length);
2125                 return (-1);
2126         }
2127
2128         l = as->length;
2129         r = archive_string_append_from_wcs_in_codepage(
2130             as, aws.s, aws.length, sc);
2131         if (r != 0 && errno != ENOMEM && l == as->length)
2132                 archive_string_append(as, s, length);
2133         archive_wstring_free(&aws);
2134         return (r);
2135 }
2136
2137 /*
2138  * Test whether MBS ==> WCS is okay.
2139  */
2140 static int
2141 invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
2142 {
2143         const char *p = (const char *)_p;
2144         unsigned codepage;
2145         DWORD mbflag = MB_ERR_INVALID_CHARS;
2146
2147         if (sc->flag & SCONV_FROM_CHARSET)
2148                 codepage = sc->to_cp;
2149         else
2150                 codepage = sc->from_cp;
2151
2152         if (codepage == CP_C_LOCALE)
2153                 return (0);
2154         if (codepage != CP_UTF8)
2155                 mbflag |= MB_PRECOMPOSED;
2156
2157         if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0)
2158                 return (-1); /* Invalid */
2159         return (0); /* Okay */
2160 }
2161
2162 #else
2163
/*
 * Test whether MBS ==> WCS is okay.
 *
 * Decodes the n bytes at _p one multibyte character at a time with
 * mbrtowc() (or mbtowc() when HAVE_MBRTOWC is not set), starting from a
 * cleared shift state.  Decoding stops early at a NUL character.
 *
 * Returns 0 when every character decoded, -1 on an invalid or incomplete
 * sequence.  The sc argument is not used.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
        const char *cursor = (const char *)_p;
        size_t left = n;
        size_t used;
#if HAVE_MBRTOWC
        mbstate_t state;

        memset(&state, 0, sizeof(state));
#else
        /* Clear the shift state before starting. */
        mbtowc(NULL, NULL, 0);
#endif
        (void)sc; /* UNUSED */

        for (; left != 0; cursor += used, left -= used) {
                wchar_t wc;

#if HAVE_MBRTOWC
                used = mbrtowc(&wc, cursor, left, &state);
#else
                used = mbtowc(&wc, cursor, left);
#endif
                if (used == (size_t)-1 || used == (size_t)-2)
                        return (-1);/* Invalid. */
                if (used == 0)
                        break;  /* Reached a NUL character. */
        }
        return (0); /* All okay. */
}
2199
2200 #endif /* defined(_WIN32) && !defined(__CYGWIN__) */
2201
2202 /*
2203  * Basically returns -1 because we cannot make a conversion of charset
2204  * without iconv but in some cases this would return 0.
2205  * Returns 0 if all copied characters are ASCII.
2206  * Returns 0 if both from-locale and to-locale are the same and those
2207  * can be WCS with no error.
2208  */
2209 static int
2210 best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
2211     size_t length, struct archive_string_conv *sc)
2212 {
2213         size_t remaining;
2214         const uint8_t *itp;
2215         int return_value = 0; /* success */
2216
2217         /*
2218          * If both from-locale and to-locale is the same, this makes a copy.
2219          * And then this checks all copied MBS can be WCS if so returns 0.
2220          */
2221         if (sc->same) {
2222                 if (archive_string_append(as, _p, length) == NULL)
2223                         return (-1);/* No memory */
2224                 return (invalid_mbs(_p, length, sc));
2225         }
2226
2227         /*
2228          * If a character is ASCII, this just copies it. If not, this
2229          * assigns '?' character instead but in UTF-8 locale this assigns
2230          * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,
2231          * a Replacement Character in Unicode.
2232          */
2233
2234         remaining = length;
2235         itp = (const uint8_t *)_p;
2236         while (*itp && remaining > 0) {
2237                 if (*itp > 127) {
2238                         // Non-ASCII: Substitute with suitable replacement
2239                         if (sc->flag & SCONV_TO_UTF8) {
2240                                 if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
2241                                         __archive_errx(1, "Out of memory");
2242                                 }
2243                         } else {
2244                                 archive_strappend_char(as, '?');
2245                         }
2246                         return_value = -1;
2247                 } else {
2248                         archive_strappend_char(as, *itp);
2249                 }
2250                 ++itp;
2251         }
2252         return (return_value);
2253 }
2254
2255
2256 /*
2257  * Unicode conversion functions.
2258  *   - UTF-8 <===> UTF-8 in removing surrogate pairs.
2259  *   - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs.
2260  *   - UTF-8 made by libarchive 2.x ===> UTF-8.
2261  *   - UTF-16BE <===> UTF-8.
2262  *
2263  */
2264
/*
 * Utility to convert a single UTF-8 sequence.
 *
 * Decodes the sequence starting at s, looking at no more than n bytes,
 * and stores the resulting code point in *pwc.
 *
 * Usually return used bytes, return used byte in negative value when
 * a unicode character is replaced with U+FFFD.
 * Returns 0 (leaving *pwc untouched) when n is 0 or the first byte is NUL.
 * See also http://unicode.org/review/pr-121.html Public Review Issue #121
 * Recommended Practice for Replacement Characters.
 *
 * Note that surrogate code points (3-byte sequences) are NOT rejected
 * here; only truncated, malformed and overlong sequences and values
 * above UNICODE_MAX are.
 */
static int
_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
        /*
         * Expected sequence length, indexed by the first byte.
         * 0 marks a byte that cannot start a valid sequence: continuation
         * bytes (80-BF), C0/C1 and F5-FF.
         */
        static const char utf8_count[256] = {
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */
                 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */
                 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */
                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */
                 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */
        };
        int ch, i;
        int cnt;
        uint32_t wc;

        /* Sanity check. */
        if (n == 0)
                return (0);
        /*
         * Decode 1-4 bytes depending on the value of the first byte.
         */
        ch = (unsigned char)*s;
        if (ch == 0)
                return (0); /* Standard:  return 0 for end-of-string. */
        cnt = utf8_count[ch];

        /*
         * Invalid sequence or there are not plenty bytes.
         * The sequence is truncated by n: report (negated) the first byte
         * plus however many well-formed continuation bytes follow it
         * within the n available bytes.
         */
        if ((int)n < cnt) {
                cnt = (int)n;
                for (i = 1; i < cnt; i++) {
                        if ((s[i] & 0xc0) != 0x80) {
                                cnt = i;
                                break;
                        }
                }
                goto invalid_sequence;
        }

        /*
         * Make a Unicode code point from a single UTF-8 sequence.
         * In every case below, a byte that is not a continuation byte
         * (10xxxxxx) shortens cnt to the number of bytes before it, so
         * that the offending byte is left for the next call.
         */
        switch (cnt) {
        case 1: /* 1 byte sequence. */
                *pwc = ch & 0x7f;
                return (cnt);
        case 2: /* 2 bytes sequence. */
                if ((s[1] & 0xc0) != 0x80) {
                        cnt = 1;
                        goto invalid_sequence;
                }
                /* No overlong check needed: C0/C1 never reach this case
                 * because their table entry is 0. */
                *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);
                return (cnt);
        case 3: /* 3 bytes sequence. */
                if ((s[1] & 0xc0) != 0x80) {
                        cnt = 1;
                        goto invalid_sequence;
                }
                if ((s[2] & 0xc0) != 0x80) {
                        cnt = 2;
                        goto invalid_sequence;
                }
                wc = ((ch & 0x0f) << 12)
                    | ((s[1] & 0x3f) << 6)
                    | (s[2] & 0x3f);
                /* cnt stays 3 here: the whole sequence is consumed. */
                if (wc < 0x800)
                        goto invalid_sequence;/* Overlong sequence. */
                break;
        case 4: /* 4 bytes sequence. */
                if ((s[1] & 0xc0) != 0x80) {
                        cnt = 1;
                        goto invalid_sequence;
                }
                if ((s[2] & 0xc0) != 0x80) {
                        cnt = 2;
                        goto invalid_sequence;
                }
                if ((s[3] & 0xc0) != 0x80) {
                        cnt = 3;
                        goto invalid_sequence;
                }
                wc = ((ch & 0x07) << 18)
                    | ((s[1] & 0x3f) << 12)
                    | ((s[2] & 0x3f) << 6)
                    | (s[3] & 0x3f);
                /* cnt stays 4 here: the whole sequence is consumed. */
                if (wc < 0x10000)
                        goto invalid_sequence;/* Overlong sequence. */
                break;
        default: /* Others are all invalid sequence. */
                /*
                 * The first byte has table entry 0.  Pick the length the
                 * lead byte would announce (2 for C0/C1, 4-6 for F5-FD,
                 * otherwise 1), clamp it to n, and then consume only the
                 * continuation bytes that actually follow.
                 */
                if (ch == 0xc0 || ch == 0xc1)
                        cnt = 2;
                else if (ch >= 0xf5 && ch <= 0xf7)
                        cnt = 4;
                else if (ch >= 0xf8 && ch <= 0xfb)
                        cnt = 5;
                else if (ch == 0xfc || ch == 0xfd)
                        cnt = 6;
                else
                        cnt = 1;
                if ((int)n < cnt)
                        cnt = (int)n;
                for (i = 1; i < cnt; i++) {
                        if ((s[i] & 0xc0) != 0x80) {
                                cnt = i;
                                break;
                        }
                }
                goto invalid_sequence;
        }

        /* The code point larger than 0x10FFFF is not legal
         * Unicode values. */
        if (wc > UNICODE_MAX)
                goto invalid_sequence;
        /* Correctly gets a Unicode, returns used bytes. */
        *pwc = wc;
        return (cnt);
invalid_sequence:
        *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
        /* Negative count: "this many bytes were consumed, but replaced". */
        return (cnt * -1);
}
2401
/*
 * Convert a single UTF-8 sequence to a code point, like
 * _utf8_to_unicode(), but additionally report a 3-byte sequence that
 * encodes a surrogate code point as invalid by returning -3.
 * In that case *pwc is left as _utf8_to_unicode() stored it.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
        const int used = _utf8_to_unicode(pwc, s, n);

        /* Only a successfully decoded 3-byte sequence can be a surrogate. */
        if (used != 3)
                return (used);
        /* Any of Surrogate pair is not legal Unicode values. */
        return (IS_SURROGATE_PAIR_LA(*pwc) ? -3 : used);
}
2413
/*
 * Combine a high surrogate (uc) and a low surrogate (uc2) into the code
 * point they encode: 0x10000 + (uc - 0xD800) * 0x400 + (uc2 - 0xDC00).
 * The arguments are not validated.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
        const uint32_t high_part = (uc - 0xD800) * 0x400;
        const uint32_t low_part = uc2 - 0xDC00;

        return (high_part + low_part + 0x10000);
}
2423
/*
 * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in
 * removing surrogate pairs.
 *
 * CESU-8: The Compatibility Encoding Scheme for UTF-16.
 *
 * A 3-byte high surrogate immediately followed by a 3-byte low surrogate
 * is decoded as one code point and reported as 6 used bytes.  Any other
 * sequence is decoded exactly as _utf8_to_unicode() does.
 *
 * Usually return used bytes, return used byte in negative value when
 * a unicode character is replaced with U+FFFD.
 */
static int
cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
        uint32_t wc = 0;
        int cnt;

        cnt = _utf8_to_unicode(&wc, s, n);
        if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) {
                uint32_t wc2 = 0;
                /* A low surrogate needs three more bytes after this one. */
                if (n - 3 < 3) {
                        /* Invalid byte sequence. */
                        goto invalid_sequence;
                }
                /*
                 * NOTE(review): cnt is overwritten with the result for the
                 * SECOND sequence, so on failure below the value returned
                 * is derived from the bytes after the high surrogate
                 * (possibly 0 when a NUL follows), not from the 3 bytes of
                 * the high surrogate itself -- confirm that callers expect
                 * this.
                 */
                cnt = _utf8_to_unicode(&wc2, s+3, n-3);
                if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) {
                        /* Invalid byte sequence. */
                        goto invalid_sequence;
                }
                wc = combine_surrogate_pair(wc, wc2);
                cnt = 6;
        } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) {
                /* A low surrogate without a preceding high surrogate. */
                /* Invalid byte sequence. */
                goto invalid_sequence;
        }
        /* Includes the case where _utf8_to_unicode() itself failed: wc is
         * then already the replacement character and cnt is negative. */
        *pwc = wc;
        return (cnt);
invalid_sequence:
        *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
        /* Make the count negative unless it already is (or is zero). */
        if (cnt > 0)
                cnt *= -1;
        return (cnt);
}
2465
2466 /*
2467  * Convert a Unicode code point to a single UTF-8 sequence.
2468  *
2469  * NOTE:This function does not check if the Unicode is legal or not.
2470  * Please you definitely check it before calling this.
2471  */
2472 static size_t
2473 unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
2474 {
2475         char *_p = p;
2476
2477         /* Invalid Unicode char maps to Replacement character */
2478         if (uc > UNICODE_MAX)
2479                 uc = UNICODE_R_CHAR;
2480         /* Translate code point to UTF8 */
2481         if (uc <= 0x7f) {
2482                 if (remaining == 0)
2483                         return (0);
2484                 *p++ = (char)uc;
2485         } else if (uc <= 0x7ff) {
2486                 if (remaining < 2)
2487                         return (0);
2488                 *p++ = 0xc0 | ((uc >> 6) & 0x1f);
2489                 *p++ = 0x80 | (uc & 0x3f);
2490         } else if (uc <= 0xffff) {
2491                 if (remaining < 3)
2492                         return (0);
2493                 *p++ = 0xe0 | ((uc >> 12) & 0x0f);
2494                 *p++ = 0x80 | ((uc >> 6) & 0x3f);
2495                 *p++ = 0x80 | (uc & 0x3f);
2496         } else {
2497                 if (remaining < 4)
2498                         return (0);
2499                 *p++ = 0xf0 | ((uc >> 18) & 0x07);
2500                 *p++ = 0x80 | ((uc >> 12) & 0x3f);
2501                 *p++ = 0x80 | ((uc >> 6) & 0x3f);
2502                 *p++ = 0x80 | (uc & 0x3f);
2503         }
2504         return (p - _p);
2505 }
2506
/*
 * Decode one code point from big-endian UTF-16 input.
 * Thin adapter giving utf16_to_unicode() the three-argument parser
 * signature; see that function for the return value convention.
 */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
        const int big_endian = 1;

        return (utf16_to_unicode(pwc, s, n, big_endian));
}
2512
/*
 * Decode one code point from little-endian UTF-16 input.
 * Thin adapter giving utf16_to_unicode() the three-argument parser
 * signature; see that function for the return value convention.
 */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
        const int big_endian = 0;

        return (utf16_to_unicode(pwc, s, n, big_endian));
}
2518
2519 static int
2520 utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be)
2521 {
2522         const char *utf16 = s;
2523         unsigned uc;
2524
2525         if (n == 0)
2526                 return (0);
2527         if (n == 1) {
2528                 /* set the Replacement Character instead. */
2529                 *pwc = UNICODE_R_CHAR;
2530                 return (-1);
2531         }
2532
2533         if (be)
2534                 uc = archive_be16dec(utf16);
2535         else
2536                 uc = archive_le16dec(utf16);
2537         utf16 += 2;
2538
2539         /* If this is a surrogate pair, assemble the full code point.*/
2540         if (IS_HIGH_SURROGATE_LA(uc)) {
2541                 unsigned uc2;
2542
2543                 if (n >= 4) {
2544                         if (be)
2545                                 uc2 = archive_be16dec(utf16);
2546                         else
2547                                 uc2 = archive_le16dec(utf16);
2548                 } else
2549                         uc2 = 0;
2550                 if (IS_LOW_SURROGATE_LA(uc2)) {
2551                         uc = combine_surrogate_pair(uc, uc2);
2552                         utf16 += 2;
2553                 } else {
2554                         /* Undescribed code point should be U+FFFD
2555                         * (replacement character). */
2556                         *pwc = UNICODE_R_CHAR;
2557                         return (-2);
2558                 }
2559         }
2560
2561         /*
2562          * Surrogate pair values(0xd800 through 0xdfff) are only
2563          * used by UTF-16, so, after above calculation, the code
2564          * must not be surrogate values, and Unicode has no codes
2565          * larger than 0x10ffff. Thus, those are not legal Unicode
2566          * values.
2567          */
2568         if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
2569                 /* Undescribed code point should be U+FFFD
2570                 * (replacement character). */
2571                 *pwc = UNICODE_R_CHAR;
2572                 return (((int)(utf16 - s)) * -1);
2573         }
2574         *pwc = uc;
2575         return ((int)(utf16 - s));
2576 }
2577
/*
 * Encode one code point as big-endian UTF-16 at 'p'.
 * Returns the number of bytes written (2, or 4 for a surrogate pair),
 * or 0 when 'remaining' is too small; nothing is written in that case.
 * The code point is not validated here.
 */
static size_t
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
{
        uint32_t offset;

        /* Values up to 0xffff fit in a single 16-bit unit. */
        if (uc <= 0xffff) {
                if (remaining < 2)
                        return (0);
                archive_be16enc(p, uc);
                return (2);
        }

        /* Anything larger is split into a surrogate pair. */
        if (remaining < 4)
                return (0);
        offset = uc - 0x10000;
        archive_be16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
        archive_be16enc(p + 2, (offset & 0x3ff) + 0xDC00);
        return (4);
}
2599
/*
 * Encode one code point as little-endian UTF-16 at 'p'.
 * Returns the number of bytes written (2, or 4 for a surrogate pair),
 * or 0 when 'remaining' is too small; nothing is written in that case.
 * The code point is not validated here.
 */
static size_t
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
{
        uint32_t offset;

        /* Values up to 0xffff fit in a single 16-bit unit. */
        if (uc <= 0xffff) {
                if (remaining < 2)
                        return (0);
                archive_le16enc(p, uc);
                return (2);
        }

        /* Anything larger is split into a surrogate pair. */
        if (remaining < 4)
                return (0);
        offset = uc - 0x10000;
        archive_le16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
        archive_le16enc(p + 2, (offset & 0x3ff) + 0xDC00);
        return (4);
}
2621
2622 /*
2623  * Copy UTF-8 string in checking surrogate pair.
2624  * If any surrogate pair are found, it would be canonicalized.
2625  */
2626 static int
2627 strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p,
2628     size_t len, struct archive_string_conv *sc)
2629 {
2630         const char *s;
2631         char *p, *endp;
2632         int n, ret = 0;
2633
2634         (void)sc; /* UNUSED */
2635
2636         if (archive_string_ensure(as, as->length + len + 1) == NULL)
2637                 return (-1);
2638
2639         s = (const char *)_p;
2640         p = as->s + as->length;
2641         endp = as->s + as->buffer_length -1;
2642         do {
2643                 uint32_t uc;
2644                 const char *ss = s;
2645                 size_t w;
2646
2647                 /*
2648                  * Forward byte sequence until a conversion of that is needed.
2649                  */
2650                 while ((n = utf8_to_unicode(&uc, s, len)) > 0) {
2651                         s += n;
2652                         len -= n;
2653                 }
2654                 if (ss < s) {
2655                         if (p + (s - ss) > endp) {
2656                                 as->length = p - as->s;
2657                                 if (archive_string_ensure(as,
2658                                     as->buffer_length + len + 1) == NULL)
2659                                         return (-1);
2660                                 p = as->s + as->length;
2661                                 endp = as->s + as->buffer_length -1;
2662                         }
2663
2664                         memcpy(p, ss, s - ss);
2665                         p += s - ss;
2666                 }
2667
2668                 /*
2669                  * If n is negative, current byte sequence needs a replacement.
2670                  */
2671                 if (n < 0) {
2672                         if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) {
2673                                 /* Current byte sequence may be CESU-8. */
2674                                 n = cesu8_to_unicode(&uc, s, len);
2675                         }
2676                         if (n < 0) {
2677                                 ret = -1;
2678                                 n *= -1;/* Use a replaced unicode character. */
2679                         }
2680
2681                         /* Rebuild UTF-8 byte sequence. */
2682                         while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) {
2683                                 as->length = p - as->s;
2684                                 if (archive_string_ensure(as,
2685                                     as->buffer_length + len + 1) == NULL)
2686                                         return (-1);
2687                                 p = as->s + as->length;
2688                                 endp = as->s + as->buffer_length -1;
2689                         }
2690                         p += w;
2691                         s += n;
2692                         len -= n;
2693                 }
2694         } while (n > 0);
2695         as->length = p - as->s;
2696         as->s[as->length] = '\0';
2697         return (ret);
2698 }
2699
2700 static int
2701 archive_string_append_unicode(struct archive_string *as, const void *_p,
2702     size_t len, struct archive_string_conv *sc)
2703 {
2704         const char *s;
2705         char *p, *endp;
2706         uint32_t uc;
2707         size_t w;
2708         int n, ret = 0, ts, tm;
2709         int (*parse)(uint32_t *, const char *, size_t);
2710         size_t (*unparse)(char *, size_t, uint32_t);
2711
2712         if (sc->flag & SCONV_TO_UTF16BE) {
2713                 unparse = unicode_to_utf16be;
2714                 ts = 2;
2715         } else if (sc->flag & SCONV_TO_UTF16LE) {
2716                 unparse = unicode_to_utf16le;
2717                 ts = 2;
2718         } else if (sc->flag & SCONV_TO_UTF8) {
2719                 unparse = unicode_to_utf8;
2720                 ts = 1;
2721         } else {
2722                 /*
2723                  * This case is going to be converted to another
2724                  * character-set through iconv.
2725                  */
2726                 if (sc->flag & SCONV_FROM_UTF16BE) {
2727                         unparse = unicode_to_utf16be;
2728                         ts = 2;
2729                 } else if (sc->flag & SCONV_FROM_UTF16LE) {
2730                         unparse = unicode_to_utf16le;
2731                         ts = 2;
2732                 } else {
2733                         unparse = unicode_to_utf8;
2734                         ts = 1;
2735                 }
2736         }
2737
2738         if (sc->flag & SCONV_FROM_UTF16BE) {
2739                 parse = utf16be_to_unicode;
2740                 tm = 1;
2741         } else if (sc->flag & SCONV_FROM_UTF16LE) {
2742                 parse = utf16le_to_unicode;
2743                 tm = 1;
2744         } else {
2745                 parse = cesu8_to_unicode;
2746                 tm = ts;
2747         }
2748
2749         if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
2750                 return (-1);
2751
2752         s = (const char *)_p;
2753         p = as->s + as->length;
2754         endp = as->s + as->buffer_length - ts;
2755         while ((n = parse(&uc, s, len)) != 0) {
2756                 if (n < 0) {
2757                         /* Use a replaced unicode character. */
2758                         n *= -1;
2759                         ret = -1;
2760                 }
2761                 s += n;
2762                 len -= n;
2763                 while ((w = unparse(p, endp - p, uc)) == 0) {
2764                         /* There is not enough output buffer so
2765                          * we have to expand it. */
2766                         as->length = p - as->s;
2767                         if (archive_string_ensure(as,
2768                             as->buffer_length + len * tm + ts) == NULL)
2769                                 return (-1);
2770                         p = as->s + as->length;
2771                         endp = as->s + as->buffer_length - ts;
2772                 }
2773                 p += w;
2774         }
2775         as->length = p - as->s;
2776         as->s[as->length] = '\0';
2777         if (ts == 2)
2778                 as->s[as->length+1] = '\0';
2779         return (ret);
2780 }
2781
2782 /*
2783  * Constants for Hangul syllable composition.  This information comes from
2784  * Unicode Standard Annex #15  http://unicode.org/reports/tr15/
      *
      * The *BASE values are the first code point of each range and the
      * *COUNT values the number of code points in it, as used by the
      * LIndex/VIndex/TIndex/SIndex range checks of the Form C normalizer.
2785  */
2786 #define HC_SBASE        0xAC00 /* Base of composed syllables (LV/LVT). */
2787 #define HC_LBASE        0x1100 /* Base of "L" code points. */
2788 #define HC_VBASE        0x1161 /* Base of "V" code points. */
2789 #define HC_TBASE        0x11A7 /* Base of "T" code points. */
2790 #define HC_LCOUNT       19
2791 #define HC_VCOUNT       21
2792 #define HC_TCOUNT       28
2793 #define HC_NCOUNT       (HC_VCOUNT * HC_TCOUNT)
2794 #define HC_SCOUNT       (HC_LCOUNT * HC_NCOUNT)
2795
2796 static uint32_t
2797 get_nfc(uint32_t uc, uint32_t uc2)
2798 {
2799         int t, b;
2800
2801         t = 0;
2802         b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1;
2803         while (b >= t) {
2804                 int m = (t + b) / 2;
2805                 if (u_composition_table[m].cp1 < uc)
2806                         t = m + 1;
2807                 else if (u_composition_table[m].cp1 > uc)
2808                         b = m - 1;
2809                 else if (u_composition_table[m].cp2 < uc2)
2810                         t = m + 1;
2811                 else if (u_composition_table[m].cp2 > uc2)
2812                         b = m - 1;
2813                 else
2814                         return (u_composition_table[m].nfc);
2815         }
2816         return (0);
2817 }
2818
     /*
      * Size of the ucx[]/ccx[] scratch arrays used by the Form C normalizer.
      * COLLECT_CPS() stops collecting and sets ret to -1 once this many
      * entries are in use.
      */
2819 #define FDC_MAX 10      /* The maximum number of Following Decomposable
2820                          * Characters. */
2821
2822 /*
2823  * Update first code point.
      * Stores new_uc in the caller's local 'uc' and clears 'ucptr', so that
      * WRITE_UC() re-encodes the value instead of copying input bytes.
2824  */
2825 #define UPDATE_UC(new_uc)       do {            \
2826         uc = new_uc;                            \
2827         ucptr = NULL;                           \
2828 } while (0)
2829
2830 /*
2831  * Replace first code point with second code point.
      * Besides the value itself, the input pointer (ucptr) and the input
      * byte count (n) of the second code point are carried over.
      * Operates on the caller's locals uc, uc2, ucptr, uc2ptr, n and n2.
2832  */
2833 #define REPLACE_UC_WITH_UC2() do {              \
2834         uc = uc2;                               \
2835         ucptr = uc2ptr;                         \
2836         n = n2;                                 \
2837 } while (0)
2838
     /*
      * Grow the output string of the enclosing function.
      * The current write position is saved in as->length, the buffer is
      * enlarged through archive_string_ensure(), and p/endp are recomputed
      * against the new buffer.  NOTE: on allocation failure this executes
      * 'return (-1)' in the function that expands the macro.
      * Operates on the caller's locals as, p, endp, len, tm and ts.
      */
2839 #define EXPAND_BUFFER() do {                    \
2840         as->length = p - as->s;                 \
2841         if (archive_string_ensure(as,           \
2842             as->buffer_length + len * tm + ts) == NULL)\
2843                 return (-1);                    \
2844         p = as->s + as->length;                 \
2845         endp = as->s + as->buffer_length - ts;  \
2846 } while (0)
2847
     /*
      * Encode code point 'uc' at 'p' with the caller's 'unparse' function.
      * A result of 0 from unparse() is treated as "output too small": the
      * buffer is expanded with EXPAND_BUFFER() and the call retried.
      * Afterwards 'p' is advanced by the number of bytes written, which is
      * left in the caller's local 'w'.
      */
2848 #define UNPARSE(p, endp, uc)    do {            \
2849         while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
2850                 EXPAND_BUFFER();                \
2851         }                                       \
2852         p += w;                                 \
2853 } while (0)
2854
2855 /*
2856  * Write first code point.
2857  * If the code point has not been changed from its original code
2858  * (ucptr is non-NULL), this just copies its n bytes from the input buffer.
2859  * If not, this re-encodes it with UNPARSE() and writes the result.
      *
      * The copy handles n from 1 to 4; the switch has no default case, so
      * any other n copies nothing.  ucptr is cleared after a copy.
2860  */
2861 #define WRITE_UC()      do {                    \
2862         if (ucptr) {                            \
2863                 if (p + n > endp)               \
2864                         EXPAND_BUFFER();        \
2865                 switch (n) {                    \
2866                 case 4:                         \
2867                         *p++ = *ucptr++;        \
2868                         /* FALL THROUGH */      \
2869                 case 3:                         \
2870                         *p++ = *ucptr++;        \
2871                         /* FALL THROUGH */      \
2872                 case 2:                         \
2873                         *p++ = *ucptr++;        \
2874                         /* FALL THROUGH */      \
2875                 case 1:                         \
2876                         *p++ = *ucptr;          \
2877                         break;                  \
2878                 }                               \
2879                 ucptr = NULL;                   \
2880         } else {                                \
2881                 UNPARSE(p, endp, uc);           \
2882         }                                       \
2883 } while (0)
2884
2885 /*
2886  * Collect following decomposable code points.
      *
      * Starting at index 'start', parses further code points from the input
      * into ucx[] and records the CCC() value of each in ccx[].
      * (CCC() is defined elsewhere; presumably the canonical combining
      * class -- confirm against its definition.)
      *
      * Collection stops, without consuming the code point, when parse()
      * returns <= 0 or when the previous class 'cl' is >= the new class
      * 'cx' and neither of them is 228.  It also stops when FDC_MAX entries
      * are in use, in which case ret is set to -1.
      *
      * On exit ucx_size holds the number of valid entries, and nx/cx/cl
      * hold the state of the last parse for the caller to inspect.
2887  */
2888 #define COLLECT_CPS(start)      do {            \
2889         int _i;                                 \
2890         for (_i = start; _i < FDC_MAX ; _i++) { \
2891                 nx = parse(&ucx[_i], s, len);   \
2892                 if (nx <= 0)                    \
2893                         break;                  \
2894                 cx = CCC(ucx[_i]);              \
2895                 if (cl >= cx && cl != 228 && cx != 228)\
2896                         break;                  \
2897                 s += nx;                        \
2898                 len -= nx;                      \
2899                 cl = cx;                        \
2900                 ccx[_i] = cx;                   \
2901         }                                       \
2902         if (_i >= FDC_MAX) {                    \
2903                 ret = -1;                       \
2904                 ucx_size = FDC_MAX;             \
2905         } else                                  \
2906                 ucx_size = _i;                  \
2907 } while (0)
2908
2909 /*
2910  * Normalize UTF-8/UTF-16BE/UTF-16LE characters to Form C and append the
      * result to 'as'.
      *
      * The input decoder and output encoder are selected from sc->flag.
      * 'uc' is the pending first code point; it is written out only once
      * it can no longer be composed with the code points that follow it.
      *
      * Returns 0 on success.  Returns -1 when memory could not be allocated,
      * and also (with the output still built and terminated) when an input
      * sequence was replaced by the decoder or COLLECT_CPS() ran into the
      * FDC_MAX limit.
2911  *
2912  * TODO: Convert composition exclusions, which are never converted
2913  * from NFC,NFD,NFKC and NFKD, to Form C.
2914  */
2915 static int
2916 archive_string_normalize_C(struct archive_string *as, const void *_p,
2917     size_t len, struct archive_string_conv *sc)
2918 {
2919         const char *s = (const char *)_p;
2920         char *p, *endp;
2921         uint32_t uc, uc2;
2922         size_t w;
2923         int always_replace, n, n2, ret = 0, spair, ts, tm;
2924         int (*parse)(uint32_t *, const char *, size_t);
2925         size_t (*unparse)(char *, size_t, uint32_t);
2926
             /*
              * Select the output encoder.  always_replace stays 1 unless the
              * input bytes of an unchanged code point may be copied to the
              * output verbatim (same encoding on both sides, or no explicit
              * target encoding).
              */
2927         always_replace = 1;
2928         ts = 1;/* text size. */
2929         if (sc->flag & SCONV_TO_UTF16BE) {
2930                 unparse = unicode_to_utf16be;
2931                 ts = 2;
2932                 if (sc->flag & SCONV_FROM_UTF16BE)
2933                         always_replace = 0;
2934         } else if (sc->flag & SCONV_TO_UTF16LE) {
2935                 unparse = unicode_to_utf16le;
2936                 ts = 2;
2937                 if (sc->flag & SCONV_FROM_UTF16LE)
2938                         always_replace = 0;
2939         } else if (sc->flag & SCONV_TO_UTF8) {
2940                 unparse = unicode_to_utf8;
2941                 if (sc->flag & SCONV_FROM_UTF8)
2942                         always_replace = 0;
2943         } else {
2944                 /*
2945                  * This case is going to be converted to another
2946                  * character-set through iconv.
2947                  */
2948                 always_replace = 0;
2949                 if (sc->flag & SCONV_FROM_UTF16BE) {
2950                         unparse = unicode_to_utf16be;
2951                         ts = 2;
2952                 } else if (sc->flag & SCONV_FROM_UTF16LE) {
2953                         unparse = unicode_to_utf16le;
2954                         ts = 2;
2955                 } else {
2956                         unparse = unicode_to_utf8;
2957                 }
2958         }
2959
             /*
              * Select the input decoder.  spair is the byte count parse()
              * reports for a code point decoded from a surrogate pair.
              */
2960         if (sc->flag & SCONV_FROM_UTF16BE) {
2961                 parse = utf16be_to_unicode;
2962                 tm = 1;
2963                 spair = 4;/* surrogate pair size in UTF-16. */
2964         } else if (sc->flag & SCONV_FROM_UTF16LE) {
2965                 parse = utf16le_to_unicode;
2966                 tm = 1;
2967                 spair = 4;/* surrogate pair size in UTF-16. */
2968         } else {
2969                 parse = cesu8_to_unicode;
2970                 tm = ts;
2971                 spair = 6;/* surrogate pair size in UTF-8. */
2972         }
2973
2974         if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
2975                 return (-1);
2976
2977         p = as->s + as->length;
             /* endp keeps 'ts' bytes in reserve for the terminator. */
2978         endp = as->s + as->buffer_length - ts;
             /* Read first code point; parse() returns 0 at end of input. */
2979         while ((n = parse(&uc, s, len)) != 0) {
2980                 const char *ucptr, *uc2ptr;
2981
2982                 if (n < 0) {
2983                         /* Use a replaced unicode character. */
2984                         UNPARSE(p, endp, uc);
2985                         s += n*-1;
2986                         len -= n*-1;
2987                         ret = -1;
2988                         continue;
2989                 } else if (n == spair || always_replace)
2990                         /* uc is converted from a surrogate pair.
2991                          * this should be treated as a changed code. */
2992                         ucptr = NULL;
2993                 else
                             /* Unchanged: remember where its bytes are. */
2994                         ucptr = s;
2995                 s += n;
2996                 len -= n;
2997
2998                 /* Read second code point. */
2999                 while ((n2 = parse(&uc2, s, len)) > 0) {
3000                         uint32_t ucx[FDC_MAX];
3001                         int ccx[FDC_MAX];
3002                         int cl, cx, i, nx, ucx_size;
3003                         int LIndex,SIndex;
3004                         uint32_t nfc;
3005
3006                         if (n2 == spair || always_replace)
3007                                 /* uc2 is converted from a surrogate pair.
3008                                  * this should be treated as a changed code. */
3009                                 uc2ptr = NULL;
3010                         else
3011                                 uc2ptr = s;
3012                         s += n2;
3013                         len -= n2;
3014
3015                         /*
3016                          * If current second code point is out of decomposable
3017                          * code points, finding compositions is unneeded.
3018                          */
3019                         if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
3020                                 WRITE_UC();
3021                                 REPLACE_UC_WITH_UC2();
3022                                 continue;
3023                         }
3024
3025                         /*
3026                          * Try to combine current code points.
3027                          */
3028                         /*
3029                          * We have to combine Hangul characters according to
3030                          * http://unicode.org/reports/tr15/#Hangul
3031                          */
3032                         if (0 <= (LIndex = uc - HC_LBASE) &&
3033                             LIndex < HC_LCOUNT) {
3034                                 /*
3035                                  * Hangul Composition.
3036                                  * 1. Two current code points are L and V.
3037                                  */
3038                                 int VIndex = uc2 - HC_VBASE;
3039                                 if (0 <= VIndex && VIndex < HC_VCOUNT) {
3040                                         /* Make syllable of form LV. */
3041                                         UPDATE_UC(HC_SBASE +
3042                                             (LIndex * HC_VCOUNT + VIndex) *
3043                                              HC_TCOUNT);
3044                                 } else {
3045                                         WRITE_UC();
3046                                         REPLACE_UC_WITH_UC2();
3047                                 }
3048                                 continue;
3049                         } else if (0 <= (SIndex = uc - HC_SBASE) &&
3050                             SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
3051                                 /*
3052                                  * Hangul Composition.
3053                                  * 2. Two current code points are LV and T.
3054                                  */
3055                                 int TIndex = uc2 - HC_TBASE;
                                     /* TIndex 0 (HC_TBASE itself) is excluded. */
3056                                 if (0 < TIndex && TIndex < HC_TCOUNT) {
3057                                         /* Make syllable of form LVT. */
3058                                         UPDATE_UC(uc + TIndex);
3059                                 } else {
3060                                         WRITE_UC();
3061                                         REPLACE_UC_WITH_UC2();
3062                                 }
3063                                 continue;
3064                         } else if ((nfc = get_nfc(uc, uc2)) != 0) {
3065                                 /* A composition to current code points
3066                                  * is found. */
3067                                 UPDATE_UC(nfc);
3068                                 continue;
3069                         } else if ((cl = CCC(uc2)) == 0) {
3070                                 /* Clearly 'uc2' the second code point is not
3071                                  * a decomposable code. */
3072                                 WRITE_UC();
3073                                 REPLACE_UC_WITH_UC2();
3074                                 continue;
3075                         }
3076
3077                         /*
3078                          * Collect following decomposable code points.
                              * uc2 occupies slot 0; COLLECT_CPS(1) always
                              * runs at least one parse and so sets nx.
3079                          */
3080                         cx = 0;
3081                         ucx[0] = uc2;
3082                         ccx[0] = cl;
3083                         COLLECT_CPS(1);
3084
3085                         /*
3086                          * Find a composed code in the collected code points.
                              * The scan starts at 1 because the pair (uc, ucx[0])
                              * was already rejected by get_nfc() above.
3087                          */
3088                         i = 1;
3089                         while (i < ucx_size) {
3090                                 int j;
3091
3092                                 if ((nfc = get_nfc(uc, ucx[i])) == 0) {
3093                                         i++;
3094                                         continue;
3095                                 }
3096
3097                                 /*
3098                                  * nfc is composed of uc and ucx[i].
3099                                  */
3100                                 UPDATE_UC(nfc);
3101
3102                                 /*
3103                                  * Remove ucx[i] by shifting
3104                                  * following code points.
3105                                  */
3106                                 for (j = i; j+1 < ucx_size; j++) {
3107                                         ucx[j] = ucx[j+1];
3108                                         ccx[j] = ccx[j+1];
3109                                 }
3110                                 ucx_size --;
3111
3112                                 /*
3113                                  * Collect following code points blocked
3114                                  * by ucx[i] the removed code point.
3115                                  */
3116                                 if (ucx_size > 0 && i == ucx_size &&
3117                                     nx > 0 && cx == cl) {
3118                                         cl =  ccx[ucx_size-1];
3119                                         COLLECT_CPS(ucx_size);
3120                                 }
3121                                 /*
3122                                  * Restart finding a composed code with
3123                                  * the updated uc from the top of the
3124                                  * collected code points.
3125                                  */
3126                                 i = 0;
3127                         }
3128
3129                         /*
3130                          * Apparently the current code points are not
3131                          * decomposed characters or already composed.
3132                          */
3133                         WRITE_UC();
3134                         for (i = 0; i < ucx_size; i++)
3135                                 UNPARSE(p, endp, ucx[i]);
3136
3137                         /*
3138                          * Flush out remaining canonical combining characters.
3139                          */
3140                         if (nx > 0 && cx == cl && len > 0) {
3141                                 while ((nx = parse(&ucx[0], s, len))
3142                                     > 0) {
3143                                         cx = CCC(ucx[0]);
3144                                         if (cl > cx)
3145                                                 break;
3146                                         s += nx;
3147                                         len -= nx;
3148                                         cl = cx;
3149                                         UNPARSE(p, endp, ucx[0]);
3150                                 }
3151                         }
                             /* uc was written; go read a new first code point. */
3152                         break;
3153                 }
                     /*
                      * n2 <= 0 means the inner loop ended on a parse result
                      * with uc still pending; n2 > 0 means it ended via the
                      * 'break' above after uc had been written.
                      */
3154                 if (n2 < 0) {
3155                         WRITE_UC();
3156                         /* Use a replaced unicode character. */
3157                         UNPARSE(p, endp, uc2);
3158                         s += n2*-1;
3159                         len -= n2*-1;
3160                         ret = -1;
3161                         continue;
3162                 } else if (n2 == 0) {
                             /* End of input: flush the pending code point. */
3163                         WRITE_UC();
3164                         break;
3165                 }
3166         }
3167         as->length = p - as->s;
3168         as->s[as->length] = '\0';
             /* UTF-16 output needs a two-byte terminator. */
3169         if (ts == 2)
3170                 as->s[as->length+1] = '\0';
3171         return (ret);
3172 }
3173
3174 static int
3175 get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
3176 {
3177         int t, b;
3178
3179         /*
3180          * These are not converted to NFD on Mac OS.
3181          */
3182         if ((uc >= 0x2000 && uc <= 0x2FFF) ||
3183             (uc >= 0xF900 && uc <= 0xFAFF) ||
3184             (uc >= 0x2F800 && uc <= 0x2FAFF))
3185                 return (0);
3186         /*
3187          * Those code points are not converted to NFD on Mac OS.
3188          * I do not know the reason because it is undocumented.
3189          *   NFC        NFD
3190          *   1109A  ==> 11099 110BA
3191          *   1109C  ==> 1109B 110BA
3192          *   110AB  ==> 110A5 110BA
3193          */
3194         if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
3195                 return (0);
3196
3197         t = 0;
3198         b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
3199         while (b >= t) {
3200                 int m = (t + b) / 2;
3201                 if (u_decomposition_table[m].nfc < uc)
3202                         t = m + 1;
3203                 else if (u_decomposition_table[m].nfc > uc)
3204                         b = m - 1;
3205                 else {
3206                         *cp1 = u_decomposition_table[m].cp1;
3207                         *cp2 = u_decomposition_table[m].cp2;
3208                         return (1);
3209                 }
3210         }
3211         return (0);
3212 }
3213
/*
 * Substitute `cp` for the code point currently being processed.
 *
 * Expands to statements that assign `cp` to the local `uc` and reset the
 * local `ucptr` to NULL, so both variables must be in scope wherever the
 * macro is used.  The argument is parenthesized so that any expression
 * can be passed safely.
 */
#define REPLACE_UC_WITH(cp) do {                \
        uc = (cp);                              \
        ucptr = NULL;                           \
} while (0)
3218
3219 /*
3220  * Normalize UTF-8 characters to Form D and copy the result.
3221  */
/*
 * Decompose the input string and append the result to `as`.
 *
 * The input parser is chosen from sc->flag: utf16be_to_unicode() or
 * utf16le_to_unicode() when SCONV_FROM_UTF16BE / SCONV_FROM_UTF16LE is
 * set, otherwise cesu8_to_unicode().  The output writer follows
 * SCONV_TO_UTF16BE, SCONV_TO_UTF16LE or SCONV_TO_UTF8; when none of those
 * is set the writer matches the input encoding.
 *
 * Returns -1 when the initial buffer reservation fails; also returns -1,
 * after converting the rest of the input, when the parser reported a
 * malformed sequence (negative length) anywhere in the input.
 * Otherwise returns 0.
 *
 * NOTE(review): WRITE_UC() and UNPARSE() are macros defined elsewhere in
 * this file; they reference locals of this function by name, so the local
 * names here must not be changed without checking those macros.
 */
3222 static int
3223 archive_string_normalize_D(struct archive_string *as, const void *_p,
3224     size_t len, struct archive_string_conv *sc)
3225 {
3226         const char *s = (const char *)_p;
3227         char *p, *endp;
3228         uint32_t uc, uc2;
3229         size_t w;
3230         int always_replace, n, n2, ret = 0, spair, ts, tm;
3231         int (*parse)(uint32_t *, const char *, size_t);
3232         size_t (*unparse)(char *, size_t, uint32_t);
3233
         /*
          * always_replace forces ucptr to NULL for every code point in the
          * loop below.  It is cleared when the input and output encodings
          * match, or when the result is handed on to iconv.
          */
3234         always_replace = 1;
3235         ts = 1;/* text size. */
3236         if (sc->flag & SCONV_TO_UTF16BE) {
3237                 unparse = unicode_to_utf16be;
3238                 ts = 2;
3239                 if (sc->flag & SCONV_FROM_UTF16BE)
3240                         always_replace = 0;
3241         } else if (sc->flag & SCONV_TO_UTF16LE) {
3242                 unparse = unicode_to_utf16le;
3243                 ts = 2;
3244                 if (sc->flag & SCONV_FROM_UTF16LE)
3245                         always_replace = 0;
3246         } else if (sc->flag & SCONV_TO_UTF8) {
3247                 unparse = unicode_to_utf8;
3248                 if (sc->flag & SCONV_FROM_UTF8)
3249                         always_replace = 0;
3250         } else {
3251                 /*
3252                  * This case is going to be converted to another
3253                  * character-set through iconv.
3254                  */
3255                 always_replace = 0;
3256                 if (sc->flag & SCONV_FROM_UTF16BE) {
3257                         unparse = unicode_to_utf16be;
3258                         ts = 2;
3259                 } else if (sc->flag & SCONV_FROM_UTF16LE) {
3260                         unparse = unicode_to_utf16le;
3261                         ts = 2;
3262                 } else {
3263                         unparse = unicode_to_utf8;
3264                 }
3265         }
3266
         /*
          * Pick the input parser.  tm scales `len` in the initial buffer
          * reservation below; spair is the length the parser reports for a
          * code point that came from a surrogate pair.
          */
3267         if (sc->flag & SCONV_FROM_UTF16BE) {
3268                 parse = utf16be_to_unicode;
3269                 tm = 1;
3270                 spair = 4;/* surrogate pair size in UTF-16. */
3271         } else if (sc->flag & SCONV_FROM_UTF16LE) {
3272                 parse = utf16le_to_unicode;
3273                 tm = 1;
3274                 spair = 4;/* surrogate pair size in UTF-16. */
3275         } else {
3276                 parse = cesu8_to_unicode;
3277                 tm = ts;
3278                 spair = 6;/* surrogate pair size in UTF-8. */
3279         }
3280
3281         if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
3282                 return (-1);
3283
         /*
          * Output starts after the existing content of `as`; endp stops ts
          * bytes short of the buffer end, leaving room for the terminator
          * stored after the loop.
          */
3284         p = as->s + as->length;
3285         endp = as->s + as->buffer_length - ts;
3286         while ((n = parse(&uc, s, len)) != 0) {
3287                 const char *ucptr;
3288                 uint32_t cp1, cp2;
3289                 int SIndex;
                 /*
                  * Code points queued to follow the current base code
                  * point, each stored with its combining class.
                  */
3290                 struct {
3291                         uint32_t uc;
3292                         int ccc;
3293                 } fdc[FDC_MAX];
3294                 int fdi, fdj;
3295                 int ccc;
3296
                 /*
                  * Re-entry point: reached by the goto at the bottom of the
                  * loop with uc/n already holding a parsed code point that
                  * has not been consumed from `s` yet.
                  */
3297 check_first_code:
3298                 if (n < 0) {
3299                         /* Use a replaced unicode character. */
3300                         UNPARSE(p, endp, uc);
3301                         s += n*-1;
3302                         len -= n*-1;
3303                         ret = -1;
3304                         continue;
3305                 } else if (n == spair || always_replace)
3306                         /* uc is converted from a surrogate pair.
3307                          * this should be treated as a changed code. */
3308                         ucptr = NULL;
3309                 else
                         /* Remember where the original bytes of uc start. */
3310                         ucptr = s;
3311                 s += n;
3312                 len -= n;
3313
3314                 /* Hangul Decomposition. */
                 /*
                  * The syllable index is split arithmetically into L, V
                  * and T parts; T is written only when it differs from
                  * HC_TBASE.
                  */
3315                 if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
3316                         int L = HC_LBASE + SIndex / HC_NCOUNT;
3317                         int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
3318                         int T = HC_TBASE + SIndex % HC_TCOUNT;
3319
3320                         REPLACE_UC_WITH(L);
3321                         WRITE_UC();
3322                         REPLACE_UC_WITH(V);
3323                         WRITE_UC();
3324                         if (T != HC_TBASE) {
3325                                 REPLACE_UC_WITH(T);
3326                                 WRITE_UC();
3327                         }
3328                         continue;
3329                 }
                 /*
                  * A code point that IS_DECOMPOSABLE_BLOCK() accepts and
                  * whose combining class is non-zero is written unchanged.
                  */
3330                 if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
3331                         WRITE_UC();
3332                         continue;
3333                 }
3334
                 /*
                  * Decompose uc repeatedly: the second half of each pair is
                  * pushed onto the front of fdc and the first half becomes
                  * the new uc, which is looked up again.
                  */
3335                 fdi = 0;
3336                 while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
3337                         int k;
3338
3339                         for (k = fdi; k > 0; k--)
3340                                 fdc[k] = fdc[k-1];
3341                         fdc[0].ccc = CCC(cp2);
3342                         fdc[0].uc = cp2;
3343                         fdi++;
3344                         REPLACE_UC_WITH(cp1);
3345                 }
3346
3347                 /* Read following code points. */
                 /*
                  * Each following code point with a non-zero combining
                  * class is consumed and inserted before the first queued
                  * entry whose class is larger, so entries of equal class
                  * keep their input order.
                  */
3348                 while ((n2 = parse(&uc2, s, len)) > 0 &&
3349                     (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
3350                         int j, k;
3351
3352                         s += n2;
3353                         len -= n2;
3354                         for (j = 0; j < fdi; j++) {
3355                                 if (fdc[j].ccc > ccc)
3356                                         break;
3357                         }
3358                         if (j < fdi) {
3359                                 for (k = fdi; k > j; k--)
3360                                         fdc[k] = fdc[k-1];
3361                                 fdc[j].ccc = ccc;
3362                                 fdc[j].uc = uc2;
3363                         } else {
3364                                 fdc[fdi].ccc = ccc;
3365                                 fdc[fdi].uc = uc2;
3366                         }
3367                         fdi++;
3368                 }
3369
                 /* Emit the base code point, then the queued ones in order. */
3370                 WRITE_UC();
3371                 for (fdj = 0; fdj < fdi; fdj++) {
3372                         REPLACE_UC_WITH(fdc[fdj].uc);
3373                         WRITE_UC();
3374                 }
3375
                 /*
                  * n2 == 0 means the parser found no further input.
                  * Otherwise uc2 was parsed but not consumed, so it becomes
                  * the next first code point.
                  */
3376                 if (n2 == 0)
3377                         break;
3378                 REPLACE_UC_WITH(uc2);
3379                 n = n2;
3380                 goto check_first_code;
3381         }
         /* Record the new length and terminate with ts zero bytes. */
3382         as->length = p - as->s;
3383         as->s[as->length] = '\0';
3384         if (ts == 2)
3385                 as->s[as->length+1] = '\0';
3386         return (ret);
3387 }
3388
3389 /*
3390  * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
3391  * that WCS is Unicode. It is true for several platforms but some are false.
3392  * And then people who did not use UTF-8 locale on the non Unicode WCS
3393  * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those
3394  * now cannot get right filename from libarchive 3.x and later since we
3395  * fixed the wrong assumption and it is incompatible to older its versions.
3396  * So we provide special option, "compat-2x.x", for resolving it.
3397  * That option enable the string conversion of libarchive 2.x.
3398  *
3399  * Translates the wrong UTF-8 string made by libarchive 2.x into current
3400  * locale character set and appends to the archive_string.
3401  * Note: returns -1 if conversion fails.
3402  */
3403 static int
3404 strncat_from_utf8_libarchive2(struct archive_string *as,
3405     const void *_p, size_t len, struct archive_string_conv *sc)
3406 {
3407         const char *s;
3408         int n;
3409         char *p;
3410         char *end;
3411         uint32_t unicode;
3412 #if HAVE_WCRTOMB
3413         mbstate_t shift_state;
3414
3415         memset(&shift_state, 0, sizeof(shift_state));
3416 #else
3417         /* Clear the shift state before starting. */
3418         wctomb(NULL, L'\0');
3419 #endif
3420         (void)sc; /* UNUSED */
3421         /*
3422          * Allocate buffer for MBS.
3423          * We need this allocation here since it is possible that
3424          * as->s is still NULL.
3425          */
3426         if (archive_string_ensure(as, as->length + len + 1) == NULL)
3427                 return (-1);
3428
3429         s = (const char *)_p;
3430         p = as->s + as->length;
3431         end = as->s + as->buffer_length - MB_CUR_MAX -1;
3432         while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) {
3433                 wchar_t wc;
3434
3435                 if (p >= end) {
3436                         as->length = p - as->s;
3437                         /* Re-allocate buffer for MBS. */
3438                         if (archive_string_ensure(as,
3439                             as->length + len * 2 + 1) == NULL)
3440                                 return (-1);
3441                         p = as->s + as->length;
3442                         end = as->s + as->buffer_length - MB_CUR_MAX -1;
3443                 }
3444
3445                 /*
3446                  * As libarchive 2.x, translates the UTF-8 characters into
3447                  * wide-characters in the assumption that WCS is Unicode.
3448                  */
3449                 if (n < 0) {
3450                         n *= -1;
3451                         wc = L'?';
3452                 } else
3453                         wc = (wchar_t)unicode;
3454
3455                 s += n;
3456                 len -= n;
3457                 /*
3458                  * Translates the wide-character into the current locale MBS.
3459                  */
3460 #if HAVE_WCRTOMB
3461                 n = (int)wcrtomb(p, wc, &shift_state);
3462 #else
3463                 n = (int)wctomb(p, wc);
3464 #endif
3465                 if (n == -1)
3466                         return (-1);
3467                 p += n;
3468         }
3469         as->length = p - as->s;
3470         as->s[as->length] = '\0';
3471         return (0);
3472 }
3473
3474
3475 /*
3476  * Conversion functions between current locale dependent MBS and UTF-16BE.
3477  *   strncat_from_utf16be() : UTF-16BE --> MBS
3478  *   strncat_to_utf16be()   : MBS --> UTF16BE
3479  */
3480
3481 #if defined(_WIN32) && !defined(__CYGWIN__)
3482
3483 /*
3484  * Convert a UTF-16BE/LE string to current locale and copy the result.
3485  * Return -1 if conversion fails.
3486  */
3487 static int
3488 win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes,
3489     struct archive_string_conv *sc, int be)
3490 {
3491         struct archive_string tmp;
3492         const char *u16;
3493         int ll;
3494         BOOL defchar;
3495         char *mbs;
3496         size_t mbs_size, b;
3497         int ret = 0;
3498
3499         bytes &= ~1;
3500         if (archive_string_ensure(as, as->length + bytes +1) == NULL)
3501                 return (-1);
3502
3503         mbs = as->s + as->length;
3504         mbs_size = as->buffer_length - as->length -1;
3505
3506         if (sc->to_cp == CP_C_LOCALE) {
3507                 /*
3508                  * "C" locale special process.
3509                  */
3510                 u16 = _p;
3511                 ll = 0;
3512                 for (b = 0; b < bytes; b += 2) {
3513                         uint16_t val;
3514                         if (be)
3515                                 val = archive_be16dec(u16+b);
3516                         else
3517                                 val = archive_le16dec(u16+b);
3518                         if (val > 255) {
3519                                 *mbs++ = '?';
3520                                 ret = -1;
3521                         } else
3522                                 *mbs++ = (char)(val&0xff);
3523                         ll++;
3524                 }
3525                 as->length += ll;
3526                 as->s[as->length] = '\0';
3527                 return (ret);
3528         }
3529
3530         archive_string_init(&tmp);
3531         if (be) {
3532                 if (is_big_endian()) {
3533                         u16 = _p;
3534                 } else {
3535                         if (archive_string_ensure(&tmp, bytes+2) == NULL)
3536                                 return (-1);
3537                         memcpy(tmp.s, _p, bytes);
3538                         for (b = 0; b < bytes; b += 2) {
3539                                 uint16_t val = archive_be16dec(tmp.s+b);
3540                                 archive_le16enc(tmp.s+b, val);
3541                         }
3542                         u16 = tmp.s;
3543                 }
3544         } else {
3545                 if (!is_big_endian()) {
3546                         u16 = _p;
3547                 } else {
3548                         if (archive_string_ensure(&tmp, bytes+2) == NULL)
3549                                 return (-1);
3550                         memcpy(tmp.s, _p, bytes);
3551                         for (b = 0; b < bytes; b += 2) {
3552                                 uint16_t val = archive_le16dec(tmp.s+b);
3553                                 archive_be16enc(tmp.s+b, val);
3554                         }
3555                         u16 = tmp.s;
3556                 }
3557         }
3558
3559         do {
3560                 defchar = 0;
3561                 ll = WideCharToMultiByte(sc->to_cp, 0,
3562                     (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size,
3563                         NULL, &defchar);
3564                 /* Exit loop if we succeeded */
3565                 if (ll != 0 ||
3566                     GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
3567                         break;
3568                 }
3569                 /* Else expand buffer and loop to try again. */
3570                 ll = WideCharToMultiByte(sc->to_cp, 0,
3571                     (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL);
3572                 if (archive_string_ensure(as, ll +1) == NULL)
3573                         return (-1);
3574                 mbs = as->s + as->length;
3575                 mbs_size = as->buffer_length - as->length -1;
3576         } while (1);
3577         archive_string_free(&tmp);
3578         as->length += ll;
3579         as->s[as->length] = '\0';
3580         if (ll == 0 || defchar)
3581                 ret = -1;
3582         return (ret);
3583 }
3584
/* UTF-16BE --> code page: win_strncat_from_utf16() with big-endian input. */
static int
win_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
        const int big_endian = 1;

        return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3591
/* UTF-16LE --> code page: win_strncat_from_utf16() with little-endian input. */
static int
win_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
        const int big_endian = 0;

        return (win_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3598
/*
 * Host byte-order probe.
 * Returns non-zero when archive_be16dec() applied to the in-memory bytes
 * of the 16-bit value 1 gives 1 back, and zero otherwise.
 */
static int
is_big_endian(void)
{
        uint16_t probe = 1;
        int matches;

        matches = (archive_be16dec(&probe) == 1);
        return (matches);
}
3606
3607 /*
3608  * Convert a current locale string to UTF-16BE/LE and copy the result.
3609  * Return -1 if conversion fails.
3610  */
3611 static int
3612 win_strncat_to_utf16(struct archive_string *as16, const void *_p,
3613     size_t length, struct archive_string_conv *sc, int bigendian)
3614 {
3615         const char *s = (const char *)_p;
3616         char *u16;
3617         size_t count, avail;
3618
3619         if (archive_string_ensure(as16,
3620             as16->length + (length + 1) * 2) == NULL)
3621                 return (-1);
3622
3623         u16 = as16->s + as16->length;
3624         avail = as16->buffer_length - 2;
3625         if (sc->from_cp == CP_C_LOCALE) {
3626                 /*
3627                  * "C" locale special process.
3628                  */
3629                 count = 0;
3630                 while (count < length && *s) {
3631                         if (bigendian)
3632                                 archive_be16enc(u16, *s);
3633                         else
3634                                 archive_le16enc(u16, *s);
3635                         u16 += 2;
3636                         s++;
3637                         count++;
3638                 }
3639                 as16->length += count << 1;
3640                 as16->s[as16->length] = 0;
3641                 as16->s[as16->length+1] = 0;
3642                 return (0);
3643         }
3644         do {
3645                 count = MultiByteToWideChar(sc->from_cp,
3646                     MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1);
3647                 /* Exit loop if we succeeded */
3648                 if (count != 0 ||
3649                     GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
3650                         break;
3651                 }
3652                 /* Expand buffer and try again */
3653                 count = MultiByteToWideChar(sc->from_cp,
3654                     MB_PRECOMPOSED, s, (int)length, NULL, 0);
3655                 if (archive_string_ensure(as16, (count +1) * 2)
3656                     == NULL)
3657                         return (-1);
3658                 u16 = as16->s + as16->length;
3659                 avail = as16->buffer_length - 2;
3660         } while (1);
3661         as16->length += count * 2;
3662         as16->s[as16->length] = 0;
3663         as16->s[as16->length+1] = 0;
3664         if (count == 0)
3665                 return (-1);
3666
3667         if (is_big_endian()) {
3668                 if (!bigendian) {
3669                         while (count > 0) {
3670                                 uint16_t v = archive_be16dec(u16);
3671                                 archive_le16enc(u16, v);
3672                                 u16 += 2;
3673                                 count--;
3674                         }
3675                 }
3676         } else {
3677                 if (bigendian) {
3678                         while (count > 0) {
3679                                 uint16_t v = archive_le16dec(u16);
3680                                 archive_be16enc(u16, v);
3681                                 u16 += 2;
3682                                 count--;
3683                         }
3684                 }
3685         }
3686         return (0);
3687 }
3688
/* Code page --> UTF-16BE: win_strncat_to_utf16() with big-endian output. */
static int
win_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
        const int big_endian = 1;

        return (win_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3695
/* Code page --> UTF-16LE: win_strncat_to_utf16() with little-endian output. */
static int
win_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
        const int big_endian = 0;

        return (win_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3702
3703 #endif /* _WIN32 && !__CYGWIN__ */
3704
3705 /*
3706  * Best-effort conversions.
3707  * Without a suitable iconv we cannot fully handle the UTF-16BE/LE
3708  * character sets, but conversion can still succeed when the string
3709  * consists only of ASCII characters or the current locale is UTF-8.
3710  */
3711
3712 /*
3713  * Convert a UTF-16BE string to current locale and copy the result.
3714  * Return -1 if conversion fails.
3715  */
3716 static int
3717 best_effort_strncat_from_utf16(struct archive_string *as, const void *_p,
3718     size_t bytes, struct archive_string_conv *sc, int be)
3719 {
3720         const char *utf16 = (const char *)_p;
3721         char *mbs;
3722         uint32_t uc;
3723         int n, ret;
3724
3725         (void)sc; /* UNUSED */
3726         /*
3727          * Other case, we should do the best effort.
3728          * If all character are ASCII(<0x7f), we can convert it.
3729          * if not , we set a alternative character and return -1.
3730          */
3731         ret = 0;
3732         if (archive_string_ensure(as, as->length + bytes +1) == NULL)
3733                 return (-1);
3734         mbs = as->s + as->length;
3735
3736         while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) {
3737                 if (n < 0) {
3738                         n *= -1;
3739                         ret =  -1;
3740                 }
3741                 bytes -= n;
3742                 utf16 += n;
3743
3744                 if (uc > 127) {
3745                         /* We cannot handle it. */
3746                         *mbs++ = '?';
3747                         ret =  -1;
3748                 } else
3749                         *mbs++ = (char)uc;
3750         }
3751         as->length = mbs - as->s;
3752         as->s[as->length] = '\0';
3753         return (ret);
3754 }
3755
/* UTF-16BE front end for best_effort_strncat_from_utf16(). */
static int
best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
        const int big_endian = 1;

        return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3762
/* UTF-16LE front end for best_effort_strncat_from_utf16(). */
static int
best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
        const int big_endian = 0;

        return (best_effort_strncat_from_utf16(as, _p, bytes, sc, big_endian));
}
3769
3770 /*
3771  * Convert a current locale string to UTF-16BE/LE and copy the result.
3772  * Return -1 if conversion fails.
3773  */
3774 static int
3775 best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p,
3776     size_t length, struct archive_string_conv *sc, int bigendian)
3777 {
3778         const char *s = (const char *)_p;
3779         char *utf16;
3780         size_t remaining;
3781         int ret;
3782
3783         (void)sc; /* UNUSED */
3784         /*
3785          * Other case, we should do the best effort.
3786          * If all character are ASCII(<0x7f), we can convert it.
3787          * if not , we set a alternative character and return -1.
3788          */
3789         ret = 0;
3790         remaining = length;
3791
3792         if (archive_string_ensure(as16,
3793             as16->length + (length + 1) * 2) == NULL)
3794                 return (-1);
3795
3796         utf16 = as16->s + as16->length;
3797         while (remaining--) {
3798                 unsigned c = *s++;
3799                 if (c > 127) {
3800                         /* We cannot handle it. */
3801                         c = UNICODE_R_CHAR;
3802                         ret = -1;
3803                 }
3804                 if (bigendian)
3805                         archive_be16enc(utf16, c);
3806                 else
3807                         archive_le16enc(utf16, c);
3808                 utf16 += 2;
3809         }
3810         as16->length = utf16 - as16->s;
3811         as16->s[as16->length] = 0;
3812         as16->s[as16->length+1] = 0;
3813         return (ret);
3814 }
3815
/* UTF-16BE front end for best_effort_strncat_to_utf16(). */
static int
best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
        const int big_endian = 1;

        return (best_effort_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3822
/* UTF-16LE front end for best_effort_strncat_to_utf16(). */
static int
best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
        const int big_endian = 0;

        return (best_effort_strncat_to_utf16(as16, _p, length, sc, big_endian));
}
3829
3830
3831 /*
3832  * Multistring operations.
3833  */
3834
3835 void
3836 archive_mstring_clean(struct archive_mstring *aes)
3837 {
3838         archive_wstring_free(&(aes->aes_wcs));
3839         archive_string_free(&(aes->aes_mbs));
3840         archive_string_free(&(aes->aes_utf8));
3841         archive_string_free(&(aes->aes_mbs_in_locale));
3842         aes->aes_set = 0;
3843 }
3844
3845 void
3846 archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src)
3847 {
3848         dest->aes_set = src->aes_set;
3849         archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs));
3850         archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8));
3851         archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs));
3852 }
3853
3854 int
3855 archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
3856   const char **p)
3857 {
3858         struct archive_string_conv *sc;
3859         int r;
3860
3861         /* If we already have a UTF8 form, return that immediately. */
3862         if (aes->aes_set & AES_SET_UTF8) {
3863                 *p = aes->aes_utf8.s;
3864                 return (0);
3865         }
3866
3867         *p = NULL;
3868         if (aes->aes_set & AES_SET_MBS) {
3869                 sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
3870                 if (sc == NULL)
3871                         return (-1);/* Couldn't allocate memory for sc. */
3872                 r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s,
3873                     aes->aes_mbs.length, sc);
3874                 if (a == NULL)
3875                         free_sconv_object(sc);
3876                 if (r == 0) {
3877                         aes->aes_set |= AES_SET_UTF8;
3878                         *p = aes->aes_utf8.s;
3879                         return (0);/* success. */
3880                 } else
3881                         return (-1);/* failure. */
3882         }
3883         return (0);/* success. */
3884 }
3885
3886 int
3887 archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes,
3888     const char **p)
3889 {
3890         int r, ret = 0;
3891
3892         (void)a; /* UNUSED */
3893         /* If we already have an MBS form, return that immediately. */
3894         if (aes->aes_set & AES_SET_MBS) {
3895                 *p = aes->aes_mbs.s;
3896                 return (ret);
3897         }
3898
3899         *p = NULL;
3900         /* If there's a WCS form, try converting with the native locale. */
3901         if (aes->aes_set & AES_SET_WCS) {
3902                 archive_string_empty(&(aes->aes_mbs));
3903                 r = archive_string_append_from_wcs(&(aes->aes_mbs),
3904                     aes->aes_wcs.s, aes->aes_wcs.length);
3905                 *p = aes->aes_mbs.s;
3906                 if (r == 0) {
3907                         aes->aes_set |= AES_SET_MBS;
3908                         return (ret);
3909                 } else
3910                         ret = -1;
3911         }
3912
3913         /*
3914          * Only a UTF-8 form cannot avail because its conversion already
3915          * failed at archive_mstring_update_utf8().
3916          */
3917         return (ret);
3918 }
3919
3920 int
3921 archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
3922     const wchar_t **wp)
3923 {
3924         int r, ret = 0;
3925
3926         (void)a;/* UNUSED */
3927         /* Return WCS form if we already have it. */
3928         if (aes->aes_set & AES_SET_WCS) {
3929                 *wp = aes->aes_wcs.s;
3930                 return (ret);
3931         }
3932
3933         *wp = NULL;
3934         /* Try converting MBS to WCS using native locale. */
3935         if (aes->aes_set & AES_SET_MBS) {
3936                 archive_wstring_empty(&(aes->aes_wcs));
3937                 r = archive_wstring_append_from_mbs(&(aes->aes_wcs),
3938                     aes->aes_mbs.s, aes->aes_mbs.length);
3939                 if (r == 0) {
3940                         aes->aes_set |= AES_SET_WCS;
3941                         *wp = aes->aes_wcs.s;
3942                 } else
3943                         ret = -1;/* failure. */
3944         }
3945         return (ret);
3946 }
3947
3948 int
3949 archive_mstring_get_mbs_l(struct archive_mstring *aes,
3950     const char **p, size_t *length, struct archive_string_conv *sc)
3951 {
3952         int r, ret = 0;
3953
3954 #if defined(_WIN32) && !defined(__CYGWIN__)
3955         /*
3956          * Internationalization programming on Windows must use Wide
3957          * characters because Windows platform cannot make locale UTF-8.
3958          */
3959         if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) {
3960                 archive_string_empty(&(aes->aes_mbs_in_locale));
3961                 r = archive_string_append_from_wcs_in_codepage(
3962                     &(aes->aes_mbs_in_locale), aes->aes_wcs.s,
3963                     aes->aes_wcs.length, sc);
3964                 if (r == 0) {
3965                         *p = aes->aes_mbs_in_locale.s;
3966                         if (length != NULL)
3967                                 *length = aes->aes_mbs_in_locale.length;
3968                         return (0);
3969                 } else if (errno == ENOMEM)
3970                         return (-1);
3971                 else
3972                         ret = -1;
3973         }
3974 #endif
3975
3976         /* If there is not an MBS form but is a WCS form, try converting
3977          * with the native locale to be used for translating it to specified
3978          * character-set. */
3979         if ((aes->aes_set & AES_SET_MBS) == 0 &&
3980             (aes->aes_set & AES_SET_WCS) != 0) {
3981                 archive_string_empty(&(aes->aes_mbs));
3982                 r = archive_string_append_from_wcs(&(aes->aes_mbs),
3983                     aes->aes_wcs.s, aes->aes_wcs.length);
3984                 if (r == 0)
3985                         aes->aes_set |= AES_SET_MBS;
3986                 else if (errno == ENOMEM)
3987                         return (-1);
3988                 else
3989                         ret = -1;
3990         }
3991         /* If we already have an MBS form, use it to be translated to
3992          * specified character-set. */
3993         if (aes->aes_set & AES_SET_MBS) {
3994                 if (sc == NULL) {
3995                         /* Conversion is unneeded. */
3996                         *p = aes->aes_mbs.s;
3997                         if (length != NULL)
3998                                 *length = aes->aes_mbs.length;
3999                         return (0);
4000                 }
4001                 ret = archive_strncpy_l(&(aes->aes_mbs_in_locale),
4002                     aes->aes_mbs.s, aes->aes_mbs.length, sc);
4003                 *p = aes->aes_mbs_in_locale.s;
4004                 if (length != NULL)
4005                         *length = aes->aes_mbs_in_locale.length;
4006         } else {
4007                 *p = NULL;
4008                 if (length != NULL)
4009                         *length = 0;
4010         }
4011         return (ret);
4012 }
4013
4014 int
4015 archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs)
4016 {
4017         if (mbs == NULL) {
4018                 aes->aes_set = 0;
4019                 return (0);
4020         }
4021         return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs)));
4022 }
4023
4024 int
4025 archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs,
4026     size_t len)
4027 {
4028         if (mbs == NULL) {
4029                 aes->aes_set = 0;
4030                 return (0);
4031         }
4032         aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
4033         archive_strncpy(&(aes->aes_mbs), mbs, len);
4034         archive_string_empty(&(aes->aes_utf8));
4035         archive_wstring_empty(&(aes->aes_wcs));
4036         return (0);
4037 }
4038
/*
 * Replace the contents of 'aes' with the NUL-terminated wide string
 * 'wcs'.
 *
 * The length handed to archive_mstring_copy_wcs_len() is wcslen(wcs),
 * or 0 when 'wcs' is NULL; that function's result is returned as is.
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	size_t wlen = 0;

	/* wcslen() must not be called on a NULL pointer. */
	if (wcs != NULL)
		wlen = wcslen(wcs);
	return (archive_mstring_copy_wcs_len(aes, wcs, wlen));
}
4045
4046 int
4047 archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8)
4048 {
4049   if (utf8 == NULL) {
4050     aes->aes_set = 0;
4051   }
4052   aes->aes_set = AES_SET_UTF8;
4053   archive_string_empty(&(aes->aes_mbs));
4054   archive_string_empty(&(aes->aes_wcs));
4055   archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8));
4056   return (int)strlen(utf8);
4057 }
4058
4059 int
4060 archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs,
4061     size_t len)
4062 {
4063         if (wcs == NULL) {
4064                 aes->aes_set = 0;
4065         }
4066         aes->aes_set = AES_SET_WCS; /* Only WCS form set. */
4067         archive_string_empty(&(aes->aes_mbs));
4068         archive_string_empty(&(aes->aes_utf8));
4069         archive_wstrncpy(&(aes->aes_wcs), wcs, len);
4070         return (0);
4071 }
4072
4073 int
4074 archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,
4075     const char *mbs, size_t len, struct archive_string_conv *sc)
4076 {
4077         int r;
4078
4079         if (mbs == NULL) {
4080                 aes->aes_set = 0;
4081                 return (0);
4082         }
4083         archive_string_empty(&(aes->aes_mbs));
4084         archive_wstring_empty(&(aes->aes_wcs));
4085         archive_string_empty(&(aes->aes_utf8));
4086 #if defined(_WIN32) && !defined(__CYGWIN__)
4087         /*
4088          * Internationalization programming on Windows must use Wide
4089          * characters because Windows platform cannot make locale UTF-8.
4090          */
4091         if (sc == NULL) {
4092                 if (archive_string_append(&(aes->aes_mbs),
4093                         mbs, mbsnbytes(mbs, len)) == NULL) {
4094                         aes->aes_set = 0;
4095                         r = -1;
4096                 } else {
4097                         aes->aes_set = AES_SET_MBS;
4098                         r = 0;
4099                 }
4100 #if defined(HAVE_ICONV)
4101         } else if (sc != NULL && sc->cd_w != (iconv_t)-1) {
4102                 /*
4103                  * This case happens only when MultiByteToWideChar() cannot
4104                  * handle sc->from_cp, and we have to iconv in order to
4105                  * translate character-set to wchar_t,UTF-16.
4106                  */
4107                 iconv_t cd = sc->cd;
4108                 unsigned from_cp;
4109                 int flag;
4110
4111                 /*
4112                  * Translate multi-bytes from some character-set to UTF-8.
4113                  */
4114                 sc->cd = sc->cd_w;
4115                 r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc);
4116                 sc->cd = cd;
4117                 if (r != 0) {
4118                         aes->aes_set = 0;
4119                         return (r);
4120                 }
4121                 aes->aes_set = AES_SET_UTF8;
4122
4123                 /*
4124                  * Append the UTF-8 string into wstring.
4125                  */
4126                 flag = sc->flag;
4127                 sc->flag &= ~(SCONV_NORMALIZATION_C
4128                                 | SCONV_TO_UTF16| SCONV_FROM_UTF16);
4129                 from_cp = sc->from_cp;
4130                 sc->from_cp = CP_UTF8;
4131                 r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
4132                         aes->aes_utf8.s, aes->aes_utf8.length, sc);
4133                 sc->flag = flag;
4134                 sc->from_cp = from_cp;
4135                 if (r == 0)
4136                         aes->aes_set |= AES_SET_WCS;
4137 #endif
4138         } else {
4139                 r = archive_wstring_append_from_mbs_in_codepage(
4140                     &(aes->aes_wcs), mbs, len, sc);
4141                 if (r == 0)
4142                         aes->aes_set = AES_SET_WCS;
4143                 else
4144                         aes->aes_set = 0;
4145         }
4146 #else
4147         r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc);
4148         if (r == 0)
4149                 aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */
4150         else
4151                 aes->aes_set = 0;
4152 #endif
4153         return (r);
4154 }
4155
4156 /*
4157  * The 'update' form tries to proactively update all forms of
4158  * this string (WCS and MBS) and returns an error if any of
4159  * them fail.  This is used by the 'pax' handler, for instance,
4160  * to detect and report character-conversion failures early while
4161  * still allowing clients to get potentially useful values from
4162  * the more tolerant lazy conversions.  (get_mbs and get_wcs will
4163  * strive to give the user something useful, so you can get hopefully
4164  * usable values even if some of the character conversions are failing.)
4165  */
4166 int
4167 archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
4168     const char *utf8)
4169 {
4170         struct archive_string_conv *sc;
4171         int r;
4172
4173         if (utf8 == NULL) {
4174                 aes->aes_set = 0;
4175                 return (0); /* Succeeded in clearing everything. */
4176         }
4177
4178         /* Save the UTF8 string. */
4179         archive_strcpy(&(aes->aes_utf8), utf8);
4180
4181         /* Empty the mbs and wcs strings. */
4182         archive_string_empty(&(aes->aes_mbs));
4183         archive_wstring_empty(&(aes->aes_wcs));
4184
4185         aes->aes_set = AES_SET_UTF8;    /* Only UTF8 is set now. */
4186
4187         /* Try converting UTF-8 to MBS, return false on failure. */
4188         sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
4189         if (sc == NULL)
4190                 return (-1);/* Couldn't allocate memory for sc. */
4191         r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
4192         if (a == NULL)
4193                 free_sconv_object(sc);
4194         if (r != 0)
4195                 return (-1);
4196         aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */
4197
4198         /* Try converting MBS to WCS, return false on failure. */
4199         if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
4200             aes->aes_mbs.length))
4201                 return (-1);
4202         aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
4203
4204         /* All conversions succeeded. */
4205         return (0);
4206 }