]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - contrib/libarchive/libarchive/test/test_archive_string_conversion.c
MFC r368207,368607:
[FreeBSD/stable/10.git] / contrib / libarchive / libarchive / test / test_archive_string_conversion.c
1 /*-
2  * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "test.h"
26 __FBSDID("$FreeBSD$");
27
28 #include <locale.h>
29
30 #define __LIBARCHIVE_TEST
31 #include "archive_string.h"
32
33 /*
34 Execute the following to rebuild the data for this program:
35    tail -n +36 test_archive_string_conversion.c | /bin/sh
36 #
37 # This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
38 #
39 if="NormalizationTest.txt"
40 if [ ! -f ${if} ]; then
41   echo "Not found: \"${if}\""
42   exit 0
43 fi
44 of=test_archive_string_conversion.txt.Z
45 echo "\$FreeBSD\$" > ${of}.uu
46 awk -F ';'  '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu
47 exit 1
48 */
49
/*
 * Encode the code point uc as UTF-8 into the buffer at p.
 *
 * One to four bytes are written depending on the magnitude of uc; no
 * terminating NUL is appended.  The caller must provide room for at
 * least four bytes.
 *
 * Returns the number of bytes written.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	int len;

	if (uc <= 0x7f) {
		/* Plain ASCII: a single byte. */
		p[0] = (char)uc;
		len = 1;
	} else if (uc <= 0x7ff) {
		/* 110xxxxx 10xxxxxx */
		p[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
		p[1] = (char)(0x80 | (uc & 0x3f));
		len = 2;
	} else if (uc <= 0xffff) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		p[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
		p[1] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[2] = (char)(0x80 | (uc & 0x3f));
		len = 3;
	} else {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		p[0] = (char)(0xf0 | ((uc >> 18) & 0x07));
		p[1] = (char)(0x80 | ((uc >> 12) & 0x3f));
		p[2] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[3] = (char)(0x80 | (uc & 0x3f));
		len = 4;
	}
	return (len);
}
73
/*
 * Store the 16-bit value u at pp in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[0] = (unsigned char)((u >> 8) & 0xff);
	dst[1] = (unsigned char)(u & 0xff);
}
82
/*
 * Encode the code point uc as UTF-16BE into the buffer at p.
 *
 * Code points up to U+FFFF occupy one 16-bit unit; anything larger is
 * split into a high/low surrogate pair.  No terminator is written.
 *
 * Returns the number of bytes written (2 or 4).
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		archive_be16enc(p, uc);
		return (2);
	}

	/* Beyond the 16-bit range: emit a surrogate pair. */
	offset = uc - 0x10000;
	archive_be16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
100
/*
 * Store the 16-bit value u at pp in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[0] = (unsigned char)(u & 0xff);
	dst[1] = (unsigned char)((u >> 8) & 0xff);
}
109
/*
 * Encode the code point uc as UTF-16LE into the buffer at p.
 *
 * Code points up to U+FFFF occupy one 16-bit unit; anything larger is
 * split into a high/low surrogate pair.  No terminator is written.
 *
 * Returns the number of bytes written (2 or 4).
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		archive_le16enc(p, uc);
		return (2);
	}

	/* Beyond the 16-bit range: emit a surrogate pair. */
	offset = uc - 0x10000;
	archive_le16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
127
/*
 * Report the size of wchar_t, in bytes, on this platform.
 */
static int
wc_size(void)
{
	const int width = sizeof(wchar_t);

	return (width);
}
133
/*
 * Store the code point uc as wide character(s) at wp.
 *
 * When wchar_t is four bytes wide, or uc fits in 16 bits, a single
 * wchar_t is written.  Otherwise uc is written as a surrogate pair.
 * No terminator is written.
 *
 * Returns the number of wchar_t elements written (1 or 2).
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	if (wc_size() != 4 && uc > 0xffff) {
		/* Too large for one narrow wchar_t: use a surrogate pair. */
		uc -= 0x10000;
		wp[0] = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800);
		wp[1] = (wchar_t)((uc & 0x3ff) + 0xDC00);
		return (2);
	}

	wp[0] = (wchar_t)uc;
	return (1);
}
153
/*
 * Parse a pattern of upper-case hexadecimal code points separated by a
 * single non-hex character (e.g. "0041 0301") and write the resulting
 * string in four encodings at once:
 *   out   - UTF-8, NUL terminated
 *   wout  - wide characters, L'\0' terminated
 *   u16be - UTF-16BE, terminated by two zero bytes
 *   u16le - UTF-16LE, terminated by two zero bytes
 * The caller supplies buffers large enough for the whole pattern.
 *
 * When mac_nfd is non-zero, the FIRST code point of the pattern is
 * checked against a list of code points that are expected to stay in
 * their composed form; the function returns 1 if it matches and 0
 * otherwise.  With mac_nfd == 0 the return value is always 0.
 *
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	char *u8_cur = out;
	wchar_t *wc_cur = wout;
	char *be_cur = u16be;
	char *le_cur = u16le;
	const char *src;
	unsigned code = 0;
	int keep_composed = 0;

	for (src = pattern; ; src++) {
		const char ch = *src;

		/* Accumulate hex digits of the current code point. */
		if (ch >= '0' && ch <= '9') {
			code = (code << 4) + (ch - '0');
			continue;
		}
		if (ch >= 'A' && ch <= 'F') {
			code = (code << 4) + (ch - 'A' + 0x0a);
			continue;
		}

		/*
		 * A separator or the end of the pattern: the code point
		 * is complete.  Nothing has been emitted yet only for the
		 * first code point, which is the one we classify.
		 */
		if (mac_nfd && u8_cur == out) {
			switch (code) {
			/*
			 * These are not converted to NFD on Mac OS.
			 * U+2000 - U+2FFF
			 * U+F900 - U+FAFF
			 * U+2F800 - U+2FAFF
			 */
			case 0x2194: case 0x219A: case 0x219B:
			case 0x21AE: case 0x21CD: case 0x21CE:
			case 0x21CF: case 0x2204: case 0x2209:
			case 0x220C: case 0x2224: case 0x2226:
			case 0x2241: case 0x2244: case 0x2247:
			case 0x2249: case 0x2260: case 0x2262:
			case 0x226D: case 0x226E: case 0x226F:
			case 0x2270: case 0x2271: case 0x2274:
			case 0x2275: case 0x2276: case 0x2278:
			case 0x2279: case 0x227A: case 0x227B:
			case 0x2280: case 0x2281: case 0x2284:
			case 0x2285: case 0x2288: case 0x2289:
			case 0x22AC: case 0x22AD: case 0x22AE:
			case 0x22AF: case 0x22E0: case 0x22E1:
			case 0x22E2: case 0x22E3: case 0x22EA:
			case 0x22EB: case 0x22EC: case 0x22ED:
			/*
			 * Those code points are not converted to
			 * NFD on Mac OS. I do not know the reason
			 * because it is undocumented.
			 *   NFC        NFD
			 *   1109A  ==> 11099 110BA
			 *   1109C  ==> 1109B 110BA
			 *   110AB  ==> 110A5 110BA
			 */
			case 0x1109A: case 0x1109C: case 0x110AB:
				keep_composed = 1;
				break;
			default:
				break;
			}
		}

		/* Emit the code point in every encoding. */
		be_cur += unicode_to_utf16be(be_cur, code);
		le_cur += unicode_to_utf16le(le_cur, code);
		wc_cur += unicode_to_wc(wc_cur, code);
		u8_cur += unicode_to_utf8(u8_cur, code);

		if (ch == '\0')
			break;
		code = 0;
	}

	/* Terminate all four outputs. */
	be_cur[0] = 0;
	be_cur[1] = 0;
	le_cur[0] = 0;
	le_cur[1] = 0;
	*wc_cur = L'\0';
	*u8_cur = '\0';

	return (keep_composed);
}
235
/*
 * Tell whether wchar_t strings can be assumed to hold Unicode
 * regardless of the current locale.  This is reported as true only
 * for native Windows builds (not Cygwin).
 */
static int
is_wc_unicode(void)
{
	int native_unicode = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	native_unicode = 1;
#endif
	return (native_unicode);
}
245
246 /*
247  * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
248  * On Mac OS, the characters to be Form D.
249  * On other platforms, the characters to be Form C.
250  */
251 static void
252 test_archive_string_normalization_nfc(const char *testdata)
253 {
254         struct archive *a, *a2;
255         struct archive_string utf8;
256         struct archive_mstring mstr;
257         struct archive_string_conv *f_sconv8, *t_sconv8;
258         struct archive_string_conv *f_sconv16be, *f_sconv16le;
259         FILE *fp;
260         char buff[512];
261         int line = 0;
262         int locale_is_utf8, wc_is_unicode;
263         int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
264
265         locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
266         wc_is_unicode = is_wc_unicode();
267         /* If it doesn't exist, just warn and return. */
268         if (!locale_is_utf8 && !wc_is_unicode) {
269                 skipping("A test of string normalization for NFC requires "
270                     "a suitable locale; en_US.UTF-8 not available on this "
271                     "system");
272                 return;
273         }
274
275         archive_string_init(&utf8);
276         memset(&mstr, 0, sizeof(mstr));
277
278         /*
279          * Create string conversion objects.
280          */
281         assert((a = archive_read_new()) != NULL);
282         assertA(NULL != (f_sconv8 =
283             archive_string_conversion_from_charset(a, "UTF-8", 0)));
284         assertA(NULL != (f_sconv16be =
285             archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
286         assertA(NULL != (f_sconv16le =
287             archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
288         assert((a2 = archive_write_new()) != NULL);
289         assertA(NULL != (t_sconv8 =
290             archive_string_conversion_to_charset(a2, "UTF-8", 0)));
291         if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
292             t_sconv8 == NULL) {
293                 /* We cannot continue this test. */
294                 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
295                 return;
296         }
297         archive_string_conversion_set_opt(f_sconv8, sconv_opt);
298         archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
299         archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
300         archive_string_conversion_set_opt(t_sconv8, sconv_opt);
301
302         /* Open a test pattern file. */
303         assert((fp = fopen(testdata, "r")) != NULL);
304
305         /*
306          * Read test data.
307          *  Test data format:
308          *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
309          *  Unicode pattern format:
310          *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
311          */
312         while (fgets(buff, sizeof(buff), fp) != NULL) {
313                 char nfc[80], nfd[80];
314                 char utf8_nfc[80], utf8_nfd[80];
315                 char utf16be_nfc[80], utf16be_nfd[80];
316                 char utf16le_nfc[80], utf16le_nfd[80];
317                 wchar_t wc_nfc[40], wc_nfd[40];
318                 char *e, *p;
319                 const wchar_t *wp;
320                 const char *mp;
321                 size_t mplen;
322
323                 line++;
324                 if (buff[0] == '#')
325                         continue;
326                 p = strchr(buff, ';');
327                 if (p == NULL)
328                         continue;
329                 *p++ = '\0';
330                 /* Copy an NFC pattern */
331                 strncpy(nfc, buff, sizeof(nfc)-1);
332                 nfc[sizeof(nfc)-1] = '\0';
333                 e = p;
334                 p = strchr(p, '\n');
335                 if (p == NULL)
336                         continue;
337                 *p = '\0';
338                 /* Copy an NFD pattern */
339                 strncpy(nfd, e, sizeof(nfd)-1);
340                 nfd[sizeof(nfd)-1] = '\0';
341
342                 /*
343                  * Get an NFC patterns.
344                  */
345                 scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
346                     nfc, 0);
347
348                 /*
349                  * Get an NFD patterns.
350                  */
351                 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
352                     nfd, 0);
353
354                 if (locale_is_utf8) {
355                         /*
356                          * Normalize an NFD string for import.
357                          */
358                         assertEqualInt(0, archive_strcpy_l(
359                             &utf8, utf8_nfd, f_sconv8));
360                         failure("NFD(%s) should be converted to NFC(%s):%d",
361                             nfd, nfc, line);
362                         assertEqualUTF8String(utf8_nfc, utf8.s);
363
364                         /*
365                          * Normalize an NFC string for import.
366                          */
367                         assertEqualInt(0, archive_strcpy_l(
368                             &utf8, utf8_nfc, f_sconv8));
369                         failure("NFC(%s) should not be any changed:%d",
370                             nfc, line);
371                         assertEqualUTF8String(utf8_nfc, utf8.s);
372
373                         /*
374                          * Copy an NFC string for export.
375                          */
376                         assertEqualInt(0, archive_strcpy_l(
377                             &utf8, utf8_nfc, t_sconv8));
378                         failure("NFC(%s) should not be any changed:%d",
379                             nfc, line);
380                         assertEqualUTF8String(utf8_nfc, utf8.s);
381
382                         /*
383                          * Normalize an NFD string in UTF-16BE for import.
384                          */
385                         assertEqualInt(0, archive_strncpy_l(
386                             &utf8, utf16be_nfd, 100000, f_sconv16be));
387                         failure("NFD(%s) should be converted to NFC(%s):%d",
388                             nfd, nfc, line);
389                         assertEqualUTF8String(utf8_nfc, utf8.s);
390
391                         /*
392                          * Normalize an NFD string in UTF-16LE for import.
393                          */
394                         assertEqualInt(0, archive_strncpy_l(
395                             &utf8, utf16le_nfd, 100000, f_sconv16le));
396                         failure("NFD(%s) should be converted to NFC(%s):%d",
397                             nfd, nfc, line);
398                         assertEqualUTF8String(utf8_nfc, utf8.s);
399                 }
400
401                 /*
402                  * Test for archive_mstring interface.
403                  * In specific, Windows platform UTF-16BE is directly
404                  * converted to/from wide-character to avoid the effect of
405                  * current locale since windows platform cannot make
406                  * locale UTF-8.
407                  */
408                 if (locale_is_utf8 || wc_is_unicode) {
409                         /*
410                          * Normalize an NFD string in UTF-8 for import.
411                          */
412                         assertEqualInt(0, archive_mstring_copy_mbs_len_l(
413                             &mstr, utf8_nfd, 100000, f_sconv8));
414                         assertEqualInt(0,
415                             archive_mstring_get_wcs(a, &mstr, &wp));
416                         failure("UTF-8 NFD(%s) should be converted "
417                             "to WCS NFC(%s):%d", nfd, nfc, line);
418                         assertEqualWString(wc_nfc, wp);
419
420                         /*
421                          * Normalize an NFD string in UTF-16BE for import.
422                          */
423                         assertEqualInt(0, archive_mstring_copy_mbs_len_l(
424                             &mstr, utf16be_nfd, 100000, f_sconv16be));
425                         assertEqualInt(0,
426                             archive_mstring_get_wcs(a, &mstr, &wp));
427                         failure("UTF-8 NFD(%s) should be converted "
428                             "to WCS NFC(%s):%d", nfd, nfc, line);
429                         assertEqualWString(wc_nfc, wp);
430
431                         /*
432                          * Normalize an NFD string in UTF-16LE for import.
433                          */
434                         assertEqualInt(0, archive_mstring_copy_mbs_len_l(
435                             &mstr, utf16le_nfd, 100000, f_sconv16le));
436                         assertEqualInt(0,
437                             archive_mstring_get_wcs(a, &mstr, &wp));
438                         failure("UTF-8 NFD(%s) should be converted "
439                             "to WCS NFC(%s):%d", nfd, nfc, line);
440                         assertEqualWString(wc_nfc, wp);
441
442                         /*
443                          * Copy an NFC wide-string for export.
444                          */
445                         assertEqualInt(0,
446                             archive_mstring_copy_wcs(&mstr, wc_nfc));
447                         assertEqualInt(0, archive_mstring_get_mbs_l(
448                             a, &mstr, &mp, &mplen, t_sconv8));
449                         failure("WCS NFC(%s) should be UTF-8 NFC:%d"
450                             ,nfc, line);
451                         assertEqualUTF8String(utf8_nfc, mp);
452                 }
453         }
454
455         archive_string_free(&utf8);
456         archive_mstring_clean(&mstr);
457         fclose(fp);
458         assertEqualInt(ARCHIVE_OK, archive_read_free(a));
459         assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
460 }
461
462 static void
463 test_archive_string_normalization_mac_nfd(const char *testdata)
464 {
465         struct archive *a, *a2;
466         struct archive_string utf8;
467         struct archive_mstring mstr;
468         struct archive_string_conv *f_sconv8, *t_sconv8;
469         struct archive_string_conv *f_sconv16be, *f_sconv16le;
470         FILE *fp;
471         char buff[512];
472         int line = 0;
473         int locale_is_utf8, wc_is_unicode;
474         int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
475
476         locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
477         wc_is_unicode = is_wc_unicode();
478         /* If it doesn't exist, just warn and return. */
479         if (!locale_is_utf8 && !wc_is_unicode) {
480                 skipping("A test of string normalization for NFD requires "
481                     "a suitable locale; en_US.UTF-8 not available on this "
482                     "system");
483                 return;
484         }
485
486         archive_string_init(&utf8);
487         memset(&mstr, 0, sizeof(mstr));
488
489         /*
490          * Create string conversion objects.
491          */
492         assert((a = archive_read_new()) != NULL);
493         assertA(NULL != (f_sconv8 =
494             archive_string_conversion_from_charset(a, "UTF-8", 0)));
495         assertA(NULL != (f_sconv16be =
496             archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
497         assertA(NULL != (f_sconv16le =
498             archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
499         assert((a2 = archive_write_new()) != NULL);
500         assertA(NULL != (t_sconv8 =
501             archive_string_conversion_to_charset(a2, "UTF-8", 0)));
502         if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
503             t_sconv8 == NULL) {
504                 /* We cannot continue this test. */
505                 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
506                 return;
507         }
508         archive_string_conversion_set_opt(f_sconv8, sconv_opt);
509         archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
510         archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
511         archive_string_conversion_set_opt(t_sconv8, sconv_opt);
512
513         /* Open a test pattern file. */
514         assert((fp = fopen(testdata, "r")) != NULL);
515
516         /*
517          * Read test data.
518          *  Test data format:
519          *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
520          *  Unicode pattern format:
521          *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
522          */
523         while (fgets(buff, sizeof(buff), fp) != NULL) {
524                 char nfc[80], nfd[80];
525                 char utf8_nfc[80], utf8_nfd[80];
526                 char utf16be_nfc[80], utf16be_nfd[80];
527                 char utf16le_nfc[80], utf16le_nfd[80];
528                 wchar_t wc_nfc[40], wc_nfd[40];
529                 char *e, *p;
530                 const wchar_t *wp;
531                 const char *mp;
532                 size_t mplen;
533                 int should_be_nfc;
534
535                 line++;
536                 if (buff[0] == '#')
537                         continue;
538                 p = strchr(buff, ';');
539                 if (p == NULL)
540                         continue;
541                 *p++ = '\0';
542                 /* Copy an NFC pattern */
543                 strncpy(nfc, buff, sizeof(nfc)-1);
544                 nfc[sizeof(nfc)-1] = '\0';
545                 e = p;
546                 p = strchr(p, '\n');
547                 if (p == NULL)
548                         continue;
549                 *p = '\0';
550                 /* Copy an NFD pattern */
551                 strncpy(nfd, e, sizeof(nfd)-1);
552                 nfd[sizeof(nfd)-1] = '\0';
553
554                 /*
555                  * Get an NFC patterns.
556                  */
557                 should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
558                         utf16be_nfc, utf16le_nfc, nfc, 1);
559
560                 /*
561                  * Get an NFD patterns.
562                  */
563                 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
564                     nfd, 0);
565
566                 if (locale_is_utf8) {
567                         /*
568                          * Normalize an NFC string for import.
569                          */
570                         assertEqualInt(0, archive_strcpy_l(
571                             &utf8, utf8_nfc, f_sconv8));
572                         if (should_be_nfc) {
573                                 failure("NFC(%s) should not be converted to"
574                                     " NFD(%s):%d", nfc, nfd, line);
575                                 assertEqualUTF8String(utf8_nfc, utf8.s);
576                         } else {
577                                 failure("NFC(%s) should be converted to"
578                                     " NFD(%s):%d", nfc, nfd, line);
579                                 assertEqualUTF8String(utf8_nfd, utf8.s);
580                         }
581
582                         /*
583                          * Normalize an NFD string for import.
584                          */
585                         assertEqualInt(0, archive_strcpy_l(
586                             &utf8, utf8_nfd, f_sconv8));
587                         failure("NFD(%s) should not be any changed:%d",
588                             nfd, line);
589                         assertEqualUTF8String(utf8_nfd, utf8.s);
590
591                         /*
592                          * Copy an NFD string for export.
593                          */
594                         assertEqualInt(0, archive_strcpy_l(
595                             &utf8, utf8_nfd, t_sconv8));
596                         failure("NFD(%s) should not be any changed:%d",
597                             nfd, line);
598                         assertEqualUTF8String(utf8_nfd, utf8.s);
599
600                         /*
601                          * Normalize an NFC string in UTF-16BE for import.
602                          */
603                         assertEqualInt(0, archive_strncpy_l(
604                             &utf8, utf16be_nfc, 100000, f_sconv16be));
605                         if (should_be_nfc) {
606                                 failure("NFC(%s) should not be converted to"
607                                     " NFD(%s):%d", nfc, nfd, line);
608                                 assertEqualUTF8String(utf8_nfc, utf8.s);
609                         } else {
610                                 failure("NFC(%s) should be converted to"
611                                     " NFD(%s):%d", nfc, nfd, line);
612                                 assertEqualUTF8String(utf8_nfd, utf8.s);
613                         }
614
615                         /*
616                          * Normalize an NFC string in UTF-16LE for import.
617                          */
618                         assertEqualInt(0, archive_strncpy_l(
619                             &utf8, utf16le_nfc, 100000, f_sconv16le));
620                         if (should_be_nfc) {
621                                 failure("NFC(%s) should not be converted to"
622                                     " NFD(%s):%d", nfc, nfd, line);
623                                 assertEqualUTF8String(utf8_nfc, utf8.s);
624                         } else {
625                                 failure("NFC(%s) should be converted to"
626                                     " NFD(%s):%d", nfc, nfd, line);
627                                 assertEqualUTF8String(utf8_nfd, utf8.s);
628                         }
629                 }
630
631                 /*
632                  * Test for archive_mstring interface.
633                  * In specific, Windows platform UTF-16BE is directly
634                  * converted to/from wide-character to avoid the effect of
635                  * current locale since windows platform cannot make
636                  * locale UTF-8.
637                  */
638                 if (locale_is_utf8 || wc_is_unicode) {
639                         /*
640                          * Normalize an NFD string in UTF-8 for import.
641                          */
642                         assertEqualInt(0, archive_mstring_copy_mbs_len_l(
643                             &mstr, utf8_nfc, 100000, f_sconv8));
644                         assertEqualInt(0,
645                             archive_mstring_get_wcs(a, &mstr, &wp));
646                         if (should_be_nfc) {
647                                 failure("UTF-8 NFC(%s) should not be converted "
648                                     "to WCS NFD(%s):%d", nfc, nfd, line);
649                                 assertEqualWString(wc_nfc, wp);
650                         } else {
651                                 failure("UTF-8 NFC(%s) should be converted "
652                                     "to WCS NFD(%s):%d", nfc, nfd, line);
653                                 assertEqualWString(wc_nfd, wp);
654                         }
655
656                         /*
657                          * Normalize an NFD string in UTF-16BE for import.
658                          */
659                         assertEqualInt(0, archive_mstring_copy_mbs_len_l(
660                             &mstr, utf16be_nfc, 100000, f_sconv16be));
661                         assertEqualInt(0,
662                             archive_mstring_get_wcs(a, &mstr, &wp));
663                         if (should_be_nfc) {
664                                 failure("UTF-16BE NFC(%s) should not be "
665                                     "converted to WCS NFD(%s):%d",
666                                     nfc, nfd, line);
667                                 assertEqualWString(wc_nfc, wp);
668                         } else {
669                                 failure("UTF-16BE NFC(%s) should be converted "
670                                     "to WCS NFD(%s):%d", nfc, nfd, line);
671                                 assertEqualWString(wc_nfd, wp);
672                         }
673
674                         /*
675                          * Normalize an NFD string in UTF-16LE for import.
676                          */
677                         assertEqualInt(0, archive_mstring_copy_mbs_len_l(
678                             &mstr, utf16le_nfc, 100000, f_sconv16le));
679                         assertEqualInt(0,
680                             archive_mstring_get_wcs(a, &mstr, &wp));
681                         if (should_be_nfc) {
682                                 failure("UTF-16LE NFC(%s) should not be "
683                                     "converted to WCS NFD(%s):%d",
684                                     nfc, nfd, line);
685                                 assertEqualWString(wc_nfc, wp);
686                         } else {
687                                 failure("UTF-16LE NFC(%s) should be converted "
688                                     "to WCS NFD(%s):%d", nfc, nfd, line);
689                                 assertEqualWString(wc_nfd, wp);
690                         }
691
692                         /*
693                          * Copy an NFD wide-string for export.
694                          */
695                         assertEqualInt(0, archive_mstring_copy_wcs(
696                             &mstr, wc_nfd));
697                         assertEqualInt(0, archive_mstring_get_mbs_l(
698                             a, &mstr, &mp, &mplen, t_sconv8));
699                         failure("WCS NFD(%s) should be UTF-8 NFD:%d"
700                             ,nfd, line);
701                         assertEqualUTF8String(utf8_nfd, mp);
702                 }
703         }
704
705         archive_string_free(&utf8);
706         archive_mstring_clean(&mstr);
707         fclose(fp);
708         assertEqualInt(ARCHIVE_OK, archive_read_free(a));
709         assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
710 }
711
712 static void
713 test_archive_string_canonicalization(void)
714 {
715         struct archive *a;
716         struct archive_string_conv *sconv;
717
718         setlocale(LC_ALL, "en_US.UTF-8");
719
720         assert((a = archive_read_new()) != NULL);
721
722         assertA(NULL != (sconv =
723             archive_string_conversion_to_charset(a, "UTF-8", 1)));
724         failure("Charset name should be UTF-8");
725         assertEqualString("UTF-8",
726             archive_string_conversion_charset_name(sconv));
727
728         assertA(NULL != (sconv =
729             archive_string_conversion_to_charset(a, "UTF8", 1)));
730         failure("Charset name should be UTF-8");
731         assertEqualString("UTF-8",
732             archive_string_conversion_charset_name(sconv));
733
734         assertA(NULL != (sconv =
735             archive_string_conversion_to_charset(a, "utf8", 1)));
736         failure("Charset name should be UTF-8");
737         assertEqualString("UTF-8",
738             archive_string_conversion_charset_name(sconv));
739
740         assertA(NULL != (sconv =
741             archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
742         failure("Charset name should be UTF-16BE");
743         assertEqualString("UTF-16BE",
744             archive_string_conversion_charset_name(sconv));
745
746         assertA(NULL != (sconv =
747             archive_string_conversion_to_charset(a, "UTF16BE", 1)));
748         failure("Charset name should be UTF-16BE");
749         assertEqualString("UTF-16BE",
750             archive_string_conversion_charset_name(sconv));
751
752         assertA(NULL != (sconv =
753             archive_string_conversion_to_charset(a, "utf16be", 1)));
754         failure("Charset name should be UTF-16BE");
755         assertEqualString("UTF-16BE",
756             archive_string_conversion_charset_name(sconv));
757
758         assertA(NULL != (sconv =
759             archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
760         failure("Charset name should be UTF-16LE");
761         assertEqualString("UTF-16LE",
762             archive_string_conversion_charset_name(sconv));
763
764         assertA(NULL != (sconv =
765             archive_string_conversion_to_charset(a, "UTF16LE", 1)));
766         failure("Charset name should be UTF-16LE");
767         assertEqualString("UTF-16LE",
768             archive_string_conversion_charset_name(sconv));
769
770         assertA(NULL != (sconv =
771             archive_string_conversion_to_charset(a, "utf16le", 1)));
772         failure("Charset name should be UTF-16LE");
773         assertEqualString("UTF-16LE",
774             archive_string_conversion_charset_name(sconv));
775
776         assertEqualInt(ARCHIVE_OK, archive_read_free(a));
777
778 }
779
780 static void
781 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
782   const char *exp, const wchar_t *wexp)
783 {
784         /* Do all the tests on a copy so that we can have a clear initial state every time */
785         struct archive_mstring mstr2;
786         const char *p = NULL;
787         const wchar_t *wp = NULL;
788         size_t len = 0;
789
790         memset(&mstr2, 0, sizeof(mstr2));
791
792         archive_mstring_copy(&mstr2, mstr);
793         assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
794         assertEqualString(exp, p);
795         p = NULL;
796
797         archive_mstring_copy(&mstr2, mstr);
798         assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
799         assertEqualString(exp, p);
800         p = NULL;
801
802         archive_mstring_copy(&mstr2, mstr);
803         assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
804         assertEqualWString(wexp, wp);
805         wp = NULL;
806
807         archive_mstring_copy(&mstr2, mstr);
808         assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
809         assertEqualString(exp, p);
810         assertEqualInt(len, strlen(exp));
811         p = NULL;
812         len = 0;
813
814         archive_mstring_clean(&mstr2);
815 }
816
817 /*
818  * Make sure no matter what the input encoding is, the string can be
819  * converted too all the output encodings.
820  */
821 static void
822 test_archive_string_set_get(void)
823 {
824         struct archive *a;
825         struct archive_mstring mstr;
826         struct archive_string_conv *sc;
827
828         setlocale(LC_ALL, "en_US.UTF-8");
829
830         assert((a = archive_read_new()) != NULL);
831         memset(&mstr, 0, sizeof(mstr));
832
833         assertA(NULL != (sc =
834             archive_string_conversion_to_charset(a, "UTF-8", 1)));
835         failure("Charset name should be UTF-8");
836         assertEqualString("UTF-8",
837             archive_string_conversion_charset_name(sc));
838
839         assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
840         check_string(a, &mstr, sc, "AAA", L"AAA");
841         assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
842         check_string(a, &mstr, sc, "BBBB", L"BBBB");
843         assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
844         check_string(a, &mstr, sc, "CCC12", L"CCC12");
845         assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
846         check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
847         assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
848         check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
849
850         assertEqualInt(ARCHIVE_OK, archive_read_free(a));
851
852 }
853
854 DEFINE_TEST(test_archive_string_conversion)
855 {
856         static const char reffile[] = "test_archive_string_conversion.txt.Z";
857         static const char testdata[] = "testdata.txt";
858         struct archive *a;
859         struct archive_entry *ae;
860         char buff[512];
861         ssize_t size;
862         FILE *fp;
863
864         /*
865          * Extract a test pattern file.
866          */
867         extract_reference_file(reffile);
868         assert((a = archive_read_new()) != NULL);
869         assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
870         assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
871         assertEqualIntA(a, ARCHIVE_OK,
872             archive_read_open_filename(a, reffile, 512));
873
874         assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
875         assert((fp = fopen(testdata, "w")) != NULL);
876         while ((size = archive_read_data(a, buff, 512)) > 0)
877                 assertEqualInt(size, fwrite(buff, 1, size, fp));
878         assertEqualInt(0, fclose(fp));
879         assertEqualInt(ARCHIVE_OK, archive_read_free(a));
880
881         test_archive_string_normalization_nfc(testdata);
882         test_archive_string_normalization_mac_nfd(testdata);
883         test_archive_string_canonicalization();
884         test_archive_string_set_get();
885 }