]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - lib/libiconv_modules/UTF1632/citrus_utf1632.c
zfs: merge openzfs/zfs@a9d6b0690
[FreeBSD/FreeBSD.git] / lib / libiconv_modules / UTF1632 / citrus_utf1632.c
1 /* $FreeBSD$ */
2 /*      $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $       */
3
4 /*-
5  * SPDX-License-Identifier: BSD-2-Clause
6  *
7  * Copyright (c)2003 Citrus Project,
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 #include <sys/cdefs.h>
33 #include <sys/endian.h>
34 #include <sys/types.h>
35
36 #include <assert.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <stddef.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <wchar.h>
44
45 #include "citrus_namespace.h"
46 #include "citrus_types.h"
47 #include "citrus_module.h"
48 #include "citrus_stdenc.h"
49 #include "citrus_bcs.h"
50
51 #include "citrus_utf1632.h"
52
53
/* ----------------------------------------------------------------------
 * private definitions used by the templates
 */
57
/*
 * Conversion state carried between calls: the bytes of a character that
 * has only been partially read, plus the byte order currently in effect.
 */
typedef struct {
	/* number of valid bytes buffered in ch[] */
	int		 chlen;
	/* one of the _ENDIAN_* values; zero after init_state() */
	int		 current_endian;
	/* raw bytes of the character being assembled (at most 4) */
	uint8_t		 ch[4];
} _UTF1632State;
63
/* Byte-order identifiers stored in the state and the encoding info. */
#define	_ENDIAN_UNKNOWN		0	/* no byte order decided yet */
#define	_ENDIAN_BIG		1
#define	_ENDIAN_LITTLE		2

/* Aliases for the host byte order and its opposite. */
#if BYTE_ORDER == BIG_ENDIAN
#define	_ENDIAN_INTERNAL	_ENDIAN_BIG
#define	_ENDIAN_SWAPPED		_ENDIAN_LITTLE
#else
#define	_ENDIAN_INTERNAL	_ENDIAN_LITTLE
#define	_ENDIAN_SWAPPED		_ENDIAN_BIG
#endif

/* Bit flags kept in _UTF1632EncodingInfo.mode. */
#define	_MODE_UTF32		0x00000001U	/* 32-bit units instead of 16-bit */
#define	_MODE_FORCE_ENDIAN	0x00000002U	/* always use the preferred byte order */
76
/*
 * Per-encoding configuration, filled in once by
 * _citrus_UTF1632_encoding_module_init().
 */
typedef struct {
	/* byte order used when none is known or when it is forced (name sic) */
	int		 preffered_endian;
	/* largest number of bytes one conversion call may produce */
	unsigned int	 cur_max;
	/* smallest number of bytes per character */
	unsigned int	 cur_min;
	/* bitwise OR of the _MODE_* flags */
	uint32_t	 mode;
} _UTF1632EncodingInfo;
83
/*
 * Hooks that bind this module's names and types to the generic names
 * referenced by the template code.
 */
#define	_FUNCNAME(m)				_citrus_UTF1632_##m
#define	_ENCODING_INFO				_UTF1632EncodingInfo
#define	_ENCODING_STATE				_UTF1632State

/* Character length limits come from the parsed encoding info. */
#define	_ENCODING_MB_CUR_MAX(_ei_)		((_ei_)->cur_max)
#define	_ENCODING_MB_CUR_MIN(_ei_)		((_ei_)->cur_min)

/* Both properties are reported as "no" (0) for this module. */
#define	_ENCODING_IS_STATE_DEPENDENT		0
#define	_STATE_NEEDS_EXPLICIT_INIT(_ps_)	0
91
92
93 static __inline void
94 /*ARGSUSED*/
95 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused,
96     _UTF1632State *s)
97 {
98
99         memset(s, 0, sizeof(*s));
100 }
101
102 static int
103 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc,
104     char **s, size_t n, _UTF1632State *psenc, size_t *nresult)
105 {
106         char *s0;
107         size_t result;
108         wchar_t wc = L'\0';
109         int chlenbak, endian, needlen;
110
111         s0 = *s;
112
113         if (s0 == NULL) {
114                 _citrus_UTF1632_init_state(ei, psenc);
115                 *nresult = 0; /* state independent */
116                 return (0);
117         }
118
119         result = 0;
120         chlenbak = psenc->chlen;
121
122 refetch:
123         needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2;
124
125         while (chlenbak < needlen) {
126                 if (n == 0)
127                         goto restart;
128                 psenc->ch[chlenbak++] = *s0++;
129                 n--;
130                 result++;
131         }
132
133         /* judge endian marker */
134         if ((ei->mode & _MODE_UTF32) == 0) {
135                 /* UTF16 */
136                 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) {
137                         psenc->current_endian = _ENDIAN_BIG;
138                         chlenbak = 0;
139                         goto refetch;
140                 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) {
141                         psenc->current_endian = _ENDIAN_LITTLE;
142                         chlenbak = 0;
143                         goto refetch;
144                 }
145         } else {
146                 /* UTF32 */
147                 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 &&
148                     psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) {
149                         psenc->current_endian = _ENDIAN_BIG;
150                         chlenbak = 0;
151                         goto refetch;
152                 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE &&
153                            psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) {
154                         psenc->current_endian = _ENDIAN_LITTLE;
155                         chlenbak = 0;
156                         goto refetch;
157                 }
158         }
159         endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 ||
160             psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian :
161             psenc->current_endian;
162
163         /* get wc */
164         if ((ei->mode & _MODE_UTF32) == 0) {
165                 /* UTF16 */
166                 if (needlen == 2) {
167                         switch (endian) {
168                         case _ENDIAN_LITTLE:
169                                 wc = (psenc->ch[0] |
170                                     ((wchar_t)psenc->ch[1] << 8));
171                                 break;
172                         case _ENDIAN_BIG:
173                                 wc = (psenc->ch[1] |
174                                     ((wchar_t)psenc->ch[0] << 8));
175                                 break;
176                         default:
177                                 goto ilseq;
178                         }
179                         if (wc >= 0xD800 && wc <= 0xDBFF) {
180                                 /* surrogate high */
181                                 needlen = 4;
182                                 goto refetch;
183                         }
184                 } else {
185                         /* surrogate low */
186                         wc -= 0xD800; /* wc : surrogate high (see above) */
187                         wc <<= 10;
188                         switch (endian) {
189                         case _ENDIAN_LITTLE:
190                                 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF)
191                                         goto ilseq;
192                                 wc |= psenc->ch[2];
193                                 wc |= (wchar_t)(psenc->ch[3] & 3) << 8;
194                                 break;
195                         case _ENDIAN_BIG:
196                                 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF)
197                                         goto ilseq;
198                                 wc |= psenc->ch[3];
199                                 wc |= (wchar_t)(psenc->ch[2] & 3) << 8;
200                                 break;
201                         default:
202                                 goto ilseq;
203                         }
204                         wc += 0x10000;
205                 }
206         } else {
207                 /* UTF32 */
208                 switch (endian) {
209                 case _ENDIAN_LITTLE:
210                         wc = (psenc->ch[0] |
211                             ((wchar_t)psenc->ch[1] << 8) |
212                             ((wchar_t)psenc->ch[2] << 16) |
213                             ((wchar_t)psenc->ch[3] << 24));
214                         break;
215                 case _ENDIAN_BIG:
216                         wc = (psenc->ch[3] |
217                             ((wchar_t)psenc->ch[2] << 8) |
218                             ((wchar_t)psenc->ch[1] << 16) |
219                             ((wchar_t)psenc->ch[0] << 24));
220                         break;
221                 default:
222                         goto ilseq;
223                 }
224                 if (wc >= 0xD800 && wc <= 0xDFFF)
225                         goto ilseq;
226         }
227
228
229         *pwc = wc;
230         psenc->chlen = 0;
231         *nresult = result;
232         *s = s0;
233
234         return (0);
235
236 ilseq:
237         *nresult = (size_t)-1;
238         psenc->chlen = 0;
239         return (EILSEQ);
240
241 restart:
242         *nresult = (size_t)-2;
243         psenc->chlen = chlenbak;
244         *s = s0;
245         return (0);
246 }
247
248 static int
249 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n,
250     wchar_t wc, _UTF1632State *psenc, size_t *nresult)
251 {
252         wchar_t wc2;
253         static const char _bom[4] = {
254             0x00, 0x00, 0xFE, 0xFF,
255         };
256         const char *bom = &_bom[0];
257         size_t cnt;
258
259         cnt = (size_t)0;
260         if (psenc->current_endian == _ENDIAN_UNKNOWN) {
261                 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) {
262                         if (ei->mode & _MODE_UTF32)
263                                 cnt = 4;
264                         else {
265                                 cnt = 2;
266                                 bom += 2;
267                         }
268                         if (n < cnt)
269                                 goto e2big;
270                         memcpy(s, bom, cnt);
271                         s += cnt, n -= cnt;
272                 }
273                 psenc->current_endian = ei->preffered_endian;
274         }
275
276         wc2 = 0;
277         if ((ei->mode & _MODE_UTF32)==0) {
278                 /* UTF16 */
279                 if (wc > 0xFFFF) {
280                         /* surrogate */
281                         if (wc > 0x10FFFF)
282                                 goto ilseq;
283                         if (n < 4)
284                                 goto e2big;
285                         cnt += 4;
286                         wc -= 0x10000;
287                         wc2 = (wc & 0x3FF) | 0xDC00;
288                         wc = (wc>>10) | 0xD800;
289                 } else {
290                         if (n < 2)
291                                 goto e2big;
292                         cnt += 2;
293                 }
294
295 surrogate:
296                 switch (psenc->current_endian) {
297                 case _ENDIAN_BIG:
298                         s[1] = wc;
299                         s[0] = (wc >>= 8);
300                         break;
301                 case _ENDIAN_LITTLE:
302                         s[0] = wc;
303                         s[1] = (wc >>= 8);
304                         break;
305                 }
306                 if (wc2 != 0) {
307                         wc = wc2;
308                         wc2 = 0;
309                         s += 2;
310                         goto surrogate;
311                 }
312         } else {
313                 /* UTF32 */
314                 if (wc >= 0xD800 && wc <= 0xDFFF)
315                         goto ilseq;
316                 if (n < 4)
317                         goto e2big;
318                 cnt += 4;
319                 switch (psenc->current_endian) {
320                 case _ENDIAN_BIG:
321                         s[3] = wc;
322                         s[2] = (wc >>= 8);
323                         s[1] = (wc >>= 8);
324                         s[0] = (wc >>= 8);
325                         break;
326                 case _ENDIAN_LITTLE:
327                         s[0] = wc;
328                         s[1] = (wc >>= 8);
329                         s[2] = (wc >>= 8);
330                         s[3] = (wc >>= 8);
331                         break;
332                 }
333         }
334         *nresult = cnt;
335
336         return (0);
337
338 ilseq:
339         *nresult = (size_t)-1;
340         return (EILSEQ);
341 e2big:
342         *nresult = (size_t)-1;
343         return (E2BIG);
344 }
345
346 static void
347 parse_variable(_UTF1632EncodingInfo * __restrict ei,
348     const void * __restrict var, size_t lenvar)
349 {
350         const char *p;
351
352         p = var;
353         while (lenvar > 0) {
354                 switch (*p) {
355                 case 'B':
356                 case 'b':
357                         MATCH(big, ei->preffered_endian = _ENDIAN_BIG);
358                         break;
359                 case 'L':
360                 case 'l':
361                         MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE);
362                         break;
363                 case 'i':
364                 case 'I':
365                         MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL);
366                         break;
367                 case 's':
368                 case 'S':
369                         MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED);
370                         break;
371                 case 'F':
372                 case 'f':
373                         MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN);
374                         break;
375                 case 'U':
376                 case 'u':
377                         MATCH(utf32, ei->mode |= _MODE_UTF32);
378                         break;
379                 }
380                 p++;
381                 lenvar--;
382         }
383 }
384
385 static int
386 /*ARGSUSED*/
387 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei,
388     const void * __restrict var, size_t lenvar)
389 {
390
391         memset((void *)ei, 0, sizeof(*ei));
392
393         parse_variable(ei, var, lenvar);
394
395         ei->cur_min = ((ei->mode&_MODE_UTF32) == 0) ? 2 : 4;
396         ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8;
397         /* 6: endian + surrogate */
398         /* 8: endian + normal */
399
400         if (ei->preffered_endian == _ENDIAN_UNKNOWN) {
401                 ei->preffered_endian = _ENDIAN_BIG;
402         }
403
404         return (0);
405 }
406
407 static void
408 /*ARGSUSED*/
409 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused)
410 {
411
412 }
413
414 static __inline int
415 /*ARGSUSED*/
416 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused,
417      _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc)
418 {
419
420         *csid = 0;
421         *idx = (_index_t)wc;
422
423         return (0);
424 }
425
426 static __inline int
427 /*ARGSUSED*/
428 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused,
429     _wc_t * __restrict wc, _csid_t csid, _index_t idx)
430 {
431
432         if (csid != 0)
433                 return (EILSEQ);
434
435         *wc = (_wc_t)idx;
436
437         return (0);
438 }
439
440 static __inline int
441 /*ARGSUSED*/
442 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused,
443     _UTF1632State * __restrict psenc, int * __restrict rstate)
444 {
445
446         *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL :
447             _STDENC_SDGEN_INCOMPLETE_CHAR;
448         return (0);
449 }
450
451 /* ----------------------------------------------------------------------
452  * public interface for stdenc
453  */
454
/*
 * Instantiate the stdenc glue for this module under the name UTF1632.
 * NOTE(review): both macros are defined outside this file; presumably
 * they declare the module's entry points and define its operations
 * table -- confirm against citrus_stdenc.h.
 */
_CITRUS_STDENC_DECLS(UTF1632);
_CITRUS_STDENC_DEF_OPS(UTF1632);
457
458 #include "citrus_stdenc_template.h"