]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - lib/libiconv_modules/UTF1632/citrus_utf1632.c
Merge libc++ trunk r338150 (just before the 7.0.0 branch point), and
[FreeBSD/FreeBSD.git] / lib / libiconv_modules / UTF1632 / citrus_utf1632.c
1 /* $FreeBSD$ */
2 /*      $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $       */
3
4 /*-
5  * SPDX-License-Identifier: BSD-2-Clause
6  *
7  * Copyright (c)2003 Citrus Project,
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 #include <sys/cdefs.h>
33 #include <sys/endian.h>
34 #include <sys/types.h>
35
36 #include <assert.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <stddef.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <wchar.h>
44
45 #include "citrus_namespace.h"
46 #include "citrus_types.h"
47 #include "citrus_module.h"
48 #include "citrus_stdenc.h"
49 #include "citrus_bcs.h"
50
51 #include "citrus_utf1632.h"
52
53
54 /* ----------------------------------------------------------------------
55  * private definitions used by the templates
56  */
57
/*
 * Conversion state carried between calls: the bytes of a character that
 * has been read only partially, and the byte order currently in effect.
 */
typedef struct {
	int	chlen;			/* number of bytes held in ch[] */
	int	current_endian;		/* one of the _ENDIAN_* codes */
	uint8_t	ch[4];			/* buffered input bytes */
} _UTF1632State;
63
/* Byte-order codes kept in current_endian and preffered_endian. */
#define _ENDIAN_UNKNOWN		0	/* no byte order chosen yet */
#define _ENDIAN_BIG		1
#define _ENDIAN_LITTLE		2

/* Host-relative byte orders expressed as absolute ones. */
#if BYTE_ORDER != BIG_ENDIAN
#define _ENDIAN_INTERNAL	_ENDIAN_LITTLE
#define _ENDIAN_SWAPPED		_ENDIAN_BIG
#else
#define _ENDIAN_INTERNAL	_ENDIAN_BIG
#define _ENDIAN_SWAPPED		_ENDIAN_LITTLE
#endif

/* Bits of _UTF1632EncodingInfo.mode. */
#define _MODE_UTF32		0x00000001U	/* 4-byte units instead of UTF-16 */
#define _MODE_FORCE_ENDIAN	0x00000002U	/* always use preffered_endian */
76
/*
 * Encoding parameters, filled in once by the module init routine.
 * The field name "preffered_endian" is misspelled but is referenced
 * throughout this file, so it is kept as is.
 */
typedef struct {
	int		preffered_endian;	/* byte order used when none is known, or when forced */
	unsigned int	cur_max;		/* value returned by _ENCODING_MB_CUR_MAX() */
	uint32_t	mode;			/* bitwise OR of _MODE_* flags */
} _UTF1632EncodingInfo;
82
/*
 * Parameter macros for this encoding.  NOTE(review): presumably consumed by
 * citrus_stdenc_template.h, which is included at the end of this file --
 * confirm against that header.
 */

/* Build the module-prefixed name of a function: m -> _citrus_UTF1632_m. */
#define _FUNCNAME(m)				_citrus_UTF1632_##m

/* Concrete types standing in for the generic info and state types. */
#define _ENCODING_INFO				_UTF1632EncodingInfo
#define _ENCODING_STATE				_UTF1632State

/* Maximum number of bytes per character, as computed at init time. */
#define _ENCODING_MB_CUR_MAX(_ei_)		((_ei_)->cur_max)

/* Both are constant 0 for this encoding. */
#define _ENCODING_IS_STATE_DEPENDENT		0
#define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	0
89
90
91 static __inline void
92 /*ARGSUSED*/
93 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused,
94     _UTF1632State *s)
95 {
96
97         memset(s, 0, sizeof(*s));
98 }
99
100 static int
101 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc,
102     char **s, size_t n, _UTF1632State *psenc, size_t *nresult)
103 {
104         char *s0;
105         size_t result;
106         wchar_t wc = L'\0';
107         int chlenbak, endian, needlen;
108
109         s0 = *s;
110
111         if (s0 == NULL) {
112                 _citrus_UTF1632_init_state(ei, psenc);
113                 *nresult = 0; /* state independent */
114                 return (0);
115         }
116
117         result = 0;
118         chlenbak = psenc->chlen;
119
120 refetch:
121         needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2;
122
123         while (chlenbak < needlen) {
124                 if (n == 0)
125                         goto restart;
126                 psenc->ch[chlenbak++] = *s0++;
127                 n--;
128                 result++;
129         }
130
131         /* judge endian marker */
132         if ((ei->mode & _MODE_UTF32) == 0) {
133                 /* UTF16 */
134                 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) {
135                         psenc->current_endian = _ENDIAN_BIG;
136                         chlenbak = 0;
137                         goto refetch;
138                 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) {
139                         psenc->current_endian = _ENDIAN_LITTLE;
140                         chlenbak = 0;
141                         goto refetch;
142                 }
143         } else {
144                 /* UTF32 */
145                 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 &&
146                     psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) {
147                         psenc->current_endian = _ENDIAN_BIG;
148                         chlenbak = 0;
149                         goto refetch;
150                 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE &&
151                            psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) {
152                         psenc->current_endian = _ENDIAN_LITTLE;
153                         chlenbak = 0;
154                         goto refetch;
155                 }
156         }
157         endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 ||
158             psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian :
159             psenc->current_endian;
160
161         /* get wc */
162         if ((ei->mode & _MODE_UTF32) == 0) {
163                 /* UTF16 */
164                 if (needlen == 2) {
165                         switch (endian) {
166                         case _ENDIAN_LITTLE:
167                                 wc = (psenc->ch[0] |
168                                     ((wchar_t)psenc->ch[1] << 8));
169                                 break;
170                         case _ENDIAN_BIG:
171                                 wc = (psenc->ch[1] |
172                                     ((wchar_t)psenc->ch[0] << 8));
173                                 break;
174                         default:
175                                 goto ilseq;
176                         }
177                         if (wc >= 0xD800 && wc <= 0xDBFF) {
178                                 /* surrogate high */
179                                 needlen = 4;
180                                 goto refetch;
181                         }
182                 } else {
183                         /* surrogate low */
184                         wc -= 0xD800; /* wc : surrogate high (see above) */
185                         wc <<= 10;
186                         switch (endian) {
187                         case _ENDIAN_LITTLE:
188                                 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF)
189                                         goto ilseq;
190                                 wc |= psenc->ch[2];
191                                 wc |= (wchar_t)(psenc->ch[3] & 3) << 8;
192                                 break;
193                         case _ENDIAN_BIG:
194                                 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF)
195                                         goto ilseq;
196                                 wc |= psenc->ch[3];
197                                 wc |= (wchar_t)(psenc->ch[2] & 3) << 8;
198                                 break;
199                         default:
200                                 goto ilseq;
201                         }
202                         wc += 0x10000;
203                 }
204         } else {
205                 /* UTF32 */
206                 switch (endian) {
207                 case _ENDIAN_LITTLE:
208                         wc = (psenc->ch[0] |
209                             ((wchar_t)psenc->ch[1] << 8) |
210                             ((wchar_t)psenc->ch[2] << 16) |
211                             ((wchar_t)psenc->ch[3] << 24));
212                         break;
213                 case _ENDIAN_BIG:
214                         wc = (psenc->ch[3] |
215                             ((wchar_t)psenc->ch[2] << 8) |
216                             ((wchar_t)psenc->ch[1] << 16) |
217                             ((wchar_t)psenc->ch[0] << 24));
218                         break;
219                 default:
220                         goto ilseq;
221                 }
222                 if (wc >= 0xD800 && wc <= 0xDFFF)
223                         goto ilseq;
224         }
225
226
227         *pwc = wc;
228         psenc->chlen = 0;
229         *nresult = result;
230         *s = s0;
231
232         return (0);
233
234 ilseq:
235         *nresult = (size_t)-1;
236         psenc->chlen = 0;
237         return (EILSEQ);
238
239 restart:
240         *nresult = (size_t)-2;
241         psenc->chlen = chlenbak;
242         *s = s0;
243         return (0);
244 }
245
246 static int
247 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n,
248     wchar_t wc, _UTF1632State *psenc, size_t *nresult)
249 {
250         wchar_t wc2;
251         static const char _bom[4] = {
252             0x00, 0x00, 0xFE, 0xFF,
253         };
254         const char *bom = &_bom[0];
255         size_t cnt;
256
257         cnt = (size_t)0;
258         if (psenc->current_endian == _ENDIAN_UNKNOWN) {
259                 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) {
260                         if (ei->mode & _MODE_UTF32)
261                                 cnt = 4;
262                         else {
263                                 cnt = 2;
264                                 bom += 2;
265                         }
266                         if (n < cnt)
267                                 goto e2big;
268                         memcpy(s, bom, cnt);
269                         s += cnt, n -= cnt;
270                 }
271                 psenc->current_endian = ei->preffered_endian;
272         }
273
274         wc2 = 0;
275         if ((ei->mode & _MODE_UTF32)==0) {
276                 /* UTF16 */
277                 if (wc > 0xFFFF) {
278                         /* surrogate */
279                         if (wc > 0x10FFFF)
280                                 goto ilseq;
281                         if (n < 4)
282                                 goto e2big;
283                         cnt += 4;
284                         wc -= 0x10000;
285                         wc2 = (wc & 0x3FF) | 0xDC00;
286                         wc = (wc>>10) | 0xD800;
287                 } else {
288                         if (n < 2)
289                                 goto e2big;
290                         cnt += 2;
291                 }
292
293 surrogate:
294                 switch (psenc->current_endian) {
295                 case _ENDIAN_BIG:
296                         s[1] = wc;
297                         s[0] = (wc >>= 8);
298                         break;
299                 case _ENDIAN_LITTLE:
300                         s[0] = wc;
301                         s[1] = (wc >>= 8);
302                         break;
303                 }
304                 if (wc2 != 0) {
305                         wc = wc2;
306                         wc2 = 0;
307                         s += 2;
308                         goto surrogate;
309                 }
310         } else {
311                 /* UTF32 */
312                 if (wc >= 0xD800 && wc <= 0xDFFF)
313                         goto ilseq;
314                 if (n < 4)
315                         goto e2big;
316                 cnt += 4;
317                 switch (psenc->current_endian) {
318                 case _ENDIAN_BIG:
319                         s[3] = wc;
320                         s[2] = (wc >>= 8);
321                         s[1] = (wc >>= 8);
322                         s[0] = (wc >>= 8);
323                         break;
324                 case _ENDIAN_LITTLE:
325                         s[0] = wc;
326                         s[1] = (wc >>= 8);
327                         s[2] = (wc >>= 8);
328                         s[3] = (wc >>= 8);
329                         break;
330                 }
331         }
332         *nresult = cnt;
333
334         return (0);
335
336 ilseq:
337         *nresult = (size_t)-1;
338         return (EILSEQ);
339 e2big:
340         *nresult = (size_t)-1;
341         return (E2BIG);
342 }
343
344 static void
345 parse_variable(_UTF1632EncodingInfo * __restrict ei,
346     const void * __restrict var, size_t lenvar)
347 {
348         const char *p;
349
350         p = var;
351         while (lenvar > 0) {
352                 switch (*p) {
353                 case 'B':
354                 case 'b':
355                         MATCH(big, ei->preffered_endian = _ENDIAN_BIG);
356                         break;
357                 case 'L':
358                 case 'l':
359                         MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE);
360                         break;
361                 case 'i':
362                 case 'I':
363                         MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL);
364                         break;
365                 case 's':
366                 case 'S':
367                         MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED);
368                         break;
369                 case 'F':
370                 case 'f':
371                         MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN);
372                         break;
373                 case 'U':
374                 case 'u':
375                         MATCH(utf32, ei->mode |= _MODE_UTF32);
376                         break;
377                 }
378                 p++;
379                 lenvar--;
380         }
381 }
382
383 static int
384 /*ARGSUSED*/
385 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei,
386     const void * __restrict var, size_t lenvar)
387 {
388
389         memset((void *)ei, 0, sizeof(*ei));
390
391         parse_variable(ei, var, lenvar);
392
393         ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8;
394         /* 6: endian + surrogate */
395         /* 8: endian + normal */
396
397         if (ei->preffered_endian == _ENDIAN_UNKNOWN) {
398                 ei->preffered_endian = _ENDIAN_BIG;
399         }
400
401         return (0);
402 }
403
404 static void
405 /*ARGSUSED*/
406 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused)
407 {
408
409 }
410
411 static __inline int
412 /*ARGSUSED*/
413 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused,
414      _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc)
415 {
416
417         *csid = 0;
418         *idx = (_index_t)wc;
419
420         return (0);
421 }
422
423 static __inline int
424 /*ARGSUSED*/
425 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused,
426     _wc_t * __restrict wc, _csid_t csid, _index_t idx)
427 {
428
429         if (csid != 0)
430                 return (EILSEQ);
431
432         *wc = (_wc_t)idx;
433
434         return (0);
435 }
436
437 static __inline int
438 /*ARGSUSED*/
439 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused,
440     _UTF1632State * __restrict psenc, int * __restrict rstate)
441 {
442
443         *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL :
444             _STDENC_SDGEN_INCOMPLETE_CHAR;
445         return (0);
446 }
447
448 /* ----------------------------------------------------------------------
449  * public interface for stdenc
450  */
451
/*
 * Both macros are defined outside this file.  NOTE(review): presumably they
 * declare and define the stdenc entry points / ops table for the "UTF1632"
 * module on top of the _citrus_UTF1632_* functions above -- confirm in
 * citrus_stdenc.h.
 */
452 _CITRUS_STDENC_DECLS(UTF1632);
453 _CITRUS_STDENC_DEF_OPS(UTF1632);
454
455 #include "citrus_stdenc_template.h"