4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
34 #include <string.h> /* memcpy */
36 #if defined(_MSC_VER) && (_MSC_VER <= 1700)
37 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
47 #include "winconfig.h"
49 #ifdef HAVE_EXPAT_CONFIG_H
50 #include <expat_config.h>
52 #endif /* ndef _WIN32 */
54 #include "expat_external.h"
60 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
62 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
66 { PREFIX(prologTok), PREFIX(contentTok), \
67 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
68 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
69 PREFIX(nameMatchesAscii), \
73 PREFIX(charRefNumber), \
74 PREFIX(predefinedEntityName), \
75 PREFIX(updatePosition), \
78 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
80 #define UCS2_GET_NAMING(pages, hi, lo) \
81 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
83 /* A 2-byte UTF-8 representation splits the character's 11 bits between
84 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
85 pages, 3 bits to add to that index and 5 bits to generate the mask.
87 #define UTF8_GET_NAMING2(pages, byte) \
88 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
89 + ((((byte)[0]) & 3) << 1) \
90 + ((((byte)[1]) >> 5) & 1)] \
91 & (1u << (((byte)[1]) & 0x1F)))
93 /* A 3-byte UTF-8 representation splits the character's 16 bits between
94 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
95 into pages, 3 bits to add to that index and 5 bits to generate the
98 #define UTF8_GET_NAMING3(pages, byte) \
99 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
100 + ((((byte)[1]) >> 2) & 0xF)] \
102 + ((((byte)[1]) & 3) << 1) \
103 + ((((byte)[2]) >> 5) & 1)] \
104 & (1u << (((byte)[2]) & 0x1F)))
106 #define UTF8_GET_NAMING(pages, p, n) \
108 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
110 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
113 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
114 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
115 with the additional restriction of not allowing the Unicode
116 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
117 Implementation details:
118 (A & 0x80) == 0 means A < 0x80
120 (A & 0xC0) == 0xC0 means A > 0xBF
123 #define UTF8_INVALID2(p) \
124 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
126 #define UTF8_INVALID3(p) \
127 (((p)[2] & 0x80) == 0 \
129 ((*p) == 0xEF && (p)[1] == 0xBF \
133 ((p)[2] & 0xC0) == 0xC0) \
137 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
139 ((p)[1] & 0x80) == 0 \
141 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
143 #define UTF8_INVALID4(p) \
144 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
146 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
150 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
152 ((p)[1] & 0x80) == 0 \
154 ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
156 static int PTRFASTCALL
157 isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
/* Looks up the two bytes at p in the namePages table through
 * UTF8_GET_NAMING2; the result is nonzero when the selected bit of
 * namingBitmap is set. The enc argument is unused. */
162 static int PTRFASTCALL
163 utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
165 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
/* Looks up the three bytes at p in the namePages table through
 * UTF8_GET_NAMING3; the result is nonzero when the selected bit of
 * namingBitmap is set. The enc argument is unused. */
168 static int PTRFASTCALL
169 utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
171 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
174 #define utf8_isName4 isNever
/* Same lookup as utf8_isName2, but against the nmstrtPages table:
 * nonzero when the two bytes at p select a set bit in namingBitmap.
 * The enc argument is unused. */
176 static int PTRFASTCALL
177 utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
179 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
/* Same lookup as utf8_isName3, but against the nmstrtPages table:
 * nonzero when the three bytes at p select a set bit in namingBitmap.
 * The enc argument is unused. */
182 static int PTRFASTCALL
183 utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
185 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
188 #define utf8_isNmstrt4 isNever
/* Applies UTF8_INVALID2 to the bytes at p, read as unsigned char.
 * Nonzero when the first byte is below 0xC2, or the second byte is
 * either below 0x80 or above 0xBF. The enc argument is unused. */
190 static int PTRFASTCALL
191 utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
193 return UTF8_INVALID2((const unsigned char *)p);
/* Applies UTF8_INVALID3 to the bytes at p, read as unsigned char, and
 * returns its result (nonzero means the macro rejected the sequence).
 * The enc argument is unused. */
196 static int PTRFASTCALL
197 utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
199 return UTF8_INVALID3((const unsigned char *)p);
/* Applies UTF8_INVALID4 to the bytes at p, read as unsigned char, and
 * returns its result (nonzero means the macro rejected the sequence).
 * The enc argument is unused. */
202 static int PTRFASTCALL
203 utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
205 return UTF8_INVALID4((const unsigned char *)p);
208 struct normal_encoding {
210 unsigned char type[256];
212 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
213 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
214 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
215 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
216 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
217 #endif /* XML_MIN_SIZE */
218 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
219 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
220 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
221 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
222 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
223 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
224 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
225 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
226 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
229 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
233 #define STANDARD_VTABLE(E) \
242 #define STANDARD_VTABLE(E) /* as nothing */
246 #define NORMAL_VTABLE(E) \
257 #define NULL_VTABLE \
258 /* isName2 */ NULL, \
259 /* isName3 */ NULL, \
260 /* isName4 */ NULL, \
261 /* isNmstrt2 */ NULL, \
262 /* isNmstrt3 */ NULL, \
263 /* isNmstrt4 */ NULL, \
264 /* isInvalid2 */ NULL, \
265 /* isInvalid3 */ NULL, \
266 /* isInvalid4 */ NULL
268 static int FASTCALL checkCharRefNumber(int);
270 #include "xmltok_impl.h"
274 #define sb_isNameMin isNever
275 #define sb_isNmstrtMin isNever
279 #define MINBPC(enc) ((enc)->minBytesPerChar)
281 /* minimum bytes per character */
282 #define MINBPC(enc) 1
285 #define SB_BYTE_TYPE(enc, p) \
286 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
289 static int PTRFASTCALL
290 sb_byteType(const ENCODING *enc, const char *p)
292 return SB_BYTE_TYPE(enc, p);
294 #define BYTE_TYPE(enc, p) \
295 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
297 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
301 #define BYTE_TO_ASCII(enc, p) \
302 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
303 static int PTRFASTCALL
304 sb_byteToAscii(const ENCODING *enc, const char *p)
309 #define BYTE_TO_ASCII(enc, p) (*(p))
312 #define IS_NAME_CHAR(enc, p, n) \
313 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
314 #define IS_NMSTRT_CHAR(enc, p, n) \
315 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
316 #define IS_INVALID_CHAR(enc, p, n) \
317 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
320 #define IS_NAME_CHAR_MINBPC(enc, p) \
321 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
322 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
323 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
325 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
326 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
330 #define CHAR_MATCHES(enc, p, c) \
331 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
333 sb_charMatches(const ENCODING *enc, const char *p, int c)
338 /* c is an ASCII character */
339 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
342 #define PREFIX(ident) normal_ ## ident
343 #define XML_TOK_IMPL_C
344 #include "xmltok_impl.c"
345 #undef XML_TOK_IMPL_C
352 #undef IS_NAME_CHAR_MINBPC
353 #undef IS_NMSTRT_CHAR
354 #undef IS_NMSTRT_CHAR_MINBPC
355 #undef IS_INVALID_CHAR
357 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
/* Moves the end-of-input limit *fromLimRef backwards towards `from`, one
 * byte per iteration. Each step classifies the byte just before the limit
 * by its UTF-8 bit pattern and compares the number of bytes walked so far
 * (plus the current one) with the sequence length that pattern announces.
 * The resulting limit is stored back through fromLimRef.
 * NOTE(review): the statements inside each length check are not visible in
 * this excerpt; presumably they decide whether the trailing sequence is kept
 * or cut off before leaving the loop -- confirm against the full source. */
365 _INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
367 const char * fromLim = *fromLimRef;
/* `walked` counts how many trailing bytes have been stepped over so far. */
369 for (; fromLim > from; fromLim--, walked++) {
/* Byte immediately before the current limit. */
370 const unsigned char prev = (unsigned char)fromLim[-1];
371 if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, led by 0b11110xxx byte */
372 if (walked + 1 >= 4) {
378 } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, led by 0b1110xxxx byte */
379 if (walked + 1 >= 3) {
385 } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, led by 0b110xxxxx byte */
386 if (walked + 1 >= 2) {
392 } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
/* Publish the (possibly lowered) limit to the caller. */
396 *fromLimRef = fromLim;
/* Copies UTF-8 bytes from *fromP towards *toP without splitting a character.
 *
 * The source limit is first clamped to the room left before toLim, then
 * handed to _INTERNAL_trim_to_complete_utf8_characters. The span that
 * remains is copied with memcpy and *fromP is advanced by the same count.
 *
 * Returns XML_CONVERT_OUTPUT_EXHAUSTED if the limit had to be clamped to the
 * output space, otherwise XML_CONVERT_INPUT_INCOMPLETE if trimming shortened
 * the span, otherwise XML_CONVERT_COMPLETED. The enc argument is unused. */
399 static enum XML_Convert_Result PTRCALL
400 utf8_toUtf8(const ENCODING *UNUSED_P(enc),
401 const char **fromP, const char *fromLim,
402 char **toP, const char *toLim)
404 bool input_incomplete = false;
405 bool output_exhausted = false;
407 /* Avoid copying partial characters (due to limited space). */
408 const ptrdiff_t bytesAvailable = fromLim - *fromP;
409 const ptrdiff_t bytesStorable = toLim - *toP;
410 if (bytesAvailable > bytesStorable) {
/* More input than output room: only consider what can be stored. */
411 fromLim = *fromP + bytesStorable;
412 output_exhausted = true;
415 /* Avoid copying partial characters (from incomplete input). */
417 const char * const fromLimBefore = fromLim;
418 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
/* A lowered limit means the tail of the span was trimmed away. */
419 if (fromLim < fromLimBefore) {
420 input_incomplete = true;
425 const ptrdiff_t bytesToCopy = fromLim - *fromP;
426 memcpy(*toP, *fromP, bytesToCopy);
427 *fromP += bytesToCopy;
/* Output exhaustion takes precedence over incomplete input when both hold. */
431 if (output_exhausted) /* needs to go first */
432 return XML_CONVERT_OUTPUT_EXHAUSTED;
433 else if (input_incomplete)
434 return XML_CONVERT_INPUT_INCOMPLETE;
436 return XML_CONVERT_COMPLETED;
/* Converts UTF-8 input in [*fromP, fromLim) to 16-bit units written at *toP,
 * bounded by toLim. Each iteration dispatches on the byte-type table entry
 * of the current input byte. The result starts as XML_CONVERT_COMPLETED and
 * is changed to INPUT_INCOMPLETE or OUTPUT_EXHAUSTED when a multi-byte
 * sequence does not fit in the remaining input or output.
 * NOTE(review): the case labels, the exits after each error assignment and
 * the final write-back of the cursors are not visible in this excerpt. */
439 static enum XML_Convert_Result PTRCALL
440 utf8_toUtf16(const ENCODING *enc,
441 const char **fromP, const char *fromLim,
442 unsigned short **toP, const unsigned short *toLim)
444 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
/* Work on local cursors. */
445 unsigned short *to = *toP;
446 const char *from = *fromP;
447 while (from < fromLim && to < toLim) {
448 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
/* Two-byte form: needs 2 input bytes; 5 + 6 payload bits make one unit. */
450 if (fromLim - from < 2) {
451 res = XML_CONVERT_INPUT_INCOMPLETE;
454 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
/* Three-byte form: needs 3 input bytes; 4 + 6 + 6 payload bits make one unit. */
458 if (fromLim - from < 3) {
459 res = XML_CONVERT_INPUT_INCOMPLETE;
462 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
463 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
/* Four-byte form: needs room for 2 output units and 4 input bytes. */
469 if (toLim - to < 2) {
470 res = XML_CONVERT_OUTPUT_EXHAUSTED;
473 if (fromLim - from < 4) {
474 res = XML_CONVERT_INPUT_INCOMPLETE;
/* Assemble the 3 + 6 + 6 + 6 payload bits into n. */
477 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
478 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
/* Emit two units tagged 0xD800 / 0xDC00 from the upper and lower 10 bits of n.
 * NOTE(review): any adjustment of n between assembly and this point is not
 * visible here -- confirm against the full source. */
480 to[0] = (unsigned short)((n >> 10) | 0xD800);
481 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
492 res = XML_CONVERT_OUTPUT_EXHAUSTED;
500 static const struct normal_encoding utf8_encoding_ns = {
501 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
503 #include "asciitab.h"
506 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
510 static const struct normal_encoding utf8_encoding = {
511 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
513 #define BT_COLON BT_NMSTRT
514 #include "asciitab.h"
518 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
523 static const struct normal_encoding internal_utf8_encoding_ns = {
524 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
526 #include "iasciitab.h"
529 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
534 static const struct normal_encoding internal_utf8_encoding = {
535 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
537 #define BT_COLON BT_NMSTRT
538 #include "iasciitab.h"
542 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
/* Converts single-byte input at *fromP to UTF-8 at *toP, bounded by toLim.
 * Returns XML_CONVERT_COMPLETED once *fromP reaches fromLim, and
 * XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not fit.
 * The enc argument is unused.
 * NOTE(review): the test that selects between the two-byte and the
 * single-byte path, and the enclosing loop, are not visible in this excerpt. */
545 static enum XML_Convert_Result PTRCALL
546 latin1_toUtf8(const ENCODING *UNUSED_P(enc),
547 const char **fromP, const char *fromLim,
548 char **toP, const char *toLim)
/* All input consumed. */
552 if (*fromP == fromLim)
553 return XML_CONVERT_COMPLETED;
554 c = (unsigned char)**fromP;
/* Two-byte output path: requires two free bytes in the destination. */
556 if (toLim - *toP < 2)
557 return XML_CONVERT_OUTPUT_EXHAUSTED;
/* Upper bits go into the UTF8_cval2 lead byte, low six bits into a 0x80 continuation byte. */
558 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
559 *(*toP)++ = (char)((c & 0x3f) | 0x80);
564 return XML_CONVERT_OUTPUT_EXHAUSTED;
/* Single-byte path: the input byte is copied unchanged. */
565 *(*toP)++ = *(*fromP)++;
/* Widens each input byte (read as unsigned char) into one 16-bit output
 * unit, advancing both cursors until either the input or the output limit
 * is reached. Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full
 * while input remains, otherwise XML_CONVERT_COMPLETED.
 * The enc argument is unused. */
570 static enum XML_Convert_Result PTRCALL
571 latin1_toUtf16(const ENCODING *UNUSED_P(enc),
572 const char **fromP, const char *fromLim,
573 unsigned short **toP, const unsigned short *toLim)
575 while (*fromP < fromLim && *toP < toLim)
576 *(*toP)++ = (unsigned char)*(*fromP)++;
/* Stopped because the destination filled up before the source ran out. */
578 if ((*toP == toLim) && (*fromP < fromLim))
579 return XML_CONVERT_OUTPUT_EXHAUSTED;
581 return XML_CONVERT_COMPLETED;
586 static const struct normal_encoding latin1_encoding_ns = {
587 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
589 #include "asciitab.h"
590 #include "latin1tab.h"
592 STANDARD_VTABLE(sb_) NULL_VTABLE
597 static const struct normal_encoding latin1_encoding = {
598 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
600 #define BT_COLON BT_NMSTRT
601 #include "asciitab.h"
603 #include "latin1tab.h"
605 STANDARD_VTABLE(sb_) NULL_VTABLE
/* Copies input bytes to the output unchanged, advancing both cursors until
 * either the input or the output limit is reached. Returns
 * XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input remains,
 * otherwise XML_CONVERT_COMPLETED. The enc argument is unused. */
608 static enum XML_Convert_Result PTRCALL
609 ascii_toUtf8(const ENCODING *UNUSED_P(enc),
610 const char **fromP, const char *fromLim,
611 char **toP, const char *toLim)
613 while (*fromP < fromLim && *toP < toLim)
614 *(*toP)++ = *(*fromP)++;
/* Stopped because the destination filled up before the source ran out. */
616 if ((*toP == toLim) && (*fromP < fromLim))
617 return XML_CONVERT_OUTPUT_EXHAUSTED;
619 return XML_CONVERT_COMPLETED;
624 static const struct normal_encoding ascii_encoding_ns = {
625 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
627 #include "asciitab.h"
630 STANDARD_VTABLE(sb_) NULL_VTABLE
635 static const struct normal_encoding ascii_encoding = {
636 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
638 #define BT_COLON BT_NMSTRT
639 #include "asciitab.h"
643 STANDARD_VTABLE(sb_) NULL_VTABLE
646 static int PTRFASTCALL
647 unicode_byte_type(char hi, char lo)
649 switch ((unsigned char)hi) {
650 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
652 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
655 switch ((unsigned char)lo) {
665 #define DEFINE_UTF16_TO_UTF8(E) \
666 static enum XML_Convert_Result PTRCALL \
667 E ## toUtf8(const ENCODING *UNUSED_P(enc), \
668 const char **fromP, const char *fromLim, \
669 char **toP, const char *toLim) \
671 const char *from = *fromP; \
672 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \
673 for (; from < fromLim; from += 2) { \
676 unsigned char lo = GET_LO(from); \
677 unsigned char hi = GET_HI(from); \
681 if (*toP == toLim) { \
683 return XML_CONVERT_OUTPUT_EXHAUSTED; \
689 case 0x1: case 0x2: case 0x3: \
690 case 0x4: case 0x5: case 0x6: case 0x7: \
691 if (toLim - *toP < 2) { \
693 return XML_CONVERT_OUTPUT_EXHAUSTED; \
695 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
696 *(*toP)++ = ((lo & 0x3f) | 0x80); \
699 if (toLim - *toP < 3) { \
701 return XML_CONVERT_OUTPUT_EXHAUSTED; \
703 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
704 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
705 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
706 *(*toP)++ = ((lo & 0x3f) | 0x80); \
708 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
709 if (toLim - *toP < 4) { \
711 return XML_CONVERT_OUTPUT_EXHAUSTED; \
713 if (fromLim - from < 4) { \
715 return XML_CONVERT_INPUT_INCOMPLETE; \
717 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
718 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
719 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
721 lo2 = GET_LO(from); \
722 *(*toP)++ = (((lo & 0x3) << 4) \
723 | ((GET_HI(from) & 0x3) << 2) \
726 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
731 if (from < fromLim) \
732 return XML_CONVERT_INPUT_INCOMPLETE; \
734 return XML_CONVERT_COMPLETED; \
737 #define DEFINE_UTF16_TO_UTF16(E) \
738 static enum XML_Convert_Result PTRCALL \
739 E ## toUtf16(const ENCODING *UNUSED_P(enc), \
740 const char **fromP, const char *fromLim, \
741 unsigned short **toP, const unsigned short *toLim) \
743 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
744 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
745 /* Avoid copying first half only of surrogate */ \
746 if (fromLim - *fromP > ((toLim - *toP) << 1) \
747 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
749 res = XML_CONVERT_INPUT_INCOMPLETE; \
751 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
752 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
753 if ((*toP == toLim) && (*fromP < fromLim)) \
754 return XML_CONVERT_OUTPUT_EXHAUSTED; \
759 #define SET2(ptr, ch) \
760 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
761 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
762 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
764 DEFINE_UTF16_TO_UTF8(little2_)
765 DEFINE_UTF16_TO_UTF16(little2_)
771 #define SET2(ptr, ch) \
772 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
773 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
774 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
776 DEFINE_UTF16_TO_UTF8(big2_)
777 DEFINE_UTF16_TO_UTF16(big2_)
783 #define LITTLE2_BYTE_TYPE(enc, p) \
785 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
786 : unicode_byte_type((p)[1], (p)[0]))
787 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
788 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
789 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
790 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
791 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
792 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
796 static int PTRFASTCALL
797 little2_byteType(const ENCODING *enc, const char *p)
799 return LITTLE2_BYTE_TYPE(enc, p);
802 static int PTRFASTCALL
803 little2_byteToAscii(const ENCODING *enc, const char *p)
805 return LITTLE2_BYTE_TO_ASCII(enc, p);
809 little2_charMatches(const ENCODING *enc, const char *p, int c)
811 return LITTLE2_CHAR_MATCHES(enc, p, c);
814 static int PTRFASTCALL
815 little2_isNameMin(const ENCODING *enc, const char *p)
817 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
820 static int PTRFASTCALL
821 little2_isNmstrtMin(const ENCODING *enc, const char *p)
823 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
827 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
829 #else /* not XML_MIN_SIZE */
832 #define PREFIX(ident) little2_ ## ident
833 #define MINBPC(enc) 2
834 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
835 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
836 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
837 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
838 #define IS_NAME_CHAR(enc, p, n) 0
839 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
840 #define IS_NMSTRT_CHAR(enc, p, n) (0)
841 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
843 #define XML_TOK_IMPL_C
844 #include "xmltok_impl.c"
845 #undef XML_TOK_IMPL_C
852 #undef IS_NAME_CHAR_MINBPC
853 #undef IS_NMSTRT_CHAR
854 #undef IS_NMSTRT_CHAR_MINBPC
855 #undef IS_INVALID_CHAR
857 #endif /* not XML_MIN_SIZE */
861 static const struct normal_encoding little2_encoding_ns = {
863 #if BYTEORDER == 1234
870 #include "asciitab.h"
871 #include "latin1tab.h"
873 STANDARD_VTABLE(little2_) NULL_VTABLE
878 static const struct normal_encoding little2_encoding = {
880 #if BYTEORDER == 1234
887 #define BT_COLON BT_NMSTRT
888 #include "asciitab.h"
890 #include "latin1tab.h"
892 STANDARD_VTABLE(little2_) NULL_VTABLE
895 #if BYTEORDER != 4321
899 static const struct normal_encoding internal_little2_encoding_ns = {
902 #include "iasciitab.h"
903 #include "latin1tab.h"
905 STANDARD_VTABLE(little2_) NULL_VTABLE
910 static const struct normal_encoding internal_little2_encoding = {
913 #define BT_COLON BT_NMSTRT
914 #include "iasciitab.h"
916 #include "latin1tab.h"
918 STANDARD_VTABLE(little2_) NULL_VTABLE
924 #define BIG2_BYTE_TYPE(enc, p) \
926 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
927 : unicode_byte_type((p)[0], (p)[1]))
928 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
929 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
930 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
931 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
932 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
933 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
937 static int PTRFASTCALL
938 big2_byteType(const ENCODING *enc, const char *p)
940 return BIG2_BYTE_TYPE(enc, p);
943 static int PTRFASTCALL
944 big2_byteToAscii(const ENCODING *enc, const char *p)
946 return BIG2_BYTE_TO_ASCII(enc, p);
950 big2_charMatches(const ENCODING *enc, const char *p, int c)
952 return BIG2_CHAR_MATCHES(enc, p, c);
955 static int PTRFASTCALL
956 big2_isNameMin(const ENCODING *enc, const char *p)
958 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
961 static int PTRFASTCALL
962 big2_isNmstrtMin(const ENCODING *enc, const char *p)
964 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
968 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
970 #else /* not XML_MIN_SIZE */
973 #define PREFIX(ident) big2_ ## ident
974 #define MINBPC(enc) 2
975 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
976 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
977 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
978 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
979 #define IS_NAME_CHAR(enc, p, n) 0
980 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
981 #define IS_NMSTRT_CHAR(enc, p, n) (0)
982 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
984 #define XML_TOK_IMPL_C
985 #include "xmltok_impl.c"
986 #undef XML_TOK_IMPL_C
993 #undef IS_NAME_CHAR_MINBPC
994 #undef IS_NMSTRT_CHAR
995 #undef IS_NMSTRT_CHAR_MINBPC
996 #undef IS_INVALID_CHAR
998 #endif /* not XML_MIN_SIZE */
1002 static const struct normal_encoding big2_encoding_ns = {
1004 #if BYTEORDER == 4321
1011 #include "asciitab.h"
1012 #include "latin1tab.h"
1014 STANDARD_VTABLE(big2_) NULL_VTABLE
1019 static const struct normal_encoding big2_encoding = {
1021 #if BYTEORDER == 4321
1028 #define BT_COLON BT_NMSTRT
1029 #include "asciitab.h"
1031 #include "latin1tab.h"
1033 STANDARD_VTABLE(big2_) NULL_VTABLE
1036 #if BYTEORDER != 1234
1040 static const struct normal_encoding internal_big2_encoding_ns = {
1041 { VTABLE, 2, 0, 1 },
1043 #include "iasciitab.h"
1044 #include "latin1tab.h"
1046 STANDARD_VTABLE(big2_) NULL_VTABLE
1051 static const struct normal_encoding internal_big2_encoding = {
1052 { VTABLE, 2, 0, 1 },
1054 #define BT_COLON BT_NMSTRT
1055 #include "iasciitab.h"
1057 #include "latin1tab.h"
1059 STANDARD_VTABLE(big2_) NULL_VTABLE
1067 streqci(const char *s1, const char *s2)
1072 if (ASCII_a <= c1 && c1 <= ASCII_z)
1073 c1 += ASCII_A - ASCII_a;
1074 if (ASCII_a <= c2 && c2 <= ASCII_z)
1075 /* The following line will never get executed. streqci() is
1076 * only called from two places, both of which guarantee to put
1077 * upper-case strings into s2.
1079 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1089 initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
1090 const char *end, POSITION *pos)
1092 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1096 toAscii(const ENCODING *enc, const char *ptr, const char *end)
1100 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1120 /* Return 1 if there's just optional white space or there's an S
1121 followed by name=val.
1124 parsePseudoAttribute(const ENCODING *enc,
1127 const char **namePtr,
1128 const char **nameEndPtr,
1129 const char **valPtr,
1130 const char **nextTokPtr)
1138 if (!isSpace(toAscii(enc, ptr, end))) {
1143 ptr += enc->minBytesPerChar;
1144 } while (isSpace(toAscii(enc, ptr, end)));
1151 c = toAscii(enc, ptr, end);
1156 if (c == ASCII_EQUALS) {
1163 ptr += enc->minBytesPerChar;
1164 } while (isSpace(c = toAscii(enc, ptr, end)));
1165 if (c != ASCII_EQUALS) {
1171 ptr += enc->minBytesPerChar;
1173 if (ptr == *namePtr) {
1177 ptr += enc->minBytesPerChar;
1178 c = toAscii(enc, ptr, end);
1179 while (isSpace(c)) {
1180 ptr += enc->minBytesPerChar;
1181 c = toAscii(enc, ptr, end);
1183 if (c != ASCII_QUOT && c != ASCII_APOS) {
1188 ptr += enc->minBytesPerChar;
1190 for (;; ptr += enc->minBytesPerChar) {
1191 c = toAscii(enc, ptr, end);
1194 if (!(ASCII_a <= c && c <= ASCII_z)
1195 && !(ASCII_A <= c && c <= ASCII_Z)
1196 && !(ASCII_0 <= c && c <= ASCII_9)
1197 && c != ASCII_PERIOD
1199 && c != ASCII_UNDERSCORE) {
1204 *nextTokPtr = ptr + enc->minBytesPerChar;
1208 static const char KW_version[] = {
1209 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1212 static const char KW_encoding[] = {
1213 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1216 static const char KW_standalone[] = {
1217 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1218 ASCII_n, ASCII_e, '\0'
1221 static const char KW_yes[] = {
1222 ASCII_y, ASCII_e, ASCII_s, '\0'
1225 static const char KW_no[] = {
1226 ASCII_n, ASCII_o, '\0'
1230 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1233 int isGeneralTextEntity,
1234 const ENCODING *enc,
1237 const char **badPtr,
1238 const char **versionPtr,
1239 const char **versionEndPtr,
1240 const char **encodingName,
1241 const ENCODING **encoding,
1244 const char *val = NULL;
1245 const char *name = NULL;
1246 const char *nameEnd = NULL;
1247 ptr += 5 * enc->minBytesPerChar;
1248 end -= 2 * enc->minBytesPerChar;
1249 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1254 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1255 if (!isGeneralTextEntity) {
1264 *versionEndPtr = ptr;
1265 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1270 if (isGeneralTextEntity) {
1271 /* a TextDecl must have an EncodingDecl */
1278 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1279 int c = toAscii(enc, val, end);
1280 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1285 *encodingName = val;
1287 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1288 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1295 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1296 || isGeneralTextEntity) {
1300 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1304 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1312 while (isSpace(toAscii(enc, ptr, end)))
1313 ptr += enc->minBytesPerChar;
/* Screens a character number by its high byte (result >> 8). Values whose
 * high byte is 0xD8..0xDF, values whose latin1_encoding type entry is
 * BT_NONXML, and the values 0xFFFE / 0xFFFF are each singled out.
 * NOTE(review): the statements taken in each of those cases are not visible
 * in this excerpt; callers in this file treat a negative return as
 * "rejected" -- confirm the exact return values against the full source. */
1322 checkCharRefNumber(int result)
1324 switch (result >> 8) {
/* High byte in the 0xD8..0xDF block. */
1325 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1326 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
/* Consult the Latin-1 byte-type table for this value. */
1329 if (latin1_encoding.type[result] == BT_NONXML)
1333 if (result == 0xFFFE || result == 0xFFFF)
/* Writes the UTF-8 encoding of c into buf, using a one-, two-, three- or
 * four-byte form. Every byte after the first carries six payload bits tagged
 * with 0x80; the first byte carries the remaining high bits tagged with the
 * matching UTF8_cvalN constant. Two paths return 0.
 * NOTE(review): the range tests that pick a form and the nonzero return
 * values are not visible in this excerpt; callers in this file compare the
 * result with the remaining output space, so it is presumably the number of
 * bytes written -- confirm against the full source. */
1341 XmlUtf8Encode(int c, char *buf)
1344 /* minN is minimum legal resulting value for N byte sequence */
1351 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
/* One-byte form. */
1353 buf[0] = (char)(c | UTF8_cval1);
/* Two-byte form. */
1357 buf[0] = (char)((c >> 6) | UTF8_cval2);
1358 buf[1] = (char)((c & 0x3f) | 0x80);
/* Three-byte form. */
1362 buf[0] = (char)((c >> 12) | UTF8_cval3);
1363 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1364 buf[2] = (char)((c & 0x3f) | 0x80);
/* Four-byte form. */
1368 buf[0] = (char)((c >> 18) | UTF8_cval4);
1369 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1370 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1371 buf[3] = (char)((c & 0x3f) | 0x80);
1374 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
/* Writes charNum into buf as 16-bit units: values below 0x10000 are stored
 * directly in buf[0]; values below 0x110000 are split into two units built
 * from the upper bits (offset by 0xD800) and the low ten bits (offset by
 * 0xDC00).
 * NOTE(review): return values, the handling of other inputs, and any
 * adjustment of charNum before the two-unit split are not visible in this
 * excerpt -- confirm against the full source. */
1378 XmlUtf16Encode(int charNum, unsigned short *buf)
/* Fits in a single unit. */
1382 if (charNum < 0x10000) {
1383 buf[0] = (unsigned short)charNum;
/* Needs two units. */
1386 if (charNum < 0x110000) {
1388 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1389 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1395 struct unknown_encoding {
1396 struct normal_encoding normal;
1399 unsigned short utf16[256];
1403 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1406 XmlSizeOfUnknownEncoding(void)
1408 return sizeof(struct unknown_encoding);
/* Runs the encoding's convert callback on p and looks the resulting value
 * up in the namePages table, using its high byte as the page index and its
 * low byte as the bit selector; nonzero when the bit is set.
 * NOTE(review): lines between the conversion and the lookup are not visible
 * in this excerpt; presumably c is range-checked first -- confirm. */
1411 static int PTRFASTCALL
1412 unknown_isName(const ENCODING *enc, const char *p)
1414 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1415 int c = uenc->convert(uenc->userData, p);
1418 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
/* Runs the encoding's convert callback on p and looks the resulting value
 * up in the nmstrtPages table, using its high byte as the page index and its
 * low byte as the bit selector; nonzero when the bit is set.
 * NOTE(review): lines between the conversion and the lookup are not visible
 * in this excerpt; presumably c is range-checked first -- confirm. */
1421 static int PTRFASTCALL
1422 unknown_isNmstrt(const ENCODING *enc, const char *p)
1424 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1425 int c = uenc->convert(uenc->userData, p);
1428 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
/* Runs the encoding's convert callback on p and reports the sequence as
 * invalid (nonzero) when the converted value has any bit set above the low
 * 16, or when checkCharRefNumber returns a negative result for it. */
1431 static int PTRFASTCALL
1432 unknown_isInvalid(const ENCODING *enc, const char *p)
1434 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1435 int c = uenc->convert(uenc->userData, p);
1436 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
/* Converts input in an unknown (table-described) encoding to UTF-8.
 * For each input position the per-byte utf8 table entry is fetched; on one
 * path the convert callback and XmlUtf8Encode produce the bytes in a local
 * buffer instead. Returns XML_CONVERT_COMPLETED once *fromP reaches fromLim
 * and XML_CONVERT_OUTPUT_EXHAUSTED when n bytes do not fit before toLim.
 * NOTE(review): the enclosing loop, the condition choosing between the two
 * paths, how n is obtained on the table path, and the cursor updates after
 * the copy are not visible in this excerpt. */
1439 static enum XML_Convert_Result PTRCALL
1440 unknown_toUtf8(const ENCODING *enc,
1441 const char **fromP, const char *fromLim,
1442 char **toP, const char *toLim)
1444 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
/* Scratch space for a character encoded via XmlUtf8Encode. */
1445 char buf[XML_UTF8_ENCODE_MAX];
/* All input consumed. */
1449 if (*fromP == fromLim)
1450 return XML_CONVERT_COMPLETED;
/* Table entry for the current input byte. */
1451 utf8 = uenc->utf8[(unsigned char)**fromP];
/* Callback path: convert the sequence at *fromP and encode it into buf. */
1454 int c = uenc->convert(uenc->userData, *fromP);
1455 n = XmlUtf8Encode(c, buf);
1456 if (n > toLim - *toP)
1457 return XML_CONVERT_OUTPUT_EXHAUSTED;
/* Advance by an amount derived from the byte-type table entry (the rest of
 * this expression is not visible in this excerpt). */
1459 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1463 if (n > toLim - *toP)
1464 return XML_CONVERT_OUTPUT_EXHAUSTED;
/* Copy the n prepared bytes to the output. */
1467 memcpy(*toP, utf8, n);
/* Converts input in an unknown (table-described) encoding to 16-bit units.
 * While both input and output remain, the per-byte utf16 table entry for the
 * current byte is fetched; on one path the value comes from the convert
 * callback instead, truncated to unsigned short. Returns
 * XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input remains,
 * otherwise XML_CONVERT_COMPLETED.
 * NOTE(review): the condition selecting the callback path, the store of c to
 * the output and the single-byte advance are not visible in this excerpt. */
1472 static enum XML_Convert_Result PTRCALL
1473 unknown_toUtf16(const ENCODING *enc,
1474 const char **fromP, const char *fromLim,
1475 unsigned short **toP, const unsigned short *toLim)
1477 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1478 while (*fromP < fromLim && *toP < toLim) {
/* Table entry for the current input byte. */
1479 unsigned short c = uenc->utf16[(unsigned char)**fromP];
/* Callback path: take the converted value and advance by an amount derived
 * from the byte-type table entry (rest of the expression not visible). */
1481 c = (unsigned short)
1482 uenc->convert(uenc->userData, *fromP);
1483 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
/* Stopped because the destination filled up before the source ran out. */
1491 if ((*toP == toLim) && (*fromP < fromLim))
1492 return XML_CONVERT_OUTPUT_EXHAUSTED;
1494 return XML_CONVERT_COMPLETED;
/* Set up a struct unknown_encoding in the caller-supplied buffer `mem`
   and return a pointer to the ENCODING embedded in it (e->normal.enc).
   Per-byte type, utf8 and utf16 tables are filled for all 256 byte
   values, and the caller's convert callback and userData are stored for
   later use. */
1498 XmlInitUnknownEncoding(void *mem,
1504 struct unknown_encoding *e = (struct unknown_encoding *)mem;
/* Start from a byte-for-byte copy of latin1_encoding's normal_encoding
   part. */
1505 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1506 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
/* Check the 128 ASCII entries of the Latin-1 type table, skipping those
   typed BT_OTHER or BT_NONXML. */
1507 for (i = 0; i < 128; i++)
1508 if (latin1_encoding.type[i] != BT_OTHER
1509 && latin1_encoding.type[i] != BT_NONXML
/* Build the type, utf8 and utf16 entries for every byte value. */
1512 for (i = 0; i < 256; i++) {
/* Byte i is typed as malformed; its utf16 slot gets the 0xFFFF
   placeholder. */
1515 e->normal.type[i] = BT_MALFORM;
1516 /* This shouldn't really get used. */
1517 e->utf16[i] = 0xFFFF;
1524 /* Multi-byte sequences need a converter function */
/* Lead-byte type is BT_LEAD2 - (c + 2); presumably c is the negated
   sequence length here -- confirm against the guarding condition. */
1527 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
/* c below 0x80: reuse the Latin-1 type of code point c. */
1531 else if (c < 0x80) {
1532 if (latin1_encoding.type[c] != BT_OTHER
1533 && latin1_encoding.type[c] != BT_NONXML
1536 e->normal.type[i] = latin1_encoding.type[c];
/* The single UTF-8 byte is stored at utf8[i][1]; a mapping to code
   point 0 is recorded as 0xFFFF in the utf16 table. */
1538 e->utf8[i][1] = (char)c;
1539 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
/* checkCharRefNumber(c) is negative: type the byte as BT_NONXML. */
1541 else if (checkCharRefNumber(c) < 0) {
1542 e->normal.type[i] = BT_NONXML;
1543 /* This shouldn't really get used. */
1544 e->utf16[i] = 0xFFFF;
/* Classify c through the nmstrtPages / namePages bitmaps: name-start,
   name, or other. */
1551 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1552 e->normal.type[i] = BT_NMSTRT;
1553 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1554 e->normal.type[i] = BT_NAME;
1556 e->normal.type[i] = BT_OTHER;
/* utf8[i][0] holds the XmlUtf8Encode result; the encoded bytes start at
   utf8[i][1]. */
1557 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1558 e->utf16[i] = (unsigned short)c;
/* Keep the caller's conversion callback and its context. */
1561 e->userData = userData;
1562 e->convert = convert;
/* The same unknown_* predicate serves the 2-, 3- and 4-byte slots. */
1564 e->normal.isName2 = unknown_isName;
1565 e->normal.isName3 = unknown_isName;
1566 e->normal.isName4 = unknown_isName;
1567 e->normal.isNmstrt2 = unknown_isNmstrt;
1568 e->normal.isNmstrt3 = unknown_isNmstrt;
1569 e->normal.isNmstrt4 = unknown_isNmstrt;
1570 e->normal.isInvalid2 = unknown_isInvalid;
1571 e->normal.isInvalid3 = unknown_isInvalid;
1572 e->normal.isInvalid4 = unknown_isInvalid;
/* Install the UTF-8 and UTF-16 converters for unknown encodings. */
1574 e->normal.enc.utf8Convert = unknown_toUtf8;
1575 e->normal.enc.utf16Convert = unknown_toUtf16;
1576 return &(e->normal.enc);
1579 /* If this enumeration is changed, getEncodingIndex and encodings
1580 must also be changed. */
1589 /* must match encodingNames up to here */
/* Encoding-name keywords, spelled out with the ASCII_* character
   constants and terminated with '\0'. */
1593 static const char KW_ISO_8859_1[] = {
1594 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1595 ASCII_MINUS, ASCII_1, '\0'
1597 static const char KW_US_ASCII[] = {
1598 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1601 static const char KW_UTF_8[] = {
1602 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1604 static const char KW_UTF_16[] = {
1605 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1607 static const char KW_UTF_16BE[] = {
1608 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1611 static const char KW_UTF_16LE[] = {
1612 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
/* Look `name` up in the function-local encodingNames table, comparing
   against each entry in index order with streqci (presumably a
   case-insensitive string compare -- confirm at its definition). */
1617 getEncodingIndex(const char *name)
1619 static const char * const encodingNames[] = {
/* Entry count is derived from the array itself, so the loop follows the
   table size. */
1630 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1631 if (streqci(name, encodingNames[i]))
1636 /* For binary compatibility, we store the index of the encoding
1637 specified at initialization in the isUtf16 member.
/* INIT_ENC_INDEX reads the encoding index kept in initEnc.isUtf16 and
   widens it to int. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* SET_INIT_ENC_INDEX stores index i, narrowed to char, in
   initEnc.isUtf16.  The argument is parenthesised so that an expression
   argument is cast as a whole rather than only its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1643 /* This is what detects the encoding. encodingTable maps from
1644 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1645 the external (protocol) specified encoding; state is
1646 XML_CONTENT_STATE if we're parsing an external text entity, and
1647 XML_PROLOG_STATE otherwise.
1652 initScan(const ENCODING * const *encodingTable,
1653 const INIT_ENCODING *enc,
1657 const char **nextTokPtr)
1659 const ENCODING **encPtr;
1662 return XML_TOK_NONE;
1663 encPtr = enc->encPtr;
1664 if (ptr + 1 == end) {
1665 /* only a single byte available for auto-detection */
1666 #ifndef XML_DTD /* FIXME */
1667 /* a well-formed document entity must have more than one byte */
1668 if (state != XML_CONTENT_STATE)
1669 return XML_TOK_PARTIAL;
1671 /* so we're parsing an external text entity... */
1672 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1673 switch (INIT_ENC_INDEX(enc)) {
1677 return XML_TOK_PARTIAL;
1679 switch ((unsigned char)*ptr) {
1682 case 0xEF: /* possibly first byte of UTF-8 BOM */
1683 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1684 && state == XML_CONTENT_STATE)
1689 return XML_TOK_PARTIAL;
1693 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1695 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1696 && state == XML_CONTENT_STATE)
1698 *nextTokPtr = ptr + 2;
1699 *encPtr = encodingTable[UTF_16BE_ENC];
1701 /* 00 3C is handled in the default case */
1703 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1704 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1705 && state == XML_CONTENT_STATE)
1707 *encPtr = encodingTable[UTF_16LE_ENC];
1708 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1710 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1711 && state == XML_CONTENT_STATE)
1713 *nextTokPtr = ptr + 2;
1714 *encPtr = encodingTable[UTF_16LE_ENC];
1717 /* Maybe a UTF-8 BOM (EF BB BF) */
1718 /* If there's an explicitly specified (external) encoding
1719 of ISO-8859-1 or some flavour of UTF-16
1720 and this is an external text entity,
1721 don't look for the BOM,
1722 because it might be legal data.
1724 if (state == XML_CONTENT_STATE) {
1725 int e = INIT_ENC_INDEX(enc);
1726 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1727 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1731 return XML_TOK_PARTIAL;
1732 if ((unsigned char)ptr[2] == 0xBF) {
1733 *nextTokPtr = ptr + 3;
1734 *encPtr = encodingTable[UTF_8_ENC];
1739 if (ptr[0] == '\0') {
1740 /* 0 isn't a legal data character. Furthermore a document
1741 entity can only start with ASCII characters. So the only
1742 way this can fail to be big-endian UTF-16 is if it's an
1743 external parsed general entity that's labelled as
1746 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1748 *encPtr = encodingTable[UTF_16BE_ENC];
1749 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1751 else if (ptr[1] == '\0') {
1752 /* We could recover here in the case:
1753 - parsing an external entity
1755 - no externally specified encoding
1756 - no encoding declaration
1757 by assuming UTF-16LE. But we don't, because this would mean when
1758 presented just with a single byte, we couldn't reliably determine
1759 whether we needed further bytes.
1761 if (state == XML_CONTENT_STATE)
1763 *encPtr = encodingTable[UTF_16LE_ENC];
1764 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1769 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1770 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1776 #define XML_TOK_NS_C
1777 #include "xmltok_ns.c"
1784 #define NS(x) x ## NS
1785 #define ns(x) x ## _ns
1787 #define XML_TOK_NS_C
1788 #include "xmltok_ns.c"
1795 XmlInitUnknownEncodingNS(void *mem,
1800 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1802 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;