1 /* This file is included!
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
35 #ifndef IS_INVALID_CHAR
36 #define IS_INVALID_CHAR(enc, ptr, n) (0)
39 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
42 return XML_TOK_PARTIAL_CHAR; \
43 if (IS_INVALID_CHAR(enc, ptr, n)) { \
44 *(nextTokPtr) = (ptr); \
45 return XML_TOK_INVALID; \
50 #define INVALID_CASES(ptr, nextTokPtr) \
51 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
52 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
53 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
57 *(nextTokPtr) = (ptr); \
58 return XML_TOK_INVALID;
60 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
63 return XML_TOK_PARTIAL_CHAR; \
64 if (!IS_NAME_CHAR(enc, ptr, n)) { \
66 return XML_TOK_INVALID; \
71 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
73 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
75 return XML_TOK_INVALID; \
85 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
86 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
87 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
89 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
92 return XML_TOK_PARTIAL_CHAR; \
93 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
95 return XML_TOK_INVALID; \
100 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
102 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
104 return XML_TOK_INVALID; \
109 ptr += MINBPC(enc); \
111 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
112 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
113 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
/* Name-decoration hook: this definition expands PREFIX(ident) to ident
   unchanged.  Every function in this file is named through it, e.g.
   PREFIX(scanComment). */
116 #define PREFIX(ident) ident
/* True when at least `count` minimum-width characters (MINBPC(enc) bytes
   each) lie between ptr and end.  Every argument is parenthesized so that
   a compound expression passed as ptr, end or count (for example `n + 1`)
   keeps its intended grouping inside the expansion. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-width character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)
126 #define REQUIRE_CHARS(enc, ptr, end, count) \
128 if (! HAS_CHARS(enc, ptr, end, count)) { \
129 return XML_TOK_PARTIAL; \
/* Single-character form of REQUIRE_CHARS: expands to
   REQUIRE_CHARS(enc, ptr, end, 1).  As far as REQUIRE_CHARS is visible
   here, that makes the enclosing function return XML_TOK_PARTIAL when
   HAS_CHARS reports fewer than one character between ptr and end. */
133 #define REQUIRE_CHAR(enc, ptr, end) \
134 REQUIRE_CHARS(enc, ptr, end, 1)
137 /* ptr points to character following "<!-" */
140 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
141 const char *end, const char **nextTokPtr)
143 if (HAS_CHAR(enc, ptr, end)) {
144 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
146 return XML_TOK_INVALID;
149 while (HAS_CHAR(enc, ptr, end)) {
150 switch (BYTE_TYPE(enc, ptr)) {
151 INVALID_CASES(ptr, nextTokPtr)
154 REQUIRE_CHAR(enc, ptr, end);
155 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
157 REQUIRE_CHAR(enc, ptr, end);
158 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
160 return XML_TOK_INVALID;
162 *nextTokPtr = ptr + MINBPC(enc);
163 return XML_TOK_COMMENT;
172 return XML_TOK_PARTIAL;
175 /* ptr points to character following "<!" */
178 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
179 const char *end, const char **nextTokPtr)
181 REQUIRE_CHAR(enc, ptr, end);
182 switch (BYTE_TYPE(enc, ptr)) {
184 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
186 *nextTokPtr = ptr + MINBPC(enc);
187 return XML_TOK_COND_SECT_OPEN;
194 return XML_TOK_INVALID;
196 while (HAS_CHAR(enc, ptr, end)) {
197 switch (BYTE_TYPE(enc, ptr)) {
199 REQUIRE_CHARS(enc, ptr, end, 2);
200 /* don't allow <!ENTITY% foo "whatever"> */
201 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
202 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
204 return XML_TOK_INVALID;
207 case BT_S: case BT_CR: case BT_LF:
209 return XML_TOK_DECL_OPEN;
216 return XML_TOK_INVALID;
219 return XML_TOK_PARTIAL;
223 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
224 const char *end, int *tokPtr)
227 *tokPtr = XML_TOK_PI;
228 if (end - ptr != MINBPC(enc)*3)
230 switch (BYTE_TO_ASCII(enc, ptr)) {
240 switch (BYTE_TO_ASCII(enc, ptr)) {
250 switch (BYTE_TO_ASCII(enc, ptr)) {
261 *tokPtr = XML_TOK_XML_DECL;
265 /* ptr points to character following "<?" */
268 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
269 const char *end, const char **nextTokPtr)
272 const char *target = ptr;
273 REQUIRE_CHAR(enc, ptr, end);
274 switch (BYTE_TYPE(enc, ptr)) {
275 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
278 return XML_TOK_INVALID;
280 while (HAS_CHAR(enc, ptr, end)) {
281 switch (BYTE_TYPE(enc, ptr)) {
282 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
283 case BT_S: case BT_CR: case BT_LF:
284 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
286 return XML_TOK_INVALID;
289 while (HAS_CHAR(enc, ptr, end)) {
290 switch (BYTE_TYPE(enc, ptr)) {
291 INVALID_CASES(ptr, nextTokPtr)
294 REQUIRE_CHAR(enc, ptr, end);
295 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
296 *nextTokPtr = ptr + MINBPC(enc);
305 return XML_TOK_PARTIAL;
307 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
309 return XML_TOK_INVALID;
312 REQUIRE_CHAR(enc, ptr, end);
313 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
314 *nextTokPtr = ptr + MINBPC(enc);
320 return XML_TOK_INVALID;
323 return XML_TOK_PARTIAL;
327 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
328 const char *end, const char **nextTokPtr)
330 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
331 ASCII_T, ASCII_A, ASCII_LSQB };
334 REQUIRE_CHARS(enc, ptr, end, 6);
335 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
336 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
338 return XML_TOK_INVALID;
342 return XML_TOK_CDATA_SECT_OPEN;
346 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
347 const char *end, const char **nextTokPtr)
351 if (MINBPC(enc) > 1) {
352 size_t n = end - ptr;
353 if (n & (MINBPC(enc) - 1)) {
354 n &= ~(MINBPC(enc) - 1);
356 return XML_TOK_PARTIAL;
360 switch (BYTE_TYPE(enc, ptr)) {
363 REQUIRE_CHAR(enc, ptr, end);
364 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
367 REQUIRE_CHAR(enc, ptr, end);
368 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
372 *nextTokPtr = ptr + MINBPC(enc);
373 return XML_TOK_CDATA_SECT_CLOSE;
376 REQUIRE_CHAR(enc, ptr, end);
377 if (BYTE_TYPE(enc, ptr) == BT_LF)
380 return XML_TOK_DATA_NEWLINE;
382 *nextTokPtr = ptr + MINBPC(enc);
383 return XML_TOK_DATA_NEWLINE;
384 INVALID_CASES(ptr, nextTokPtr)
389 while (HAS_CHAR(enc, ptr, end)) {
390 switch (BYTE_TYPE(enc, ptr)) {
391 #define LEAD_CASE(n) \
393 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
395 return XML_TOK_DATA_CHARS; \
399 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
408 return XML_TOK_DATA_CHARS;
415 return XML_TOK_DATA_CHARS;
418 /* ptr points to character following "</" */
421 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
422 const char *end, const char **nextTokPtr)
424 REQUIRE_CHAR(enc, ptr, end);
425 switch (BYTE_TYPE(enc, ptr)) {
426 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
429 return XML_TOK_INVALID;
431 while (HAS_CHAR(enc, ptr, end)) {
432 switch (BYTE_TYPE(enc, ptr)) {
433 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
434 case BT_S: case BT_CR: case BT_LF:
435 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
436 switch (BYTE_TYPE(enc, ptr)) {
437 case BT_S: case BT_CR: case BT_LF:
440 *nextTokPtr = ptr + MINBPC(enc);
441 return XML_TOK_END_TAG;
444 return XML_TOK_INVALID;
447 return XML_TOK_PARTIAL;
450 /* no need to check qname syntax here,
451 since end-tag must match exactly */
456 *nextTokPtr = ptr + MINBPC(enc);
457 return XML_TOK_END_TAG;
460 return XML_TOK_INVALID;
463 return XML_TOK_PARTIAL;
466 /* ptr points to character following "&#X" */
469 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
470 const char *end, const char **nextTokPtr)
472 if (HAS_CHAR(enc, ptr, end)) {
473 switch (BYTE_TYPE(enc, ptr)) {
479 return XML_TOK_INVALID;
481 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
482 switch (BYTE_TYPE(enc, ptr)) {
487 *nextTokPtr = ptr + MINBPC(enc);
488 return XML_TOK_CHAR_REF;
491 return XML_TOK_INVALID;
495 return XML_TOK_PARTIAL;
498 /* ptr points to character following "&#" */
501 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
502 const char *end, const char **nextTokPtr)
504 if (HAS_CHAR(enc, ptr, end)) {
505 if (CHAR_MATCHES(enc, ptr, ASCII_x))
506 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
507 switch (BYTE_TYPE(enc, ptr)) {
512 return XML_TOK_INVALID;
514 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
515 switch (BYTE_TYPE(enc, ptr)) {
519 *nextTokPtr = ptr + MINBPC(enc);
520 return XML_TOK_CHAR_REF;
523 return XML_TOK_INVALID;
527 return XML_TOK_PARTIAL;
530 /* ptr points to character following "&" */
533 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
534 const char **nextTokPtr)
536 REQUIRE_CHAR(enc, ptr, end);
537 switch (BYTE_TYPE(enc, ptr)) {
538 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
540 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
543 return XML_TOK_INVALID;
545 while (HAS_CHAR(enc, ptr, end)) {
546 switch (BYTE_TYPE(enc, ptr)) {
547 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
549 *nextTokPtr = ptr + MINBPC(enc);
550 return XML_TOK_ENTITY_REF;
553 return XML_TOK_INVALID;
556 return XML_TOK_PARTIAL;
559 /* ptr points to character following first character of attribute name */
562 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563 const char **nextTokPtr)
568 while (HAS_CHAR(enc, ptr, end)) {
569 switch (BYTE_TYPE(enc, ptr)) {
570 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
575 return XML_TOK_INVALID;
579 REQUIRE_CHAR(enc, ptr, end);
580 switch (BYTE_TYPE(enc, ptr)) {
581 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
584 return XML_TOK_INVALID;
588 case BT_S: case BT_CR: case BT_LF:
593 REQUIRE_CHAR(enc, ptr, end);
594 t = BYTE_TYPE(enc, ptr);
604 return XML_TOK_INVALID;
616 REQUIRE_CHAR(enc, ptr, end);
617 open = BYTE_TYPE(enc, ptr);
618 if (open == BT_QUOT || open == BT_APOS)
627 return XML_TOK_INVALID;
631 /* in attribute value */
634 REQUIRE_CHAR(enc, ptr, end);
635 t = BYTE_TYPE(enc, ptr);
639 INVALID_CASES(ptr, nextTokPtr)
642 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
644 if (tok == XML_TOK_INVALID)
652 return XML_TOK_INVALID;
659 REQUIRE_CHAR(enc, ptr, end);
660 switch (BYTE_TYPE(enc, ptr)) {
671 return XML_TOK_INVALID;
673 /* ptr points to closing quote */
676 REQUIRE_CHAR(enc, ptr, end);
677 switch (BYTE_TYPE(enc, ptr)) {
678 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
679 case BT_S: case BT_CR: case BT_LF:
683 *nextTokPtr = ptr + MINBPC(enc);
684 return XML_TOK_START_TAG_WITH_ATTS;
688 REQUIRE_CHAR(enc, ptr, end);
689 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
691 return XML_TOK_INVALID;
693 *nextTokPtr = ptr + MINBPC(enc);
694 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
697 return XML_TOK_INVALID;
705 return XML_TOK_INVALID;
708 return XML_TOK_PARTIAL;
711 /* ptr points to character following "<" */
714 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
715 const char **nextTokPtr)
720 REQUIRE_CHAR(enc, ptr, end);
721 switch (BYTE_TYPE(enc, ptr)) {
722 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
725 REQUIRE_CHAR(enc, ptr, end);
726 switch (BYTE_TYPE(enc, ptr)) {
728 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
730 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
734 return XML_TOK_INVALID;
736 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
738 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
741 return XML_TOK_INVALID;
746 /* we have a start-tag */
747 while (HAS_CHAR(enc, ptr, end)) {
748 switch (BYTE_TYPE(enc, ptr)) {
749 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
754 return XML_TOK_INVALID;
758 REQUIRE_CHAR(enc, ptr, end);
759 switch (BYTE_TYPE(enc, ptr)) {
760 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
763 return XML_TOK_INVALID;
767 case BT_S: case BT_CR: case BT_LF:
770 while (HAS_CHAR(enc, ptr, end)) {
771 switch (BYTE_TYPE(enc, ptr)) {
772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
777 case BT_S: case BT_CR: case BT_LF:
782 return XML_TOK_INVALID;
784 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
786 return XML_TOK_PARTIAL;
790 *nextTokPtr = ptr + MINBPC(enc);
791 return XML_TOK_START_TAG_NO_ATTS;
795 REQUIRE_CHAR(enc, ptr, end);
796 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
798 return XML_TOK_INVALID;
800 *nextTokPtr = ptr + MINBPC(enc);
801 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
804 return XML_TOK_INVALID;
807 return XML_TOK_PARTIAL;
811 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
812 const char **nextTokPtr)
816 if (MINBPC(enc) > 1) {
817 size_t n = end - ptr;
818 if (n & (MINBPC(enc) - 1)) {
819 n &= ~(MINBPC(enc) - 1);
821 return XML_TOK_PARTIAL;
825 switch (BYTE_TYPE(enc, ptr)) {
827 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
829 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
832 if (! HAS_CHAR(enc, ptr, end))
833 return XML_TOK_TRAILING_CR;
834 if (BYTE_TYPE(enc, ptr) == BT_LF)
837 return XML_TOK_DATA_NEWLINE;
839 *nextTokPtr = ptr + MINBPC(enc);
840 return XML_TOK_DATA_NEWLINE;
843 if (! HAS_CHAR(enc, ptr, end))
844 return XML_TOK_TRAILING_RSQB;
845 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
848 if (! HAS_CHAR(enc, ptr, end))
849 return XML_TOK_TRAILING_RSQB;
850 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
855 return XML_TOK_INVALID;
856 INVALID_CASES(ptr, nextTokPtr)
861 while (HAS_CHAR(enc, ptr, end)) {
862 switch (BYTE_TYPE(enc, ptr)) {
863 #define LEAD_CASE(n) \
865 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
867 return XML_TOK_DATA_CHARS; \
871 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
874 if (HAS_CHARS(enc, ptr, end, 2)) {
875 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
879 if (HAS_CHARS(enc, ptr, end, 3)) {
880 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
884 *nextTokPtr = ptr + 2*MINBPC(enc);
885 return XML_TOK_INVALID;
897 return XML_TOK_DATA_CHARS;
904 return XML_TOK_DATA_CHARS;
907 /* ptr points to character following "%" */
910 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
911 const char **nextTokPtr)
913 REQUIRE_CHAR(enc, ptr, end);
914 switch (BYTE_TYPE(enc, ptr)) {
915 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
918 return XML_TOK_PERCENT;
921 return XML_TOK_INVALID;
923 while (HAS_CHAR(enc, ptr, end)) {
924 switch (BYTE_TYPE(enc, ptr)) {
925 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
927 *nextTokPtr = ptr + MINBPC(enc);
928 return XML_TOK_PARAM_ENTITY_REF;
931 return XML_TOK_INVALID;
934 return XML_TOK_PARTIAL;
938 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
939 const char **nextTokPtr)
941 REQUIRE_CHAR(enc, ptr, end);
942 switch (BYTE_TYPE(enc, ptr)) {
943 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
946 return XML_TOK_INVALID;
948 while (HAS_CHAR(enc, ptr, end)) {
949 switch (BYTE_TYPE(enc, ptr)) {
950 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
951 case BT_CR: case BT_LF: case BT_S:
952 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
954 return XML_TOK_POUND_NAME;
957 return XML_TOK_INVALID;
960 return -XML_TOK_POUND_NAME;
964 PREFIX(scanLit)(int open, const ENCODING *enc,
965 const char *ptr, const char *end,
966 const char **nextTokPtr)
968 while (HAS_CHAR(enc, ptr, end)) {
969 int t = BYTE_TYPE(enc, ptr);
971 INVALID_CASES(ptr, nextTokPtr)
977 if (! HAS_CHAR(enc, ptr, end))
978 return -XML_TOK_LITERAL;
980 switch (BYTE_TYPE(enc, ptr)) {
981 case BT_S: case BT_CR: case BT_LF:
982 case BT_GT: case BT_PERCNT: case BT_LSQB:
983 return XML_TOK_LITERAL;
985 return XML_TOK_INVALID;
992 return XML_TOK_PARTIAL;
996 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
997 const char **nextTokPtr)
1001 return XML_TOK_NONE;
1002 if (MINBPC(enc) > 1) {
1003 size_t n = end - ptr;
1004 if (n & (MINBPC(enc) - 1)) {
1005 n &= ~(MINBPC(enc) - 1);
1007 return XML_TOK_PARTIAL;
1011 switch (BYTE_TYPE(enc, ptr)) {
1013 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1015 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1019 REQUIRE_CHAR(enc, ptr, end);
1020 switch (BYTE_TYPE(enc, ptr)) {
1022 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1024 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1031 *nextTokPtr = ptr - MINBPC(enc);
1032 return XML_TOK_INSTANCE_START;
1035 return XML_TOK_INVALID;
1038 if (ptr + MINBPC(enc) == end) {
1040 /* indicate that this might be part of a CR/LF pair */
1041 return -XML_TOK_PROLOG_S;
1044 case BT_S: case BT_LF:
1047 if (! HAS_CHAR(enc, ptr, end))
1049 switch (BYTE_TYPE(enc, ptr)) {
1050 case BT_S: case BT_LF:
1053 /* don't split CR/LF pair */
1054 if (ptr + MINBPC(enc) != end)
1059 return XML_TOK_PROLOG_S;
1063 return XML_TOK_PROLOG_S;
1065 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1067 *nextTokPtr = ptr + MINBPC(enc);
1068 return XML_TOK_COMMA;
1070 *nextTokPtr = ptr + MINBPC(enc);
1071 return XML_TOK_OPEN_BRACKET;
1074 if (! HAS_CHAR(enc, ptr, end))
1075 return -XML_TOK_CLOSE_BRACKET;
1076 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1077 REQUIRE_CHARS(enc, ptr, end, 2);
1078 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1079 *nextTokPtr = ptr + 2*MINBPC(enc);
1080 return XML_TOK_COND_SECT_CLOSE;
1084 return XML_TOK_CLOSE_BRACKET;
1086 *nextTokPtr = ptr + MINBPC(enc);
1087 return XML_TOK_OPEN_PAREN;
1090 if (! HAS_CHAR(enc, ptr, end))
1091 return -XML_TOK_CLOSE_PAREN;
1092 switch (BYTE_TYPE(enc, ptr)) {
1094 *nextTokPtr = ptr + MINBPC(enc);
1095 return XML_TOK_CLOSE_PAREN_ASTERISK;
1097 *nextTokPtr = ptr + MINBPC(enc);
1098 return XML_TOK_CLOSE_PAREN_QUESTION;
1100 *nextTokPtr = ptr + MINBPC(enc);
1101 return XML_TOK_CLOSE_PAREN_PLUS;
1102 case BT_CR: case BT_LF: case BT_S:
1103 case BT_GT: case BT_COMMA: case BT_VERBAR:
1106 return XML_TOK_CLOSE_PAREN;
1109 return XML_TOK_INVALID;
1111 *nextTokPtr = ptr + MINBPC(enc);
1114 *nextTokPtr = ptr + MINBPC(enc);
1115 return XML_TOK_DECL_CLOSE;
1117 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1118 #define LEAD_CASE(n) \
1119 case BT_LEAD ## n: \
1120 if (end - ptr < n) \
1121 return XML_TOK_PARTIAL_CHAR; \
1122 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1124 tok = XML_TOK_NAME; \
1127 if (IS_NAME_CHAR(enc, ptr, n)) { \
1129 tok = XML_TOK_NMTOKEN; \
1132 *nextTokPtr = ptr; \
1133 return XML_TOK_INVALID;
1134 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1147 tok = XML_TOK_NMTOKEN;
1151 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1156 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1158 tok = XML_TOK_NMTOKEN;
1164 return XML_TOK_INVALID;
1166 while (HAS_CHAR(enc, ptr, end)) {
1167 switch (BYTE_TYPE(enc, ptr)) {
1168 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1169 case BT_GT: case BT_RPAR: case BT_COMMA:
1170 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1171 case BT_S: case BT_CR: case BT_LF:
1179 REQUIRE_CHAR(enc, ptr, end);
1180 tok = XML_TOK_PREFIXED_NAME;
1181 switch (BYTE_TYPE(enc, ptr)) {
1182 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1184 tok = XML_TOK_NMTOKEN;
1188 case XML_TOK_PREFIXED_NAME:
1189 tok = XML_TOK_NMTOKEN;
1195 if (tok == XML_TOK_NMTOKEN) {
1197 return XML_TOK_INVALID;
1199 *nextTokPtr = ptr + MINBPC(enc);
1200 return XML_TOK_NAME_PLUS;
1202 if (tok == XML_TOK_NMTOKEN) {
1204 return XML_TOK_INVALID;
1206 *nextTokPtr = ptr + MINBPC(enc);
1207 return XML_TOK_NAME_ASTERISK;
1209 if (tok == XML_TOK_NMTOKEN) {
1211 return XML_TOK_INVALID;
1213 *nextTokPtr = ptr + MINBPC(enc);
1214 return XML_TOK_NAME_QUESTION;
1217 return XML_TOK_INVALID;
1224 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1225 const char *end, const char **nextTokPtr)
1229 return XML_TOK_NONE;
1230 else if (! HAS_CHAR(enc, ptr, end)) {
1231 /* This line cannot be executed. The incoming data has already
1232 * been tokenized once, so incomplete characters like this have
1233 * already been eliminated from the input. Retaining the paranoia
1234 * check is still valuable, however.
1236 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1239 while (HAS_CHAR(enc, ptr, end)) {
1240 switch (BYTE_TYPE(enc, ptr)) {
1241 #define LEAD_CASE(n) \
1242 case BT_LEAD ## n: ptr += n; break;
1243 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1247 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1249 return XML_TOK_DATA_CHARS;
1251 /* this is for inside entity references */
1253 return XML_TOK_INVALID;
1256 *nextTokPtr = ptr + MINBPC(enc);
1257 return XML_TOK_DATA_NEWLINE;
1260 return XML_TOK_DATA_CHARS;
1264 if (! HAS_CHAR(enc, ptr, end))
1265 return XML_TOK_TRAILING_CR;
1266 if (BYTE_TYPE(enc, ptr) == BT_LF)
1269 return XML_TOK_DATA_NEWLINE;
1272 return XML_TOK_DATA_CHARS;
1275 *nextTokPtr = ptr + MINBPC(enc);
1276 return XML_TOK_ATTRIBUTE_VALUE_S;
1279 return XML_TOK_DATA_CHARS;
1286 return XML_TOK_DATA_CHARS;
1290 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1291 const char *end, const char **nextTokPtr)
1295 return XML_TOK_NONE;
1296 else if (! HAS_CHAR(enc, ptr, end)) {
1297 /* This line cannot be executed. The incoming data has already
1298 * been tokenized once, so incomplete characters like this have
1299 * already been eliminated from the input. Retaining the paranoia
1300 * check is still valuable, however.
1302 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1305 while (HAS_CHAR(enc, ptr, end)) {
1306 switch (BYTE_TYPE(enc, ptr)) {
1307 #define LEAD_CASE(n) \
1308 case BT_LEAD ## n: ptr += n; break;
1309 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1313 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1315 return XML_TOK_DATA_CHARS;
1318 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1320 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1323 return XML_TOK_DATA_CHARS;
1326 *nextTokPtr = ptr + MINBPC(enc);
1327 return XML_TOK_DATA_NEWLINE;
1330 return XML_TOK_DATA_CHARS;
1334 if (! HAS_CHAR(enc, ptr, end))
1335 return XML_TOK_TRAILING_CR;
1336 if (BYTE_TYPE(enc, ptr) == BT_LF)
1339 return XML_TOK_DATA_NEWLINE;
1342 return XML_TOK_DATA_CHARS;
1349 return XML_TOK_DATA_CHARS;
1355 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1356 const char *end, const char **nextTokPtr)
1359 if (MINBPC(enc) > 1) {
1360 size_t n = end - ptr;
1361 if (n & (MINBPC(enc) - 1)) {
1362 n &= ~(MINBPC(enc) - 1);
1366 while (HAS_CHAR(enc, ptr, end)) {
1367 switch (BYTE_TYPE(enc, ptr)) {
1368 INVALID_CASES(ptr, nextTokPtr)
1371 REQUIRE_CHAR(enc, ptr, end);
1372 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1374 REQUIRE_CHAR(enc, ptr, end);
1375 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1383 REQUIRE_CHAR(enc, ptr, end);
1384 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1386 REQUIRE_CHAR(enc, ptr, end);
1387 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1391 return XML_TOK_IGNORE_SECT;
1402 return XML_TOK_PARTIAL;
1405 #endif /* XML_DTD */
1408 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1409 const char **badPtr)
1413 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1414 switch (BYTE_TYPE(enc, ptr)) {
1438 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1445 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1449 switch (BYTE_TO_ASCII(enc, ptr)) {
1463 /* This must only be called for a well-formed start-tag or empty
1464 element tag. Returns the number of attributes. Pointers to the
1465 first attsMax attributes are stored in atts.
1469 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1470 int attsMax, ATTRIBUTE *atts)
1472 enum { other, inName, inValue } state = inName;
1474 int open = 0; /* defined when state == inValue;
1475 initialization just to shut up compilers */
1477 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1478 switch (BYTE_TYPE(enc, ptr)) {
1479 #define START_NAME \
1480 if (state == other) { \
1481 if (nAtts < attsMax) { \
1482 atts[nAtts].name = ptr; \
1483 atts[nAtts].normalized = 1; \
1487 #define LEAD_CASE(n) \
1488 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1489 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1498 if (state != inValue) {
1499 if (nAtts < attsMax)
1500 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1504 else if (open == BT_QUOT) {
1506 if (nAtts < attsMax)
1507 atts[nAtts].valueEnd = ptr;
1512 if (state != inValue) {
1513 if (nAtts < attsMax)
1514 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1518 else if (open == BT_APOS) {
1520 if (nAtts < attsMax)
1521 atts[nAtts].valueEnd = ptr;
1526 if (nAtts < attsMax)
1527 atts[nAtts].normalized = 0;
1530 if (state == inName)
1532 else if (state == inValue
1534 && atts[nAtts].normalized
1535 && (ptr == atts[nAtts].valuePtr
1536 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1537 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1538 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1539 atts[nAtts].normalized = 0;
1541 case BT_CR: case BT_LF:
1542 /* This case ensures that the first attribute name is counted
1543 Apart from that we could just change state on the quote. */
1544 if (state == inName)
1546 else if (state == inValue && nAtts < attsMax)
1547 atts[nAtts].normalized = 0;
1551 if (state != inValue)
1561 static int PTRFASTCALL
1562 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1566 ptr += 2*MINBPC(enc);
1567 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1568 for (ptr += MINBPC(enc);
1569 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1570 ptr += MINBPC(enc)) {
1571 int c = BYTE_TO_ASCII(enc, ptr);
1573 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1574 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1576 result |= (c - ASCII_0);
1578 case ASCII_A: case ASCII_B: case ASCII_C:
1579 case ASCII_D: case ASCII_E: case ASCII_F:
1581 result += 10 + (c - ASCII_A);
1583 case ASCII_a: case ASCII_b: case ASCII_c:
1584 case ASCII_d: case ASCII_e: case ASCII_f:
1586 result += 10 + (c - ASCII_a);
1589 if (result >= 0x110000)
1594 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1595 int c = BYTE_TO_ASCII(enc, ptr);
1597 result += (c - ASCII_0);
1598 if (result >= 0x110000)
1602 return checkCharRefNumber(result);
1606 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1609 switch ((end - ptr)/MINBPC(enc)) {
1611 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1612 switch (BYTE_TO_ASCII(enc, ptr)) {
1621 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1623 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1625 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1631 switch (BYTE_TO_ASCII(enc, ptr)) {
1634 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1636 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1638 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1645 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1647 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1649 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1660 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1661 const char *end1, const char *ptr2)
1663 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1664 if (end1 - ptr1 < MINBPC(enc)) {
1665 /* This line cannot be executed. The incoming data has already
1666 * been tokenized once, so incomplete characters like this have
1667 * already been eliminated from the input. Retaining the
1668 * paranoia check is still valuable, however.
1670 return 0; /* LCOV_EXCL_LINE */
1672 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1675 return ptr1 == end1;
1678 static int PTRFASTCALL
1679 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1681 const char *start = ptr;
1683 switch (BYTE_TYPE(enc, ptr)) {
1684 #define LEAD_CASE(n) \
1685 case BT_LEAD ## n: ptr += n; break;
1686 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1700 return (int)(ptr - start);
1705 static const char * PTRFASTCALL
1706 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1709 switch (BYTE_TYPE(enc, ptr)) {
1722 PREFIX(updatePosition)(const ENCODING *enc,
1727 while (HAS_CHAR(enc, ptr, end)) {
1728 switch (BYTE_TYPE(enc, ptr)) {
1729 #define LEAD_CASE(n) \
1730 case BT_LEAD ## n: \
1733 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1736 pos->columnNumber = (XML_Size)-1;
1743 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1745 pos->columnNumber = (XML_Size)-1;
1751 pos->columnNumber++;
1756 #undef MULTIBYTE_CASES
1757 #undef INVALID_CASES
1758 #undef CHECK_NAME_CASE
1759 #undef CHECK_NAME_CASES
1760 #undef CHECK_NMSTRT_CASE
1761 #undef CHECK_NMSTRT_CASES
1763 #endif /* XML_TOK_IMPL_C */