1 /* This file is included!
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
/* Helper macros shared by the tokenizer routines in this file.
   NOTE(review): the embedded line numbers skip values, so parts of the
   macro bodies (case labels, closing braces, #endif) are not visible
   here; the notes below describe only what the visible lines show. */
35 # ifndef IS_INVALID_CHAR
36 # define IS_INVALID_CHAR(enc, ptr, n) (0)
/* INVALID_LEAD_CASE(n, ...): for an n-byte sequence, either returns
   XML_TOK_PARTIAL_CHAR (condition not visible here) or, when
   IS_INVALID_CHAR holds, stores ptr in *nextTokPtr and returns
   XML_TOK_INVALID.  INVALID_CASES instantiates it for n = 2, 3, 4. */
39 # define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
42 return XML_TOK_PARTIAL_CHAR; \
43 if (IS_INVALID_CHAR(enc, ptr, n)) { \
44 *(nextTokPtr) = (ptr); \
45 return XML_TOK_INVALID; \
50 # define INVALID_CASES(ptr, nextTokPtr) \
51 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
52 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
53 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
57 *(nextTokPtr) = (ptr); \
58 return XML_TOK_INVALID;
/* CHECK_NAME_CASE / CHECK_NAME_CASES: return XML_TOK_INVALID from the
   enclosing function when the current character fails IS_NAME_CHAR
   (multi-byte) or IS_NAME_CHAR_MINBPC; a truncated sequence yields
   XML_TOK_PARTIAL_CHAR (condition not visible here). */
60 # define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
63 return XML_TOK_PARTIAL_CHAR; \
64 if (! IS_NAME_CHAR(enc, ptr, n)) { \
66 return XML_TOK_INVALID; \
71 # define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
73 if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
75 return XML_TOK_INVALID; \
85 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
86 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
87 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
/* CHECK_NMSTRT_CASE / CHECK_NMSTRT_CASES: same shape as the name-char
   checks, but test IS_NMSTRT_CHAR / IS_NMSTRT_CHAR_MINBPC (name-start
   characters); the visible body also advances ptr by MINBPC(enc). */
89 # define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
92 return XML_TOK_PARTIAL_CHAR; \
93 if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \
95 return XML_TOK_INVALID; \
100 # define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
102 if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
104 return XML_TOK_INVALID; \
109 ptr += MINBPC(enc); \
111 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
112 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
113 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
/* PREFIX(ident) expands to ident unchanged in this definition
   (any guarding #ifndef is not visible here). */
116 # define PREFIX(ident) ident
/* HAS_CHARS is true when at least count * MINBPC(enc) bytes lie between
   ptr and end; HAS_CHAR is the single-character form. */
119 # define HAS_CHARS(enc, ptr, end, count) (end - ptr >= count * MINBPC(enc))
121 # define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)
/* REQUIRE_CHARS makes the enclosing function return XML_TOK_PARTIAL when
   HAS_CHARS fails; REQUIRE_CHAR is the single-character form. */
123 # define REQUIRE_CHARS(enc, ptr, end, count) \
125 if (! HAS_CHARS(enc, ptr, end, count)) { \
126 return XML_TOK_PARTIAL; \
130 # define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
132 /* ptr points to character following "<!-" */
/* scanComment: tokenizes the remainder of a comment in [ptr, end).
   Visible behaviour:
   - if a character is available and it is not '-', XML_TOK_INVALID;
   - in the scan loop, when the next character matches '-', the
     character after it must be '>' (otherwise XML_TOK_INVALID); on
     success *nextTokPtr is set just past the '>' and XML_TOK_COMMENT
     is returned;
   - invalid byte sequences are rejected through INVALID_CASES;
   - running out of input yields XML_TOK_PARTIAL.
   NOTE(review): the embedded numbering has gaps (case labels, pointer
   advances and braces are missing); confirm against the full source. */
135 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
136 const char **nextTokPtr) {
137 if (HAS_CHAR(enc, ptr, end)) {
138 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
140 return XML_TOK_INVALID;
143 while (HAS_CHAR(enc, ptr, end)) {
144 switch (BYTE_TYPE(enc, ptr)) {
145 INVALID_CASES(ptr, nextTokPtr)
148 REQUIRE_CHAR(enc, ptr, end);
149 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
151 REQUIRE_CHAR(enc, ptr, end);
152 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
154 return XML_TOK_INVALID;
156 *nextTokPtr = ptr + MINBPC(enc);
157 return XML_TOK_COMMENT;
166 return XML_TOK_PARTIAL;
169 /* ptr points to character following "<!" */
/* scanDecl: tokenizes the start of a markup declaration.
   Visible behaviour:
   - requires one character, then dispatches on its byte type: one case
     hands off to scanComment (starting one character further), one
     case sets *nextTokPtr past the character and returns
     XML_TOK_COND_SECT_OPEN, and another path returns XML_TOK_INVALID;
   - the following loop scans the declaration keyword; one case needs
     two characters and inspects the byte type of the second one to
     reject input such as <!ENTITY% (XML_TOK_INVALID);
   - a recognised end of keyword returns XML_TOK_DECL_OPEN, any other
     character XML_TOK_INVALID, and exhausted input XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing; which byte
   types select which branch cannot be confirmed from here. */
172 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
173 const char **nextTokPtr) {
174 REQUIRE_CHAR(enc, ptr, end);
175 switch (BYTE_TYPE(enc, ptr)) {
177 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
179 *nextTokPtr = ptr + MINBPC(enc);
180 return XML_TOK_COND_SECT_OPEN;
187 return XML_TOK_INVALID;
189 while (HAS_CHAR(enc, ptr, end)) {
190 switch (BYTE_TYPE(enc, ptr)) {
192 REQUIRE_CHARS(enc, ptr, end, 2);
193 /* don't allow <!ENTITY% foo "whatever"> */
194 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
200 return XML_TOK_INVALID;
207 return XML_TOK_DECL_OPEN;
214 return XML_TOK_INVALID;
217 return XML_TOK_PARTIAL;
/* checkPiTarget: classifies a processing-instruction target spanning
   [ptr, end) and stores the token kind through tokPtr.
   Visible behaviour: *tokPtr defaults to XML_TOK_PI; a target whose
   length is not exactly three characters takes an early path (not
   visible here); otherwise three successive characters are examined
   via BYTE_TO_ASCII and *tokPtr may be upgraded to XML_TOK_XML_DECL.
   Callers in this file treat a zero result as "invalid target".
   NOTE(review): the letters compared and the return statements are
   missing from this listing -- presumably this recognises "xml";
   confirm against the full source. */
221 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
225 *tokPtr = XML_TOK_PI;
226 if (end - ptr != MINBPC(enc) * 3)
228 switch (BYTE_TO_ASCII(enc, ptr)) {
238 switch (BYTE_TO_ASCII(enc, ptr)) {
248 switch (BYTE_TO_ASCII(enc, ptr)) {
259 *tokPtr = XML_TOK_XML_DECL;
263 /* ptr points to character following "<?" */
/* scanPi: tokenizes a processing instruction.
   Visible behaviour:
   - remembers the start of the target in `target`, requires a
     name-start character (CHECK_NMSTRT_CASES), else XML_TOK_INVALID;
   - scans name characters; when the target ends, checkPiTarget is
     called on [target, ptr) and a zero result gives XML_TOK_INVALID;
   - on one path an inner loop scans the instruction data, rejecting
     invalid byte sequences, until a character followed by '>' is
     found, then sets *nextTokPtr just past the '>';
   - on another path the target is checked and the next character must
     be '>' directly;
   - exhausted input yields XML_TOK_PARTIAL.
   NOTE(review): the `tok` declaration, case labels and the final
   returns of `tok` are missing from this listing. */
266 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
267 const char **nextTokPtr) {
269 const char *target = ptr;
270 REQUIRE_CHAR(enc, ptr, end);
271 switch (BYTE_TYPE(enc, ptr)) {
272 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
275 return XML_TOK_INVALID;
277 while (HAS_CHAR(enc, ptr, end)) {
278 switch (BYTE_TYPE(enc, ptr)) {
279 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
283 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
285 return XML_TOK_INVALID;
288 while (HAS_CHAR(enc, ptr, end)) {
289 switch (BYTE_TYPE(enc, ptr)) {
290 INVALID_CASES(ptr, nextTokPtr)
293 REQUIRE_CHAR(enc, ptr, end);
294 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
295 *nextTokPtr = ptr + MINBPC(enc);
304 return XML_TOK_PARTIAL;
306 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
308 return XML_TOK_INVALID;
311 REQUIRE_CHAR(enc, ptr, end);
312 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
313 *nextTokPtr = ptr + MINBPC(enc);
319 return XML_TOK_INVALID;
322 return XML_TOK_PARTIAL;
/* scanCdataSection: checks that the next six characters spell
   "CDATA[" (the CDATA_LSQB table).
   Returns XML_TOK_PARTIAL (via REQUIRE_CHARS) when fewer than six
   characters are available, XML_TOK_INVALID on the first mismatch, and
   XML_TOK_CDATA_SECT_OPEN when all six match.
   NOTE(review): the declaration of `i` and the *nextTokPtr stores are
   missing from this listing. */
326 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
327 const char **nextTokPtr) {
328 static const char CDATA_LSQB[]
329 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
333 REQUIRE_CHARS(enc, ptr, end, 6);
/* Compare one character per iteration, advancing by MINBPC(enc). */
334 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
335 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
337 return XML_TOK_INVALID;
341 return XML_TOK_CDATA_SECT_OPEN;
/* cdataSectionTok: tokenizes content inside a CDATA section.
   Visible behaviour:
   - for encodings with MINBPC(enc) > 1, a byte count that is not a
     multiple of MINBPC(enc) is rounded down; XML_TOK_PARTIAL is
     returned on a path whose condition is not visible here;
   - the first character is dispatched on its byte type: a sequence of
     two ']' followed by '>' returns XML_TOK_CDATA_SECT_CLOSE with
     *nextTokPtr just past the '>'; line-ending cases return
     XML_TOK_DATA_NEWLINE (one of them checks for a following BT_LF);
     invalid byte sequences go through INVALID_CASES;
   - otherwise a loop consumes ordinary data and returns
     XML_TOK_DATA_CHARS, also stopping early when a multi-byte
     sequence is truncated or invalid (LEAD_CASE).
   NOTE(review): case labels, `break`s and several *nextTokPtr stores
   are missing from this listing. */
345 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
346 const char **nextTokPtr) {
349 if (MINBPC(enc) > 1) {
350 size_t n = end - ptr;
351 if (n & (MINBPC(enc) - 1)) {
352 n &= ~(MINBPC(enc) - 1);
354 return XML_TOK_PARTIAL;
358 switch (BYTE_TYPE(enc, ptr)) {
361 REQUIRE_CHAR(enc, ptr, end);
362 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
365 REQUIRE_CHAR(enc, ptr, end);
366 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
370 *nextTokPtr = ptr + MINBPC(enc);
371 return XML_TOK_CDATA_SECT_CLOSE;
374 REQUIRE_CHAR(enc, ptr, end);
375 if (BYTE_TYPE(enc, ptr) == BT_LF)
378 return XML_TOK_DATA_NEWLINE;
380 *nextTokPtr = ptr + MINBPC(enc);
381 return XML_TOK_DATA_NEWLINE;
382 INVALID_CASES(ptr, nextTokPtr)
387 while (HAS_CHAR(enc, ptr, end)) {
388 switch (BYTE_TYPE(enc, ptr)) {
389 # define LEAD_CASE(n) \
391 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
393 return XML_TOK_DATA_CHARS; \
408 return XML_TOK_DATA_CHARS;
415 return XML_TOK_DATA_CHARS;
418 /* ptr points to character following "</" */
/* scanEndTag: tokenizes an end tag.
   Visible behaviour:
   - requires a name-start character, else XML_TOK_INVALID;
   - scans name characters; on one path a for-loop then steps over
     further characters until one that ends the tag, setting
     *nextTokPtr just past it and returning XML_TOK_END_TAG, while any
     other character gives XML_TOK_INVALID;
   - a second path returns XML_TOK_END_TAG directly after the name;
   - exhausted input yields XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing; presumably
   the skipped characters are whitespace and the terminator is '>'. */
421 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
422 const char **nextTokPtr) {
423 REQUIRE_CHAR(enc, ptr, end);
424 switch (BYTE_TYPE(enc, ptr)) {
425 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
428 return XML_TOK_INVALID;
430 while (HAS_CHAR(enc, ptr, end)) {
431 switch (BYTE_TYPE(enc, ptr)) {
432 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
436 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
437 switch (BYTE_TYPE(enc, ptr)) {
443 *nextTokPtr = ptr + MINBPC(enc);
444 return XML_TOK_END_TAG;
447 return XML_TOK_INVALID;
450 return XML_TOK_PARTIAL;
453 /* no need to check qname syntax here,
454 since end-tag must match exactly */
459 *nextTokPtr = ptr + MINBPC(enc);
460 return XML_TOK_END_TAG;
463 return XML_TOK_INVALID;
466 return XML_TOK_PARTIAL;
469 /* ptr points to character following "&#X" */
/* scanHexCharRef: tokenizes the digits of a hexadecimal character
   reference.
   Visible behaviour: if a character is available, its byte type is
   checked and an unacceptable one gives XML_TOK_INVALID; the following
   characters are then stepped over one at a time until a terminator,
   where *nextTokPtr is set just past it and XML_TOK_CHAR_REF is
   returned; any other character gives XML_TOK_INVALID; exhausted input
   gives XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing; presumably
   hex digits are accepted and ';' terminates. */
472 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
473 const char **nextTokPtr) {
474 if (HAS_CHAR(enc, ptr, end)) {
475 switch (BYTE_TYPE(enc, ptr)) {
481 return XML_TOK_INVALID;
483 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
484 switch (BYTE_TYPE(enc, ptr)) {
489 *nextTokPtr = ptr + MINBPC(enc);
490 return XML_TOK_CHAR_REF;
493 return XML_TOK_INVALID;
497 return XML_TOK_PARTIAL;
500 /* ptr points to character following "&#" */
/* scanCharRef: tokenizes a numeric character reference.
   Visible behaviour: if the first character is 'x', the work is handed
   to scanHexCharRef starting one character further; otherwise the
   first character's byte type is checked (XML_TOK_INVALID if
   unacceptable) and the following characters are stepped over until a
   terminator, where *nextTokPtr is set just past it and
   XML_TOK_CHAR_REF is returned; any other character gives
   XML_TOK_INVALID; exhausted input gives XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing; presumably
   decimal digits are accepted and ';' terminates. */
503 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
504 const char **nextTokPtr) {
505 if (HAS_CHAR(enc, ptr, end)) {
506 if (CHAR_MATCHES(enc, ptr, ASCII_x))
507 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
508 switch (BYTE_TYPE(enc, ptr)) {
513 return XML_TOK_INVALID;
515 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
516 switch (BYTE_TYPE(enc, ptr)) {
520 *nextTokPtr = ptr + MINBPC(enc);
521 return XML_TOK_CHAR_REF;
524 return XML_TOK_INVALID;
528 return XML_TOK_PARTIAL;
531 /* ptr points to character following "&" */
/* scanRef: tokenizes an entity or character reference.
   Visible behaviour:
   - requires one character; a name-start character begins an entity
     name, one case hands off to scanCharRef (starting one character
     further), anything else gives XML_TOK_INVALID;
   - name characters are then scanned; at the terminator *nextTokPtr is
     set just past it and XML_TOK_ENTITY_REF is returned; any other
     character gives XML_TOK_INVALID;
   - exhausted input gives XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing; presumably
   '#' selects scanCharRef and ';' terminates the name. */
534 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
535 const char **nextTokPtr) {
536 REQUIRE_CHAR(enc, ptr, end);
537 switch (BYTE_TYPE(enc, ptr)) {
538 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
540 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
543 return XML_TOK_INVALID;
545 while (HAS_CHAR(enc, ptr, end)) {
546 switch (BYTE_TYPE(enc, ptr)) {
547 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
549 *nextTokPtr = ptr + MINBPC(enc);
550 return XML_TOK_ENTITY_REF;
553 return XML_TOK_INVALID;
556 return XML_TOK_PARTIAL;
559 /* ptr points to character following first character of attribute name */
/* scanAtts: tokenizes the attribute list of a start tag.
   Visible behaviour, in source order:
   - scans the characters of an attribute name (CHECK_NAME_CASES);
     some positions require a name-start character
     (CHECK_NMSTRT_CASES) and reject anything else;
   - reads byte types into `t` and `open`; `open` is compared against
     BT_QUOT and BT_APOS, i.e. it records the quote that opens the
     value;
   - inside the value, invalid byte sequences are rejected and
     references are checked by calling scanRef with &ptr as its
     next-token output, so ptr is advanced past the reference; an
     XML_TOK_INVALID result is handled specially;
   - after the closing quote the next character either starts another
     attribute, ends the tag (XML_TOK_START_TAG_WITH_ATTS), or begins
     an empty-element close that must be followed by '>'
     (XML_TOK_EMPTY_ELEMENT_WITH_ATTS); in both cases *nextTokPtr is
     set just past the final character;
   - malformed input gives XML_TOK_INVALID, exhausted input
     XML_TOK_PARTIAL.
   NOTE(review): many lines (declarations, case labels, loops, braces)
   are missing from this listing; confirm details against the full
   source. */
562 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563 const char **nextTokPtr) {
567 while (HAS_CHAR(enc, ptr, end)) {
568 switch (BYTE_TYPE(enc, ptr)) {
569 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
574 return XML_TOK_INVALID;
578 REQUIRE_CHAR(enc, ptr, end);
579 switch (BYTE_TYPE(enc, ptr)) {
580 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
583 return XML_TOK_INVALID;
594 REQUIRE_CHAR(enc, ptr, end);
595 t = BYTE_TYPE(enc, ptr);
605 return XML_TOK_INVALID;
616 REQUIRE_CHAR(enc, ptr, end);
617 open = BYTE_TYPE(enc, ptr);
618 if (open == BT_QUOT || open == BT_APOS)
627 return XML_TOK_INVALID;
631 /* in attribute value */
634 REQUIRE_CHAR(enc, ptr, end);
635 t = BYTE_TYPE(enc, ptr);
639 INVALID_CASES(ptr, nextTokPtr)
641 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
643 if (tok == XML_TOK_INVALID)
651 return XML_TOK_INVALID;
658 REQUIRE_CHAR(enc, ptr, end);
659 switch (BYTE_TYPE(enc, ptr)) {
670 return XML_TOK_INVALID;
672 /* ptr points to closing quote */
675 REQUIRE_CHAR(enc, ptr, end);
676 switch (BYTE_TYPE(enc, ptr)) {
677 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
684 *nextTokPtr = ptr + MINBPC(enc);
685 return XML_TOK_START_TAG_WITH_ATTS;
689 REQUIRE_CHAR(enc, ptr, end);
690 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
692 return XML_TOK_INVALID;
694 *nextTokPtr = ptr + MINBPC(enc);
695 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
698 return XML_TOK_INVALID;
706 return XML_TOK_INVALID;
709 return XML_TOK_PARTIAL;
712 /* ptr points to character following "<" */
/* scanLt: tokenizes markup that starts with '<' in content.
   Visible behaviour:
   - requires one character and dispatches on its byte type: a
     name-start character begins a start tag; one case requires another
     character and hands off to scanComment or scanCdataSection (else
     XML_TOK_INVALID); other cases hand off to scanPi and scanEndTag;
     anything else gives XML_TOK_INVALID;
   - for a start tag, name characters are scanned; where attributes
     follow, the work is handed to scanAtts;
   - a tag that ends without attributes returns
     XML_TOK_START_TAG_NO_ATTS, and an empty-element close that is
     followed by '>' returns XML_TOK_EMPTY_ELEMENT_NO_ATTS, each with
     *nextTokPtr just past the final character;
   - malformed input gives XML_TOK_INVALID, exhausted input
     XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing, so which
   byte type selects which helper cannot be confirmed from here. */
715 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
716 const char **nextTokPtr) {
720 REQUIRE_CHAR(enc, ptr, end);
721 switch (BYTE_TYPE(enc, ptr)) {
722 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
725 REQUIRE_CHAR(enc, ptr, end);
726 switch (BYTE_TYPE(enc, ptr)) {
728 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
730 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
733 return XML_TOK_INVALID;
735 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
740 return XML_TOK_INVALID;
745 /* we have a start-tag */
746 while (HAS_CHAR(enc, ptr, end)) {
747 switch (BYTE_TYPE(enc, ptr)) {
748 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
753 return XML_TOK_INVALID;
757 REQUIRE_CHAR(enc, ptr, end);
758 switch (BYTE_TYPE(enc, ptr)) {
759 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
762 return XML_TOK_INVALID;
770 while (HAS_CHAR(enc, ptr, end)) {
771 switch (BYTE_TYPE(enc, ptr)) {
772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
784 return XML_TOK_INVALID;
786 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
788 return XML_TOK_PARTIAL;
792 *nextTokPtr = ptr + MINBPC(enc);
793 return XML_TOK_START_TAG_NO_ATTS;
797 REQUIRE_CHAR(enc, ptr, end);
798 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
800 return XML_TOK_INVALID;
802 *nextTokPtr = ptr + MINBPC(enc);
803 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
806 return XML_TOK_INVALID;
809 return XML_TOK_PARTIAL;
/* contentTok: tokenizes element content.
   Visible behaviour:
   - for encodings with MINBPC(enc) > 1, a byte count that is not a
     multiple of MINBPC(enc) is rounded down; XML_TOK_PARTIAL is
     returned on a path whose condition is not visible here;
   - the first character is dispatched on its byte type: markup goes to
     scanLt and references to scanRef (both starting one character
     further);
   - a carriage-return case returns XML_TOK_TRAILING_CR when no
     character follows, otherwise XML_TOK_DATA_NEWLINE (checking for a
     following BT_LF);
   - a ']' case returns XML_TOK_TRAILING_RSQB when input ends before it
     can be decided whether "]]>" follows, and XML_TOK_INVALID when it
     does;
   - otherwise a loop consumes ordinary data and returns
     XML_TOK_DATA_CHARS; it stops at truncated or invalid multi-byte
     sequences (LEAD_CASE) and looks ahead two and three characters to
     catch "]]>" inside the data (XML_TOK_INVALID with *nextTokPtr two
     characters past ptr).
   NOTE(review): case labels and several *nextTokPtr stores are missing
   from this listing. */
813 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
814 const char **nextTokPtr) {
817 if (MINBPC(enc) > 1) {
818 size_t n = end - ptr;
819 if (n & (MINBPC(enc) - 1)) {
820 n &= ~(MINBPC(enc) - 1);
822 return XML_TOK_PARTIAL;
826 switch (BYTE_TYPE(enc, ptr)) {
828 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
830 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
833 if (! HAS_CHAR(enc, ptr, end))
834 return XML_TOK_TRAILING_CR;
835 if (BYTE_TYPE(enc, ptr) == BT_LF)
838 return XML_TOK_DATA_NEWLINE;
840 *nextTokPtr = ptr + MINBPC(enc);
841 return XML_TOK_DATA_NEWLINE;
844 if (! HAS_CHAR(enc, ptr, end))
845 return XML_TOK_TRAILING_RSQB;
846 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
849 if (! HAS_CHAR(enc, ptr, end))
850 return XML_TOK_TRAILING_RSQB;
851 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
856 return XML_TOK_INVALID;
857 INVALID_CASES(ptr, nextTokPtr)
862 while (HAS_CHAR(enc, ptr, end)) {
863 switch (BYTE_TYPE(enc, ptr)) {
864 # define LEAD_CASE(n) \
866 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
868 return XML_TOK_DATA_CHARS; \
877 if (HAS_CHARS(enc, ptr, end, 2)) {
878 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
882 if (HAS_CHARS(enc, ptr, end, 3)) {
883 if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
887 *nextTokPtr = ptr + 2 * MINBPC(enc);
888 return XML_TOK_INVALID;
900 return XML_TOK_DATA_CHARS;
907 return XML_TOK_DATA_CHARS;
910 /* ptr points to character following "%" */
/* scanPercent: tokenizes what follows a '%' in the prolog.
   Visible behaviour:
   - requires one character; a name-start character begins a
     parameter-entity name, one set of cases returns XML_TOK_PERCENT
     (a bare percent sign), anything else gives XML_TOK_INVALID;
   - name characters are then scanned; at the terminator *nextTokPtr is
     set just past it and XML_TOK_PARAM_ENTITY_REF is returned; any
     other character gives XML_TOK_INVALID;
   - exhausted input gives XML_TOK_PARTIAL.
   NOTE(review): case labels are missing from this listing; presumably
   whitespace selects XML_TOK_PERCENT and ';' terminates the name. */
913 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
914 const char **nextTokPtr) {
915 REQUIRE_CHAR(enc, ptr, end);
916 switch (BYTE_TYPE(enc, ptr)) {
917 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
923 return XML_TOK_PERCENT;
926 return XML_TOK_INVALID;
928 while (HAS_CHAR(enc, ptr, end)) {
929 switch (BYTE_TYPE(enc, ptr)) {
930 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
932 *nextTokPtr = ptr + MINBPC(enc);
933 return XML_TOK_PARAM_ENTITY_REF;
936 return XML_TOK_INVALID;
939 return XML_TOK_PARTIAL;
/* scanPoundName: tokenizes a name introduced by '#' in the prolog.
   Visible behaviour: requires a name-start character (else
   XML_TOK_INVALID), scans name characters, returns XML_TOK_POUND_NAME
   at a delimiter and XML_TOK_INVALID at any other character.  When the
   input ends while still inside the name, the negated token
   -XML_TOK_POUND_NAME is returned rather than XML_TOK_PARTIAL.
   NOTE(review): the delimiter case labels and *nextTokPtr stores are
   missing from this listing. */
943 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
944 const char **nextTokPtr) {
945 REQUIRE_CHAR(enc, ptr, end);
946 switch (BYTE_TYPE(enc, ptr)) {
947 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
950 return XML_TOK_INVALID;
952 while (HAS_CHAR(enc, ptr, end)) {
953 switch (BYTE_TYPE(enc, ptr)) {
954 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
963 return XML_TOK_POUND_NAME;
966 return XML_TOK_INVALID;
969 return -XML_TOK_POUND_NAME;
/* scanLit: tokenizes a quoted literal; `open` is the byte type of the
   opening quote (callers in this file pass BT_QUOT or BT_APOS).
   Visible behaviour: characters are scanned with invalid byte
   sequences rejected; on one path (presumably after the matching
   closing quote -- the comparison is not visible here) the function
   returns -XML_TOK_LITERAL when no further character is available,
   otherwise inspects the next character's byte type and returns
   XML_TOK_LITERAL or XML_TOK_INVALID; input that ends inside the
   literal gives XML_TOK_PARTIAL. */
973 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
974 const char **nextTokPtr) {
975 while (HAS_CHAR(enc, ptr, end)) {
976 int t = BYTE_TYPE(enc, ptr);
978 INVALID_CASES(ptr, nextTokPtr)
984 if (! HAS_CHAR(enc, ptr, end))
985 return -XML_TOK_LITERAL;
987 switch (BYTE_TYPE(enc, ptr)) {
994 return XML_TOK_LITERAL;
996 return XML_TOK_INVALID;
1003 return XML_TOK_PARTIAL;
/* prologTok: tokenizes the prolog / DTD.
   Visible behaviour, in source order:
   - returns XML_TOK_NONE on an early path (condition not visible
     here); for MINBPC(enc) > 1 a byte count that is not a multiple of
     MINBPC(enc) is rounded down, with XML_TOK_PARTIAL on a path whose
     condition is not visible;
   - dispatches on the first character's byte type: quotes go to
     scanLit with BT_QUOT / BT_APOS; a markup-opening case requires one
     more character and hands off to scanDecl or scanPi, or returns
     XML_TOK_INSTANCE_START with *nextTokPtr one character before ptr,
     or XML_TOK_INVALID;
   - whitespace returns XML_TOK_PROLOG_S, negated when the character is
     the last one available because it might be half of a CR/LF pair;
   - '%' goes to scanPercent and '#' names to scanPoundName;
   - single-character tokens (XML_TOK_COMMA, XML_TOK_OPEN_BRACKET,
     XML_TOK_OPEN_PAREN, XML_TOK_DECL_CLOSE, ...) set *nextTokPtr one
     character past ptr;
   - a close-bracket case returns -XML_TOK_CLOSE_BRACKET at end of
     input, XML_TOK_COND_SECT_CLOSE for "]]>" and XML_TOK_CLOSE_BRACKET
     otherwise; a close-paren case returns -XML_TOK_CLOSE_PAREN at end
     of input or one of the XML_TOK_CLOSE_PAREN_* tokens depending on
     the next character;
   - names and name tokens: `tok` starts as XML_TOK_NAME or
     XML_TOK_NMTOKEN according to the first character, may become
     XML_TOK_PREFIXED_NAME, and a trailing occurrence indicator returns
     XML_TOK_NAME_PLUS / XML_TOK_NAME_ASTERISK / XML_TOK_NAME_QUESTION
     unless tok is XML_TOK_NMTOKEN (then XML_TOK_INVALID).
   NOTE(review): most case labels and several returns are missing from
   this listing; confirm details against the full source. */
1007 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1008 const char **nextTokPtr) {
1011 return XML_TOK_NONE;
1012 if (MINBPC(enc) > 1) {
1013 size_t n = end - ptr;
1014 if (n & (MINBPC(enc) - 1)) {
1015 n &= ~(MINBPC(enc) - 1);
1017 return XML_TOK_PARTIAL;
1021 switch (BYTE_TYPE(enc, ptr)) {
1023 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1025 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1028 REQUIRE_CHAR(enc, ptr, end);
1029 switch (BYTE_TYPE(enc, ptr)) {
1031 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1033 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040 *nextTokPtr = ptr - MINBPC(enc);
1041 return XML_TOK_INSTANCE_START;
1044 return XML_TOK_INVALID;
1047 if (ptr + MINBPC(enc) == end) {
1049 /* indicate that this might be part of a CR/LF pair */
1050 return -XML_TOK_PROLOG_S;
1057 if (! HAS_CHAR(enc, ptr, end))
1059 switch (BYTE_TYPE(enc, ptr)) {
1064 /* don't split CR/LF pair */
1065 if (ptr + MINBPC(enc) != end)
1070 return XML_TOK_PROLOG_S;
1074 return XML_TOK_PROLOG_S;
1076 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1078 *nextTokPtr = ptr + MINBPC(enc);
1079 return XML_TOK_COMMA;
1081 *nextTokPtr = ptr + MINBPC(enc);
1082 return XML_TOK_OPEN_BRACKET;
1085 if (! HAS_CHAR(enc, ptr, end))
1086 return -XML_TOK_CLOSE_BRACKET;
1087 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1088 REQUIRE_CHARS(enc, ptr, end, 2);
1089 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1090 *nextTokPtr = ptr + 2 * MINBPC(enc);
1091 return XML_TOK_COND_SECT_CLOSE;
1095 return XML_TOK_CLOSE_BRACKET;
1097 *nextTokPtr = ptr + MINBPC(enc);
1098 return XML_TOK_OPEN_PAREN;
1101 if (! HAS_CHAR(enc, ptr, end))
1102 return -XML_TOK_CLOSE_PAREN;
1103 switch (BYTE_TYPE(enc, ptr)) {
1105 *nextTokPtr = ptr + MINBPC(enc);
1106 return XML_TOK_CLOSE_PAREN_ASTERISK;
1108 *nextTokPtr = ptr + MINBPC(enc);
1109 return XML_TOK_CLOSE_PAREN_QUESTION;
1111 *nextTokPtr = ptr + MINBPC(enc);
1112 return XML_TOK_CLOSE_PAREN_PLUS;
1121 return XML_TOK_CLOSE_PAREN;
1124 return XML_TOK_INVALID;
1126 *nextTokPtr = ptr + MINBPC(enc);
1129 *nextTokPtr = ptr + MINBPC(enc);
1130 return XML_TOK_DECL_CLOSE;
1132 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1133 # define LEAD_CASE(n) \
1135 if (end - ptr < n) \
1136 return XML_TOK_PARTIAL_CHAR; \
1137 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1139 tok = XML_TOK_NAME; \
1142 if (IS_NAME_CHAR(enc, ptr, n)) { \
1144 tok = XML_TOK_NMTOKEN; \
1147 *nextTokPtr = ptr; \
1148 return XML_TOK_INVALID;
1164 tok = XML_TOK_NMTOKEN;
1168 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1173 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1175 tok = XML_TOK_NMTOKEN;
1181 return XML_TOK_INVALID;
1183 while (HAS_CHAR(enc, ptr, end)) {
1184 switch (BYTE_TYPE(enc, ptr)) {
1185 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1202 REQUIRE_CHAR(enc, ptr, end);
1203 tok = XML_TOK_PREFIXED_NAME;
1204 switch (BYTE_TYPE(enc, ptr)) {
1205 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1207 tok = XML_TOK_NMTOKEN;
1211 case XML_TOK_PREFIXED_NAME:
1212 tok = XML_TOK_NMTOKEN;
1218 if (tok == XML_TOK_NMTOKEN) {
1220 return XML_TOK_INVALID;
1222 *nextTokPtr = ptr + MINBPC(enc);
1223 return XML_TOK_NAME_PLUS;
1225 if (tok == XML_TOK_NMTOKEN) {
1227 return XML_TOK_INVALID;
1229 *nextTokPtr = ptr + MINBPC(enc);
1230 return XML_TOK_NAME_ASTERISK;
1232 if (tok == XML_TOK_NMTOKEN) {
1234 return XML_TOK_INVALID;
1236 *nextTokPtr = ptr + MINBPC(enc);
1237 return XML_TOK_NAME_QUESTION;
1240 return XML_TOK_INVALID;
/* attributeValueTok: tokenizes the text of an attribute value.
   Visible behaviour:
   - returns XML_TOK_NONE on an early path (condition not visible
     here); a defensive check for an incomplete character returns
     XML_TOK_PARTIAL;
   - the scan loop returns XML_TOK_DATA_CHARS for a run of ordinary
     characters whenever a special character is reached after some
     data;
   - a reference at the start goes to scanRef (one character further);
   - one case returns XML_TOK_INVALID ("this is for inside entity
     references");
   - line endings return XML_TOK_DATA_NEWLINE, with XML_TOK_TRAILING_CR
     when a carriage return is the last available character and a
     check for a following BT_LF otherwise;
   - one case returns XML_TOK_ATTRIBUTE_VALUE_S with *nextTokPtr one
     character past ptr.
   NOTE(review): case labels and the `start` bookkeeping are missing
   from this listing; confirm details against the full source. */
1247 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1248 const char **nextTokPtr) {
1251 return XML_TOK_NONE;
1252 else if (! HAS_CHAR(enc, ptr, end)) {
1253 /* This line cannot be executed. The incoming data has already
1254 * been tokenized once, so incomplete characters like this have
1255 * already been eliminated from the input. Retaining the paranoia
1256 * check is still valuable, however.
1258 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1261 while (HAS_CHAR(enc, ptr, end)) {
1262 switch (BYTE_TYPE(enc, ptr)) {
1263 # define LEAD_CASE(n) \
1273 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1275 return XML_TOK_DATA_CHARS;
1277 /* this is for inside entity references */
1279 return XML_TOK_INVALID;
1282 *nextTokPtr = ptr + MINBPC(enc);
1283 return XML_TOK_DATA_NEWLINE;
1286 return XML_TOK_DATA_CHARS;
1290 if (! HAS_CHAR(enc, ptr, end))
1291 return XML_TOK_TRAILING_CR;
1292 if (BYTE_TYPE(enc, ptr) == BT_LF)
1295 return XML_TOK_DATA_NEWLINE;
1298 return XML_TOK_DATA_CHARS;
1301 *nextTokPtr = ptr + MINBPC(enc);
1302 return XML_TOK_ATTRIBUTE_VALUE_S;
1305 return XML_TOK_DATA_CHARS;
1312 return XML_TOK_DATA_CHARS;
/* entityValueTok: tokenizes the text of an entity value.
   Visible behaviour:
   - returns XML_TOK_NONE on an early path (condition not visible
     here); a defensive check for an incomplete character returns
     XML_TOK_PARTIAL;
   - the scan loop returns XML_TOK_DATA_CHARS for a run of ordinary
     characters whenever a special character is reached after some
     data;
   - a reference at the start goes to scanRef (one character further);
   - a parameter-entity reference at the start goes to scanPercent, and
     a bare XML_TOK_PERCENT result is converted to XML_TOK_INVALID;
   - line endings return XML_TOK_DATA_NEWLINE, with XML_TOK_TRAILING_CR
     when a carriage return is the last available character and a
     check for a following BT_LF otherwise.
   NOTE(review): case labels and the `start` bookkeeping are missing
   from this listing; confirm details against the full source. */
1316 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1317 const char **nextTokPtr) {
1320 return XML_TOK_NONE;
1321 else if (! HAS_CHAR(enc, ptr, end)) {
1322 /* This line cannot be executed. The incoming data has already
1323 * been tokenized once, so incomplete characters like this have
1324 * already been eliminated from the input. Retaining the paranoia
1325 * check is still valuable, however.
1327 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1330 while (HAS_CHAR(enc, ptr, end)) {
1331 switch (BYTE_TYPE(enc, ptr)) {
1332 # define LEAD_CASE(n) \
1342 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1344 return XML_TOK_DATA_CHARS;
1347 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1348 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1351 return XML_TOK_DATA_CHARS;
1354 *nextTokPtr = ptr + MINBPC(enc);
1355 return XML_TOK_DATA_NEWLINE;
1358 return XML_TOK_DATA_CHARS;
1362 if (! HAS_CHAR(enc, ptr, end))
1363 return XML_TOK_TRAILING_CR;
1364 if (BYTE_TYPE(enc, ptr) == BT_LF)
1367 return XML_TOK_DATA_NEWLINE;
1370 return XML_TOK_DATA_CHARS;
1377 return XML_TOK_DATA_CHARS;
/* ignoreSectionTok: scans the body of an ignored conditional section.
   Visible behaviour:
   - for MINBPC(enc) > 1, a byte count that is not a multiple of
     MINBPC(enc) is rounded down;
   - the loop rejects invalid byte sequences and looks for two
     three-character patterns: a character followed by '!' and '['
     (an opening "<![" -- presumably increasing a nesting level) and a
     character followed by ']' and '>' (a closing "]]>");
   - XML_TOK_IGNORE_SECT is returned from the closing-pattern branch
     (presumably once the outermost section is closed);
   - exhausted input gives XML_TOK_PARTIAL.
   The trailing #endif closes an XML_DTD conditional whose opening line
   is not visible here.
   NOTE(review): the nesting counter and case labels are missing from
   this listing. */
1383 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1384 const char **nextTokPtr) {
1386 if (MINBPC(enc) > 1) {
1387 size_t n = end - ptr;
1388 if (n & (MINBPC(enc) - 1)) {
1389 n &= ~(MINBPC(enc) - 1);
1393 while (HAS_CHAR(enc, ptr, end)) {
1394 switch (BYTE_TYPE(enc, ptr)) {
1395 INVALID_CASES(ptr, nextTokPtr)
1398 REQUIRE_CHAR(enc, ptr, end);
1399 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1401 REQUIRE_CHAR(enc, ptr, end);
1402 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1410 REQUIRE_CHAR(enc, ptr, end);
1411 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1413 REQUIRE_CHAR(enc, ptr, end);
1414 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1418 return XML_TOK_IGNORE_SECT;
1429 return XML_TOK_PARTIAL;
1432 # endif /* XML_DTD */
/* isPublicId: checks the characters of a public identifier in
   [ptr, end), one character per iteration.
   Visible behaviour: each character is classified by byte type; one
   branch tests for ASCII_TAB, another tests that the character has no
   bits set above 0x7f, and a further switch examines specific
   characters.
   NOTE(review): the case labels, the stores through badPtr and the
   return statements are missing from this listing -- presumably
   badPtr receives the first offending character and the result is a
   boolean; confirm against the full source. */
1435 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1436 const char **badPtr) {
1439 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1440 switch (BYTE_TYPE(enc, ptr)) {
1464 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1471 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1475 switch (BYTE_TO_ASCII(enc, ptr)) {
1489 /* This must only be called for a well-formed start-tag or empty
1490 element tag. Returns the number of attributes. Pointers to the
1491 first attsMax attributes are stored in atts.
1492 */
/* getAtts: walks a start tag one character at a time and records
   attribute positions in atts[] (at most attsMax entries are written).
   Visible behaviour:
   - a three-state machine (other / inName / inValue), starting in
     inName; scanning starts one character past ptr and the for-loop
     has no end condition, so it relies on the tag being well-formed;
   - START_NAME: when a name starts in state `other`, atts[nAtts].name
     is set and .normalized is initialised to 1;
   - a quote outside a value stores valuePtr (one character past the
     quote); the matching quote inside a value stores valueEnd; `open`
     remembers which quote type opened the value;
   - several cases clear .normalized: one unconditionally, one for a
     space that is at the start of the value, is not ASCII_SPACE, or is
     followed by another space or by the closing quote, and one for
     other characters inside a value;
   - every atts[] write is guarded by nAtts < attsMax.
   NOTE(review): the remaining parameters, the nAtts declaration, case
   labels, state transitions and the return are missing from this
   listing. */
1495 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1497 enum { other, inName, inValue } state = inName;
1499 int open = 0; /* defined when state == inValue;
1500 initialization just to shut up compilers */
1502 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1503 switch (BYTE_TYPE(enc, ptr)) {
1504 # define START_NAME \
1505 if (state == other) { \
1506 if (nAtts < attsMax) { \
1507 atts[nAtts].name = ptr; \
1508 atts[nAtts].normalized = 1; \
1512 # define LEAD_CASE(n) \
1514 START_NAME ptr += (n - MINBPC(enc)); \
1527 if (state != inValue) {
1528 if (nAtts < attsMax)
1529 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1532 } else if (open == BT_QUOT) {
1534 if (nAtts < attsMax)
1535 atts[nAtts].valueEnd = ptr;
1540 if (state != inValue) {
1541 if (nAtts < attsMax)
1542 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1545 } else if (open == BT_APOS) {
1547 if (nAtts < attsMax)
1548 atts[nAtts].valueEnd = ptr;
1553 if (nAtts < attsMax)
1554 atts[nAtts].normalized = 0;
1557 if (state == inName)
1559 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1560 && (ptr == atts[nAtts].valuePtr
1561 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1562 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1563 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1564 atts[nAtts].normalized = 0;
1568 /* This case ensures that the first attribute name is counted
1569 Apart from that we could just change state on the quote. */
1570 if (state == inName)
1572 else if (state == inValue && nAtts < attsMax)
1573 atts[nAtts].normalized = 0;
1577 if (state != inValue)
/* charRefNumber: computes the code point named by a numeric character
   reference.
   Visible behaviour:
   - skips the first two characters, then checks for 'x';
   - hexadecimal form: reads characters up to ';', folding digits in
     with `result |= (c - ASCII_0)` and letters with
     `result += 10 + (c - ASCII_A)` / `(c - ASCII_a)`;
   - decimal form: reads characters up to ';', adding `c - ASCII_0`;
   - both forms test for result >= 0x110000 (the action taken is not
     visible here);
   - the final value is passed through checkCharRefNumber and returned.
   Neither loop has a bounds check, so the input must already be known
   to contain the terminating ';'.
   NOTE(review): the `result` declaration, the shifts/multiplications
   and the case labels are missing from this listing. */
1587 static int PTRFASTCALL
1588 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1592 ptr += 2 * MINBPC(enc);
1593 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1594 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1595 ptr += MINBPC(enc)) {
1596 int c = BYTE_TO_ASCII(enc, ptr);
1609 result |= (c - ASCII_0);
1618 result += 10 + (c - ASCII_A);
1627 result += 10 + (c - ASCII_a);
1630 if (result >= 0x110000)
1634 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1635 int c = BYTE_TO_ASCII(enc, ptr);
1637 result += (c - ASCII_0);
1638 if (result >= 0x110000)
1642 return checkCharRefNumber(result);
/* predefinedEntityName: recognises predefined entity names in
   [ptr, end).
   Visible behaviour: switches on the name length in characters,
   (end - ptr) / MINBPC(enc), and then matches letter by letter:
   - a name whose second character is 't', decided by its first
     character;
   - the sequence 'a', 'm', 'p';
   - names selected by their first character and continuing
     'u', 'o', 't' or 'p', 'o', 's'.
   NOTE(review): the case labels, pointer advances and return values
   are missing from this listing -- presumably these are lt/gt, amp,
   quot and apos, each returning its character; confirm against the
   full source. */
1646 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1649 switch ((end - ptr) / MINBPC(enc)) {
1651 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1652 switch (BYTE_TO_ASCII(enc, ptr)) {
1661 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1663 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1665 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1671 switch (BYTE_TO_ASCII(enc, ptr)) {
1674 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1676 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1678 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1685 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1687 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1689 if (CHAR_MATCHES(enc, ptr, ASCII_s))
/* nameMatchesAscii: compares the encoded name in [ptr1, end1) with the
   NUL-terminated ASCII string ptr2.
   Visible behaviour: for each character of ptr2, returns 0 if fewer
   than MINBPC(enc) bytes remain in the encoded name, and takes a
   mismatch path when the characters differ (its return is not visible
   here, presumably 0).  After all of ptr2 has matched, the result is
   non-zero only if the encoded name is fully consumed
   (ptr1 == end1), so a longer name does not match. */
1700 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1701 const char *end1, const char *ptr2) {
1703 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1704 if (end1 - ptr1 < MINBPC(enc)) {
1705 /* This line cannot be executed. The incoming data has already
1706 * been tokenized once, so incomplete characters like this have
1707 * already been eliminated from the input. Retaining the
1708 * paranoia check is still valuable, however.
1710 return 0; /* LCOV_EXCL_LINE */
1712 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1715 return ptr1 == end1;
/* nameLength: returns the length in bytes of the name starting at ptr,
   computed as the distance between the final scan position and the
   starting pointer, cast to int.
   NOTE(review): the scan loop, the LEAD_CASE body and the case labels
   that decide where the name ends are missing from this listing. */
1718 static int PTRFASTCALL
1719 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1720 const char *start = ptr;
1722 switch (BYTE_TYPE(enc, ptr)) {
1723 # define LEAD_CASE(n) \
1743 return (int)(ptr - start);
/* skipS: returns a pointer derived from ptr after classifying
   characters by byte type.
   NOTE(review): only the signature and the switch header are visible
   in this listing -- presumably this skips leading whitespace and
   returns the first non-whitespace position; confirm against the full
   source. */
1748 static const char *PTRFASTCALL
1749 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1751 switch (BYTE_TYPE(enc, ptr)) {
/* updatePosition: advances the position record `pos` across the text
   in [ptr, end).
   Visible behaviour: each character is classified by byte type; two
   branches reset pos->columnNumber to (XML_Size)-1, one of them after
   checking whether the next character is BT_LF (so a CR/LF pair is
   handled together), and pos->columnNumber is incremented once per
   iteration on the visible path -- the -1 therefore becomes column 0
   for the character after a line break.
   NOTE(review): the `pos` parameter declaration, the line-number
   updates, the LEAD_CASE body and the case labels are missing from
   this listing. */
1764 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1766 while (HAS_CHAR(enc, ptr, end)) {
1767 switch (BYTE_TYPE(enc, ptr)) {
1768 # define LEAD_CASE(n) \
1777 pos->columnNumber = (XML_Size)-1;
1784 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1786 pos->columnNumber = (XML_Size)-1;
1792 pos->columnNumber++;
/* Remove the helper macros defined by this file (presumably so that a
   later inclusion with different encoding macros can define them
   again), then close the XML_TOK_IMPL_C conditional. */
1796 # undef DO_LEAD_CASE
1797 # undef MULTIBYTE_CASES
1798 # undef INVALID_CASES
1799 # undef CHECK_NAME_CASE
1800 # undef CHECK_NAME_CASES
1801 # undef CHECK_NMSTRT_CASE
1802 # undef CHECK_NMSTRT_CASES
1804 #endif /* XML_TOK_IMPL_C */