1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2 See the file COPYING for copying permission.
5 /* This file is included! */
/* Default used when the including file has not supplied its own
   IS_INVALID_CHAR: the macro expands to the constant 0, so the checks
   built on it never report a character as invalid. */
8 #ifndef IS_INVALID_CHAR
9 #define IS_INVALID_CHAR(enc, ptr, n) (0)
/* Statements for an n-unit character at ptr.  Two exits appear below:
   a return of XML_TOK_PARTIAL_CHAR (its guard is not shown; presumably
   "fewer than n bytes remain" -- TODO confirm), and a return of
   XML_TOK_INVALID with *nextTokPtr set to ptr when
   IS_INVALID_CHAR(enc, ptr, n) is true.
   `enc` is not a macro parameter; it is taken from the scope in which
   the macro is expanded. */
12 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
15 return XML_TOK_PARTIAL_CHAR; \
16 if (IS_INVALID_CHAR(enc, ptr, n)) { \
17 *(nextTokPtr) = (ptr); \
18 return XML_TOK_INVALID; \
/* Expands INVALID_LEAD_CASE for n = 2, 3 and 4, followed by statements
   that store ptr in *nextTokPtr and return XML_TOK_INVALID.  The scanners
   in this file place it directly inside a switch on BYTE_TYPE(enc, ptr). */
23 #define INVALID_CASES(ptr, nextTokPtr) \
24 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
25 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
26 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
30 *(nextTokPtr) = (ptr); \
31 return XML_TOK_INVALID;
/* Statements for an n-unit name character at ptr: a return of
   XML_TOK_PARTIAL_CHAR (guard not shown; TODO confirm it is a length
   check against end) and a return of XML_TOK_INVALID when
   IS_NAME_CHAR(enc, ptr, n) is false. */
33 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
36 return XML_TOK_PARTIAL_CHAR; \
37 if (!IS_NAME_CHAR(enc, ptr, n)) { \
39 return XML_TOK_INVALID; \
/* Name-character checks that the scanners place inside a switch on the
   byte type: returns XML_TOK_INVALID when IS_NAME_CHAR_MINBPC(enc, ptr)
   is false, then expands CHECK_NAME_CASE for n = 2, 3 and 4. */
44 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
46 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
48 return XML_TOK_INVALID; \
57 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
58 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
59 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
/* Statements for an n-unit name-start character at ptr: a return of
   XML_TOK_PARTIAL_CHAR (guard not shown; TODO confirm it is a length
   check against end) and a return of XML_TOK_INVALID when
   IS_NMSTRT_CHAR(enc, ptr, n) is false. */
61 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
64 return XML_TOK_PARTIAL_CHAR; \
65 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
67 return XML_TOK_INVALID; \
/* Name-start checks that the scanners place inside a switch on the byte
   type: returns XML_TOK_INVALID when IS_NMSTRT_CHAR_MINBPC(enc, ptr) is
   false, then expands CHECK_NMSTRT_CASE for n = 2, 3 and 4. */
72 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
74 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
76 return XML_TOK_INVALID; \
82 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
83 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
84 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
/* Identity mapping for function names: PREFIX(foo) expands to foo.
   Every function in this file is named through PREFIX(...). */
87 #define PREFIX(ident) ident
/* True when the distance from ptr to end is at least
   count * MINBPC(enc).  The arguments are substituted without extra
   parentheses, so callers pass simple expressions. */
91 #define HAS_CHARS(enc, ptr, end, count) \
92 (end - ptr >= count * MINBPC(enc))
/* HAS_CHARS with a count of 1: true when at least MINBPC(enc) remains
   between ptr and end. */
94 #define HAS_CHAR(enc, ptr, end) \
95 HAS_CHARS(enc, ptr, end, 1)
/* Makes the enclosing function return XML_TOK_PARTIAL when
   HAS_CHARS(enc, ptr, end, count) is false. */
97 #define REQUIRE_CHARS(enc, ptr, end, count) \
99 if (! HAS_CHARS(enc, ptr, end, count)) { \
100 return XML_TOK_PARTIAL; \
/* REQUIRE_CHARS with a count of 1. */
104 #define REQUIRE_CHAR(enc, ptr, end) \
105 REQUIRE_CHARS(enc, ptr, end, 1)
108 /* ptr points to character following "<!-" */
/* Scans the rest of a comment in [ptr, end).
   Results that appear below:
   - XML_TOK_INVALID when the first character is not ASCII_MINUS, when
     INVALID_CASES rejects a character, or when the ASCII_MINUS test
     inside the loop succeeds but the next character is not ASCII_GT;
   - XML_TOK_COMMENT, with *nextTokPtr one MINBPC(enc) unit past that
     ASCII_GT;
   - XML_TOK_PARTIAL when the input ends first (directly, or through
     REQUIRE_CHAR). */
111 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112 const char *end, const char **nextTokPtr)
114 if (HAS_CHAR(enc, ptr, end)) {
115 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
117 return XML_TOK_INVALID;
120 while (HAS_CHAR(enc, ptr, end)) {
121 switch (BYTE_TYPE(enc, ptr)) {
122 INVALID_CASES(ptr, nextTokPtr)
125 REQUIRE_CHAR(enc, ptr, end);
126 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
128 REQUIRE_CHAR(enc, ptr, end);
129 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
131 return XML_TOK_INVALID;
133 *nextTokPtr = ptr + MINBPC(enc);
134 return XML_TOK_COMMENT;
143 return XML_TOK_PARTIAL;
146 /* ptr points to character following "<!" */
/* Scans the opening of a markup declaration in [ptr, end).
   The first switch dispatches on BYTE_TYPE of the current character: its
   branches hand off to PREFIX(scanComment), return XML_TOK_COND_SECT_OPEN
   with *nextTokPtr one unit past ptr, or return XML_TOK_INVALID.
   The loop that follows returns XML_TOK_DECL_OPEN on BT_S, BT_CR or
   BT_LF and XML_TOK_INVALID on the rejecting branches.  XML_TOK_PARTIAL
   is returned when the input runs out (directly or via REQUIRE_CHAR(S)). */
149 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150 const char *end, const char **nextTokPtr)
152 REQUIRE_CHAR(enc, ptr, end);
153 switch (BYTE_TYPE(enc, ptr)) {
155 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
157 *nextTokPtr = ptr + MINBPC(enc);
158 return XML_TOK_COND_SECT_OPEN;
165 return XML_TOK_INVALID;
167 while (HAS_CHAR(enc, ptr, end)) {
168 switch (BYTE_TYPE(enc, ptr)) {
/* Two characters are needed because the one after ptr is inspected. */
170 REQUIRE_CHARS(enc, ptr, end, 2);
171 /* don't allow <!ENTITY% foo "whatever"> */
172 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
175 return XML_TOK_INVALID;
178 case BT_S: case BT_CR: case BT_LF:
180 return XML_TOK_DECL_OPEN;
187 return XML_TOK_INVALID;
190 return XML_TOK_PARTIAL;
/* Classifies a processing-instruction target occupying [ptr, end).
   *tokPtr starts as XML_TOK_PI.  After a test that the target is exactly
   three MINBPC(enc) units long and three successive switches on
   BYTE_TO_ASCII, *tokPtr is set to XML_TOK_XML_DECL.  The case labels are
   not shown; presumably they spell "xml" -- TODO confirm.
   PREFIX(scanPi) treats a zero result from this function as an invalid
   target.
   NOTE(review): enc is wrapped in UNUSED_P yet is named in the macro
   calls below; whether those macros evaluate it depends on the including
   file. */
194 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
195 const char *end, int *tokPtr)
198 *tokPtr = XML_TOK_PI;
199 if (end - ptr != MINBPC(enc)*3)
201 switch (BYTE_TO_ASCII(enc, ptr)) {
211 switch (BYTE_TO_ASCII(enc, ptr)) {
221 switch (BYTE_TO_ASCII(enc, ptr)) {
232 *tokPtr = XML_TOK_XML_DECL;
236 /* ptr points to character following "<?" */
/* Scans a processing instruction in [ptr, end).
   target remembers where the PI name starts.  The first character of the
   name goes through CHECK_NMSTRT_CASES and later ones through
   CHECK_NAME_CASES.  At both places where the name ends,
   PREFIX(checkPiTarget) is called on [target, ptr) and a zero result
   yields XML_TOK_INVALID.
   After BT_S, BT_CR or BT_LF an inner loop walks the PI body, rejecting
   characters through INVALID_CASES and testing for ASCII_GT; the other
   path tests for ASCII_GT right after the name.  In both, *nextTokPtr is
   set one unit past the ASCII_GT.  XML_TOK_PARTIAL is returned when the
   input is exhausted. */
239 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
240 const char *end, const char **nextTokPtr)
243 const char *target = ptr;
244 REQUIRE_CHAR(enc, ptr, end);
245 switch (BYTE_TYPE(enc, ptr)) {
246 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
249 return XML_TOK_INVALID;
251 while (HAS_CHAR(enc, ptr, end)) {
252 switch (BYTE_TYPE(enc, ptr)) {
253 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
254 case BT_S: case BT_CR: case BT_LF:
255 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
257 return XML_TOK_INVALID;
260 while (HAS_CHAR(enc, ptr, end)) {
261 switch (BYTE_TYPE(enc, ptr)) {
262 INVALID_CASES(ptr, nextTokPtr)
265 REQUIRE_CHAR(enc, ptr, end);
266 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
267 *nextTokPtr = ptr + MINBPC(enc);
276 return XML_TOK_PARTIAL;
278 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
280 return XML_TOK_INVALID;
283 REQUIRE_CHAR(enc, ptr, end);
284 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
285 *nextTokPtr = ptr + MINBPC(enc);
291 return XML_TOK_INVALID;
294 return XML_TOK_PARTIAL;
/* Matches the six characters held in CDATA_LSQB (C, D, A, T, A, LSQB)
   starting at ptr, advancing one MINBPC(enc) unit per character.
   Returns XML_TOK_PARTIAL (via REQUIRE_CHARS) when fewer than six
   characters are available, XML_TOK_INVALID on the first mismatch, and
   XML_TOK_CDATA_SECT_OPEN after the loop. */
298 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
299 const char *end, const char **nextTokPtr)
301 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302 ASCII_T, ASCII_A, ASCII_LSQB };
305 REQUIRE_CHARS(enc, ptr, end, 6);
306 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
309 return XML_TOK_INVALID;
313 return XML_TOK_CDATA_SECT_OPEN;
/* Returns the next token inside a CDATA section in [ptr, end).
   When MINBPC(enc) > 1, a length that is not a multiple of MINBPC(enc)
   is rounded down with n &= ~(MINBPC(enc) - 1); XML_TOK_PARTIAL is one
   outcome of that path.
   First switch, on the leading character:
   - if the next character is ASCII_RSQB and the one after it is ASCII_GT,
     returns XML_TOK_CDATA_SECT_CLOSE with *nextTokPtr past the ASCII_GT;
   - the line-ending branches return XML_TOK_DATA_NEWLINE (one of them
     first tests whether the next character is BT_LF);
   - INVALID_CASES rejects invalid characters.
   The loop afterwards extends a run of data and returns
   XML_TOK_DATA_CHARS; its local LEAD_CASE ends the run when fewer than n
   bytes remain or IS_INVALID_CHAR is true. */
317 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
318 const char *end, const char **nextTokPtr)
322 if (MINBPC(enc) > 1) {
323 size_t n = end - ptr;
324 if (n & (MINBPC(enc) - 1)) {
325 n &= ~(MINBPC(enc) - 1);
327 return XML_TOK_PARTIAL;
331 switch (BYTE_TYPE(enc, ptr)) {
334 REQUIRE_CHAR(enc, ptr, end);
335 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
338 REQUIRE_CHAR(enc, ptr, end);
339 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
343 *nextTokPtr = ptr + MINBPC(enc);
344 return XML_TOK_CDATA_SECT_CLOSE;
347 REQUIRE_CHAR(enc, ptr, end);
348 if (BYTE_TYPE(enc, ptr) == BT_LF)
351 return XML_TOK_DATA_NEWLINE;
353 *nextTokPtr = ptr + MINBPC(enc);
354 return XML_TOK_DATA_NEWLINE;
355 INVALID_CASES(ptr, nextTokPtr)
360 while (HAS_CHAR(enc, ptr, end)) {
361 switch (BYTE_TYPE(enc, ptr)) {
362 #define LEAD_CASE(n) \
364 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
366 return XML_TOK_DATA_CHARS; \
370 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
379 return XML_TOK_DATA_CHARS;
386 return XML_TOK_DATA_CHARS;
389 /* ptr points to character following "</" */
/* Scans an end tag in [ptr, end).
   The first character goes through CHECK_NMSTRT_CASES and the rest of
   the name through CHECK_NAME_CASES.  After BT_S, BT_CR or BT_LF an inner
   loop accepts further whitespace, returns XML_TOK_END_TAG with
   *nextTokPtr one unit past the closing character, or returns
   XML_TOK_INVALID; if the input ends inside that loop the result is
   XML_TOK_PARTIAL.  The tag may also be closed directly after the name,
   again giving XML_TOK_END_TAG.  Any other character is XML_TOK_INVALID,
   and running out of input is XML_TOK_PARTIAL. */
392 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
393 const char *end, const char **nextTokPtr)
395 REQUIRE_CHAR(enc, ptr, end);
396 switch (BYTE_TYPE(enc, ptr)) {
397 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
400 return XML_TOK_INVALID;
402 while (HAS_CHAR(enc, ptr, end)) {
403 switch (BYTE_TYPE(enc, ptr)) {
404 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
405 case BT_S: case BT_CR: case BT_LF:
406 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
407 switch (BYTE_TYPE(enc, ptr)) {
408 case BT_S: case BT_CR: case BT_LF:
411 *nextTokPtr = ptr + MINBPC(enc);
412 return XML_TOK_END_TAG;
415 return XML_TOK_INVALID;
418 return XML_TOK_PARTIAL;
421 /* no need to check qname syntax here,
422 since end-tag must match exactly */
427 *nextTokPtr = ptr + MINBPC(enc);
428 return XML_TOK_END_TAG;
431 return XML_TOK_INVALID;
434 return XML_TOK_PARTIAL;
437 /* ptr points to character following "&#X" */
/* Scans the digits of a hexadecimal character reference in [ptr, end).
   The first character is classified by BYTE_TYPE and rejected with
   XML_TOK_INVALID unless it is an accepted type (labels not shown;
   presumably hex digits -- TODO confirm).  The for loop then advances one
   MINBPC(enc) unit at a time and either returns XML_TOK_CHAR_REF with
   *nextTokPtr one unit past the current character, or XML_TOK_INVALID.
   XML_TOK_PARTIAL is returned when the input ends first. */
440 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441 const char *end, const char **nextTokPtr)
443 if (HAS_CHAR(enc, ptr, end)) {
444 switch (BYTE_TYPE(enc, ptr)) {
450 return XML_TOK_INVALID;
452 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
453 switch (BYTE_TYPE(enc, ptr)) {
458 *nextTokPtr = ptr + MINBPC(enc);
459 return XML_TOK_CHAR_REF;
462 return XML_TOK_INVALID;
466 return XML_TOK_PARTIAL;
469 /* ptr points to character following "&#" */
/* Scans a numeric character reference in [ptr, end).
   If the first character is ASCII_x the work is delegated to
   PREFIX(scanHexCharRef), starting one unit further on.  Otherwise the
   first character is classified by BYTE_TYPE and may be rejected with
   XML_TOK_INVALID; the for loop then advances one MINBPC(enc) unit at a
   time and returns XML_TOK_CHAR_REF (with *nextTokPtr one unit past the
   current character) or XML_TOK_INVALID.  XML_TOK_PARTIAL is returned
   when the input ends first. */
472 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473 const char *end, const char **nextTokPtr)
475 if (HAS_CHAR(enc, ptr, end)) {
476 if (CHAR_MATCHES(enc, ptr, ASCII_x))
477 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478 switch (BYTE_TYPE(enc, ptr)) {
483 return XML_TOK_INVALID;
485 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
486 switch (BYTE_TYPE(enc, ptr)) {
490 *nextTokPtr = ptr + MINBPC(enc);
491 return XML_TOK_CHAR_REF;
494 return XML_TOK_INVALID;
498 return XML_TOK_PARTIAL;
501 /* ptr points to character following "&" */
/* Scans a reference in [ptr, end).
   First character: CHECK_NMSTRT_CASES for a name start; one branch
   delegates to PREFIX(scanCharRef) starting one unit further on; other
   characters give XML_TOK_INVALID.
   The loop then consumes name characters through CHECK_NAME_CASES and
   returns XML_TOK_ENTITY_REF with *nextTokPtr one unit past the
   terminating character, or XML_TOK_INVALID.  XML_TOK_PARTIAL is
   returned when the input ends first. */
504 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505 const char **nextTokPtr)
507 REQUIRE_CHAR(enc, ptr, end);
508 switch (BYTE_TYPE(enc, ptr)) {
509 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
511 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
514 return XML_TOK_INVALID;
516 while (HAS_CHAR(enc, ptr, end)) {
517 switch (BYTE_TYPE(enc, ptr)) {
518 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
520 *nextTokPtr = ptr + MINBPC(enc);
521 return XML_TOK_ENTITY_REF;
524 return XML_TOK_INVALID;
527 return XML_TOK_PARTIAL;
530 /* ptr points to character following first character of attribute name */
/* Scans the attributes of a start tag in [ptr, end).
   Outline of the code below:
   - attribute-name characters are checked with CHECK_NAME_CASES and
     name starts with CHECK_NMSTRT_CASES;
   - the character that opens a value is read into `open` and tested
     against BT_QUOT and BT_APOS;
   - inside a value, INVALID_CASES rejects invalid characters, and a
     reference is scanned by PREFIX(scanRef), which writes the position
     after the reference back into ptr through its last argument;
   - after the closing quote the tag may continue with another name
     start or whitespace, end with XML_TOK_START_TAG_WITH_ATTS, or (after
     an ASCII_GT test) end with XML_TOK_EMPTY_ELEMENT_WITH_ATTS.
   XML_TOK_INVALID is returned on the rejecting branches and
   XML_TOK_PARTIAL when the input is exhausted. */
533 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
534 const char **nextTokPtr)
539 while (HAS_CHAR(enc, ptr, end)) {
540 switch (BYTE_TYPE(enc, ptr)) {
541 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
546 return XML_TOK_INVALID;
550 REQUIRE_CHAR(enc, ptr, end);
551 switch (BYTE_TYPE(enc, ptr)) {
552 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
555 return XML_TOK_INVALID;
559 case BT_S: case BT_CR: case BT_LF:
564 REQUIRE_CHAR(enc, ptr, end);
565 t = BYTE_TYPE(enc, ptr);
575 return XML_TOK_INVALID;
587 REQUIRE_CHAR(enc, ptr, end);
588 open = BYTE_TYPE(enc, ptr);
589 if (open == BT_QUOT || open == BT_APOS)
598 return XML_TOK_INVALID;
602 /* in attribute value */
605 REQUIRE_CHAR(enc, ptr, end);
606 t = BYTE_TYPE(enc, ptr);
610 INVALID_CASES(ptr, nextTokPtr)
613 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
615 if (tok == XML_TOK_INVALID)
623 return XML_TOK_INVALID;
630 REQUIRE_CHAR(enc, ptr, end);
631 switch (BYTE_TYPE(enc, ptr)) {
642 return XML_TOK_INVALID;
644 /* ptr points to closing quote */
647 REQUIRE_CHAR(enc, ptr, end);
648 switch (BYTE_TYPE(enc, ptr)) {
649 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650 case BT_S: case BT_CR: case BT_LF:
654 *nextTokPtr = ptr + MINBPC(enc);
655 return XML_TOK_START_TAG_WITH_ATTS;
659 REQUIRE_CHAR(enc, ptr, end);
660 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
662 return XML_TOK_INVALID;
664 *nextTokPtr = ptr + MINBPC(enc);
665 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
668 return XML_TOK_INVALID;
676 return XML_TOK_INVALID;
679 return XML_TOK_PARTIAL;
682 /* ptr points to character following "<" */
/* Scans markup that begins at the character after ptr's predecessor,
   i.e. ptr is the first character of the markup body in [ptr, end).
   The first switch either accepts a name start (CHECK_NMSTRT_CASES) or
   dispatches: a nested switch delegates to PREFIX(scanComment) or
   PREFIX(scanCdataSection), and sibling branches delegate to
   PREFIX(scanPi) and PREFIX(scanEndTag); unrecognised characters give
   XML_TOK_INVALID.
   For a start tag, the loop checks name characters with
   CHECK_NAME_CASES.  After whitespace it either hands the rest of the
   tag to PREFIX(scanAtts) or returns XML_TOK_PARTIAL if the input ends.
   Without attributes it returns XML_TOK_START_TAG_NO_ATTS or, after an
   ASCII_GT test, XML_TOK_EMPTY_ELEMENT_NO_ATTS; in both cases
   *nextTokPtr is one unit past the current character. */
685 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686 const char **nextTokPtr)
691 REQUIRE_CHAR(enc, ptr, end);
692 switch (BYTE_TYPE(enc, ptr)) {
693 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
696 REQUIRE_CHAR(enc, ptr, end);
697 switch (BYTE_TYPE(enc, ptr)) {
699 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
701 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
705 return XML_TOK_INVALID;
707 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
709 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
712 return XML_TOK_INVALID;
717 /* we have a start-tag */
718 while (HAS_CHAR(enc, ptr, end)) {
719 switch (BYTE_TYPE(enc, ptr)) {
720 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
725 return XML_TOK_INVALID;
729 REQUIRE_CHAR(enc, ptr, end);
730 switch (BYTE_TYPE(enc, ptr)) {
731 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
734 return XML_TOK_INVALID;
738 case BT_S: case BT_CR: case BT_LF:
741 while (HAS_CHAR(enc, ptr, end)) {
742 switch (BYTE_TYPE(enc, ptr)) {
743 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
748 case BT_S: case BT_CR: case BT_LF:
753 return XML_TOK_INVALID;
755 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
757 return XML_TOK_PARTIAL;
761 *nextTokPtr = ptr + MINBPC(enc);
762 return XML_TOK_START_TAG_NO_ATTS;
766 REQUIRE_CHAR(enc, ptr, end);
767 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
769 return XML_TOK_INVALID;
771 *nextTokPtr = ptr + MINBPC(enc);
772 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
775 return XML_TOK_INVALID;
778 return XML_TOK_PARTIAL;
/* Returns the next token of element content in [ptr, end).
   When MINBPC(enc) > 1, a length that is not a multiple of MINBPC(enc)
   is rounded down with n &= ~(MINBPC(enc) - 1); XML_TOK_PARTIAL is one
   outcome of that path.
   First switch, on the leading character:
   - markup and references are delegated to PREFIX(scanLt) and
     PREFIX(scanRef), each starting one unit further on;
   - a line ending at the very end of the input gives
     XML_TOK_TRAILING_CR, otherwise XML_TOK_DATA_NEWLINE (a following
     BT_LF is tested for);
   - the ASCII_RSQB / ASCII_GT tests return XML_TOK_TRAILING_RSQB when
     the input ends in the middle of the sequence and XML_TOK_INVALID
     when the whole sequence is present;
   - INVALID_CASES rejects invalid characters.
   The loop afterwards extends a run of data and returns
   XML_TOK_DATA_CHARS.  Inside it, the same ASCII_RSQB, ASCII_RSQB,
   ASCII_GT sequence is looked for two and three characters ahead and
   reported as XML_TOK_INVALID with *nextTokPtr two units past ptr. */
782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783 const char **nextTokPtr)
787 if (MINBPC(enc) > 1) {
788 size_t n = end - ptr;
789 if (n & (MINBPC(enc) - 1)) {
790 n &= ~(MINBPC(enc) - 1);
792 return XML_TOK_PARTIAL;
796 switch (BYTE_TYPE(enc, ptr)) {
798 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
800 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
803 if (! HAS_CHAR(enc, ptr, end))
804 return XML_TOK_TRAILING_CR;
805 if (BYTE_TYPE(enc, ptr) == BT_LF)
808 return XML_TOK_DATA_NEWLINE;
810 *nextTokPtr = ptr + MINBPC(enc);
811 return XML_TOK_DATA_NEWLINE;
814 if (! HAS_CHAR(enc, ptr, end))
815 return XML_TOK_TRAILING_RSQB;
816 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
819 if (! HAS_CHAR(enc, ptr, end))
820 return XML_TOK_TRAILING_RSQB;
821 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
826 return XML_TOK_INVALID;
827 INVALID_CASES(ptr, nextTokPtr)
832 while (HAS_CHAR(enc, ptr, end)) {
833 switch (BYTE_TYPE(enc, ptr)) {
834 #define LEAD_CASE(n) \
836 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
838 return XML_TOK_DATA_CHARS; \
842 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
845 if (HAS_CHARS(enc, ptr, end, 2)) {
846 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
850 if (HAS_CHARS(enc, ptr, end, 3)) {
851 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
855 *nextTokPtr = ptr + 2*MINBPC(enc);
856 return XML_TOK_INVALID;
868 return XML_TOK_DATA_CHARS;
875 return XML_TOK_DATA_CHARS;
878 /* ptr points to character following "%" */
/* Scans what follows a percent sign in [ptr, end).
   First character: a name start is accepted through
   CHECK_NMSTRT_CASES; BT_S, BT_LF, BT_CR or BT_PERCNT returns
   XML_TOK_PERCENT; other characters give XML_TOK_INVALID.
   The loop then consumes name characters through CHECK_NAME_CASES and
   returns XML_TOK_PARAM_ENTITY_REF with *nextTokPtr one unit past the
   terminating character, or XML_TOK_INVALID.  XML_TOK_PARTIAL is
   returned when the input ends first. */
881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882 const char **nextTokPtr)
884 REQUIRE_CHAR(enc, ptr, end);
885 switch (BYTE_TYPE(enc, ptr)) {
886 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
889 return XML_TOK_PERCENT;
892 return XML_TOK_INVALID;
894 while (HAS_CHAR(enc, ptr, end)) {
895 switch (BYTE_TYPE(enc, ptr)) {
896 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
898 *nextTokPtr = ptr + MINBPC(enc);
899 return XML_TOK_PARAM_ENTITY_REF;
902 return XML_TOK_INVALID;
905 return XML_TOK_PARTIAL;
/* Scans a name in [ptr, end) on behalf of PREFIX(prologTok), which calls
   it with ptr one unit past the current character.
   The first character must pass CHECK_NMSTRT_CASES, otherwise the result
   is XML_TOK_INVALID.  The loop consumes name characters through
   CHECK_NAME_CASES and returns XML_TOK_POUND_NAME when it reaches BT_CR,
   BT_LF, BT_S, BT_RPAR, BT_GT, BT_PERCNT or BT_VERBAR; other characters
   give XML_TOK_INVALID.
   If the input ends while still inside the name the NEGATED token,
   -XML_TOK_POUND_NAME, is returned rather than XML_TOK_PARTIAL. */
909 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910 const char **nextTokPtr)
912 REQUIRE_CHAR(enc, ptr, end);
913 switch (BYTE_TYPE(enc, ptr)) {
914 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
917 return XML_TOK_INVALID;
919 while (HAS_CHAR(enc, ptr, end)) {
920 switch (BYTE_TYPE(enc, ptr)) {
921 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922 case BT_CR: case BT_LF: case BT_S:
923 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
925 return XML_TOK_POUND_NAME;
928 return XML_TOK_INVALID;
931 return -XML_TOK_POUND_NAME;
/* Scans a quoted literal in [ptr, end); `open` is the byte type of the
   opening quote (PREFIX(prologTok) passes BT_QUOT or BT_APOS).
   Each character's byte type is read into t; INVALID_CASES rejects
   invalid characters.  Once the literal has been closed:
   - if no character follows, the NEGATED token -XML_TOK_LITERAL is
     returned;
   - if the next character is BT_S, BT_CR, BT_LF, BT_GT, BT_PERCNT or
     BT_LSQB, XML_TOK_LITERAL is returned;
   - anything else is XML_TOK_INVALID.
   XML_TOK_PARTIAL is returned when the input ends before the literal is
   closed. */
935 PREFIX(scanLit)(int open, const ENCODING *enc,
936 const char *ptr, const char *end,
937 const char **nextTokPtr)
939 while (HAS_CHAR(enc, ptr, end)) {
940 int t = BYTE_TYPE(enc, ptr);
942 INVALID_CASES(ptr, nextTokPtr)
948 if (! HAS_CHAR(enc, ptr, end))
949 return -XML_TOK_LITERAL;
951 switch (BYTE_TYPE(enc, ptr)) {
952 case BT_S: case BT_CR: case BT_LF:
953 case BT_GT: case BT_PERCNT: case BT_LSQB:
954 return XML_TOK_LITERAL;
956 return XML_TOK_INVALID;
963 return XML_TOK_PARTIAL;
/* Returns the next token of the prolog / DTD in [ptr, end).
   When MINBPC(enc) > 1, a length that is not a multiple of MINBPC(enc)
   is rounded down with n &= ~(MINBPC(enc) - 1); XML_TOK_PARTIAL is one
   outcome of that path.
   The main switch on BYTE_TYPE(enc, ptr) does the following:
   - quoted literals go to PREFIX(scanLit) with BT_QUOT or BT_APOS;
   - markup goes to PREFIX(scanDecl) or PREFIX(scanPi), or yields
     XML_TOK_INSTANCE_START with *nextTokPtr set one unit BEFORE ptr;
   - whitespace yields XML_TOK_PROLOG_S, taking care not to split a
     CR/LF pair; a lone character at the very end yields the negated
     token -XML_TOK_PROLOG_S;
   - a percent sign goes to PREFIX(scanPercent) and a pound sign to
     PREFIX(scanPoundName);
   - punctuation yields XML_TOK_COMMA, XML_TOK_OPEN_BRACKET,
     XML_TOK_CLOSE_BRACKET / XML_TOK_COND_SECT_CLOSE, XML_TOK_OPEN_PAREN,
     the XML_TOK_CLOSE_PAREN family and XML_TOK_DECL_CLOSE; the close
     bracket and close paren return negated tokens when nothing follows;
   - name and name-token starts set tok to XML_TOK_NAME or
     XML_TOK_NMTOKEN (the local LEAD_CASE does this for multi-unit
     characters).
   The final loop extends the name.  tok may become
   XML_TOK_PREFIXED_NAME or fall back to XML_TOK_NMTOKEN, and a trailing
   occurrence indicator gives XML_TOK_NAME_PLUS, XML_TOK_NAME_ASTERISK or
   XML_TOK_NAME_QUESTION, unless tok is XML_TOK_NMTOKEN, in which case
   the result is XML_TOK_INVALID. */
967 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968 const char **nextTokPtr)
973 if (MINBPC(enc) > 1) {
974 size_t n = end - ptr;
975 if (n & (MINBPC(enc) - 1)) {
976 n &= ~(MINBPC(enc) - 1);
978 return XML_TOK_PARTIAL;
982 switch (BYTE_TYPE(enc, ptr)) {
984 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
986 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
990 REQUIRE_CHAR(enc, ptr, end);
991 switch (BYTE_TYPE(enc, ptr)) {
993 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
995 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1002 *nextTokPtr = ptr - MINBPC(enc);
1003 return XML_TOK_INSTANCE_START;
1006 return XML_TOK_INVALID;
1009 if (ptr + MINBPC(enc) == end) {
1011 /* indicate that this might be part of a CR/LF pair */
1012 return -XML_TOK_PROLOG_S;
1015 case BT_S: case BT_LF:
1018 if (! HAS_CHAR(enc, ptr, end))
1020 switch (BYTE_TYPE(enc, ptr)) {
1021 case BT_S: case BT_LF:
1024 /* don't split CR/LF pair */
1025 if (ptr + MINBPC(enc) != end)
1030 return XML_TOK_PROLOG_S;
1034 return XML_TOK_PROLOG_S;
1036 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1038 *nextTokPtr = ptr + MINBPC(enc);
1039 return XML_TOK_COMMA;
1041 *nextTokPtr = ptr + MINBPC(enc);
1042 return XML_TOK_OPEN_BRACKET;
1045 if (! HAS_CHAR(enc, ptr, end))
1046 return -XML_TOK_CLOSE_BRACKET;
1047 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1048 REQUIRE_CHARS(enc, ptr, end, 2);
1049 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050 *nextTokPtr = ptr + 2*MINBPC(enc);
1051 return XML_TOK_COND_SECT_CLOSE;
1055 return XML_TOK_CLOSE_BRACKET;
1057 *nextTokPtr = ptr + MINBPC(enc);
1058 return XML_TOK_OPEN_PAREN;
1061 if (! HAS_CHAR(enc, ptr, end))
1062 return -XML_TOK_CLOSE_PAREN;
1063 switch (BYTE_TYPE(enc, ptr)) {
1065 *nextTokPtr = ptr + MINBPC(enc);
1066 return XML_TOK_CLOSE_PAREN_ASTERISK;
1068 *nextTokPtr = ptr + MINBPC(enc);
1069 return XML_TOK_CLOSE_PAREN_QUESTION;
1071 *nextTokPtr = ptr + MINBPC(enc);
1072 return XML_TOK_CLOSE_PAREN_PLUS;
1073 case BT_CR: case BT_LF: case BT_S:
1074 case BT_GT: case BT_COMMA: case BT_VERBAR:
1077 return XML_TOK_CLOSE_PAREN;
1080 return XML_TOK_INVALID;
1082 *nextTokPtr = ptr + MINBPC(enc);
1085 *nextTokPtr = ptr + MINBPC(enc);
1086 return XML_TOK_DECL_CLOSE;
1088 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089 #define LEAD_CASE(n) \
1090 case BT_LEAD ## n: \
1091 if (end - ptr < n) \
1092 return XML_TOK_PARTIAL_CHAR; \
1093 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1095 tok = XML_TOK_NAME; \
1098 if (IS_NAME_CHAR(enc, ptr, n)) { \
1100 tok = XML_TOK_NMTOKEN; \
1103 *nextTokPtr = ptr; \
1104 return XML_TOK_INVALID;
1105 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1118 tok = XML_TOK_NMTOKEN;
1122 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1127 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1129 tok = XML_TOK_NMTOKEN;
1135 return XML_TOK_INVALID;
1137 while (HAS_CHAR(enc, ptr, end)) {
1138 switch (BYTE_TYPE(enc, ptr)) {
1139 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140 case BT_GT: case BT_RPAR: case BT_COMMA:
1141 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142 case BT_S: case BT_CR: case BT_LF:
1150 REQUIRE_CHAR(enc, ptr, end);
1151 tok = XML_TOK_PREFIXED_NAME;
1152 switch (BYTE_TYPE(enc, ptr)) {
1153 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1155 tok = XML_TOK_NMTOKEN;
1159 case XML_TOK_PREFIXED_NAME:
1160 tok = XML_TOK_NMTOKEN;
1166 if (tok == XML_TOK_NMTOKEN) {
1168 return XML_TOK_INVALID;
1170 *nextTokPtr = ptr + MINBPC(enc);
1171 return XML_TOK_NAME_PLUS;
1173 if (tok == XML_TOK_NMTOKEN) {
1175 return XML_TOK_INVALID;
1177 *nextTokPtr = ptr + MINBPC(enc);
1178 return XML_TOK_NAME_ASTERISK;
1180 if (tok == XML_TOK_NMTOKEN) {
1182 return XML_TOK_INVALID;
1184 *nextTokPtr = ptr + MINBPC(enc);
1185 return XML_TOK_NAME_QUESTION;
1188 return XML_TOK_INVALID;
/* Returns the next token inside an attribute value in [ptr, end).
   XML_TOK_NONE and XML_TOK_PARTIAL are returned before the loop; the
   latter when not even one character is available (the XML_TOK_NONE
   test is not shown -- presumably ptr has reached end; TODO confirm).
   In the loop:
   - the local LEAD_CASE simply skips n bytes for multi-unit characters;
   - a reference goes to PREFIX(scanRef), or XML_TOK_DATA_CHARS is
     returned for the text collected before it;
   - one branch, commented as being for inside entity references,
     returns XML_TOK_INVALID;
   - line endings give XML_TOK_DATA_NEWLINE (testing for a following
     BT_LF) or XML_TOK_TRAILING_CR when nothing follows;
   - one branch returns XML_TOK_ATTRIBUTE_VALUE_S with *nextTokPtr one
     unit past ptr;
   - each special case otherwise returns XML_TOK_DATA_CHARS for the text
     collected so far.
   Falling out of the loop also returns XML_TOK_DATA_CHARS. */
1195 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1196 const char *end, const char **nextTokPtr)
1200 return XML_TOK_NONE;
1201 else if (! HAS_CHAR(enc, ptr, end))
1202 return XML_TOK_PARTIAL;
1204 while (HAS_CHAR(enc, ptr, end)) {
1205 switch (BYTE_TYPE(enc, ptr)) {
1206 #define LEAD_CASE(n) \
1207 case BT_LEAD ## n: ptr += n; break;
1208 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1212 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1214 return XML_TOK_DATA_CHARS;
1216 /* this is for inside entity references */
1218 return XML_TOK_INVALID;
1221 *nextTokPtr = ptr + MINBPC(enc);
1222 return XML_TOK_DATA_NEWLINE;
1225 return XML_TOK_DATA_CHARS;
1229 if (! HAS_CHAR(enc, ptr, end))
1230 return XML_TOK_TRAILING_CR;
1231 if (BYTE_TYPE(enc, ptr) == BT_LF)
1234 return XML_TOK_DATA_NEWLINE;
1237 return XML_TOK_DATA_CHARS;
1240 *nextTokPtr = ptr + MINBPC(enc);
1241 return XML_TOK_ATTRIBUTE_VALUE_S;
1244 return XML_TOK_DATA_CHARS;
1251 return XML_TOK_DATA_CHARS;
/* Returns the next token inside an entity value in [ptr, end).
   XML_TOK_NONE and XML_TOK_PARTIAL are returned before the loop; the
   latter when not even one character is available (the XML_TOK_NONE
   test is not shown -- presumably ptr has reached end; TODO confirm).
   In the loop:
   - the local LEAD_CASE simply skips n bytes for multi-unit characters;
   - a reference goes to PREFIX(scanRef);
   - a percent sign goes to PREFIX(scanPercent), and a bare
     XML_TOK_PERCENT result is turned into XML_TOK_INVALID;
   - line endings give XML_TOK_DATA_NEWLINE (testing for a following
     BT_LF) or XML_TOK_TRAILING_CR when nothing follows;
   - each special case otherwise returns XML_TOK_DATA_CHARS for the text
     collected so far.
   Falling out of the loop also returns XML_TOK_DATA_CHARS. */
1255 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1256 const char *end, const char **nextTokPtr)
1260 return XML_TOK_NONE;
1261 else if (! HAS_CHAR(enc, ptr, end))
1262 return XML_TOK_PARTIAL;
1264 while (HAS_CHAR(enc, ptr, end)) {
1265 switch (BYTE_TYPE(enc, ptr)) {
1266 #define LEAD_CASE(n) \
1267 case BT_LEAD ## n: ptr += n; break;
1268 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1272 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274 return XML_TOK_DATA_CHARS;
1277 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1279 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1282 return XML_TOK_DATA_CHARS;
1285 *nextTokPtr = ptr + MINBPC(enc);
1286 return XML_TOK_DATA_NEWLINE;
1289 return XML_TOK_DATA_CHARS;
1293 if (! HAS_CHAR(enc, ptr, end))
1294 return XML_TOK_TRAILING_CR;
1295 if (BYTE_TYPE(enc, ptr) == BT_LF)
1298 return XML_TOK_DATA_NEWLINE;
1301 return XML_TOK_DATA_CHARS;
1308 return XML_TOK_DATA_CHARS;
/* Scans an ignored conditional section in [ptr, end).
   When MINBPC(enc) > 1, a length that is not a multiple of MINBPC(enc)
   is rounded down with n &= ~(MINBPC(enc) - 1).
   The loop rejects invalid characters through INVALID_CASES and tracks
   two character sequences: ASCII_EXCL followed by ASCII_LSQB, and
   ASCII_RSQB followed by ASCII_GT.  The second one leads to the
   XML_TOK_IGNORE_SECT result.
   NOTE(review): the statements between these tests are not shown;
   presumably the first sequence opens a nested section and the second
   closes one -- confirm.
   XML_TOK_PARTIAL is returned when the input ends first. */
1314 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1315 const char *end, const char **nextTokPtr)
1318 if (MINBPC(enc) > 1) {
1319 size_t n = end - ptr;
1320 if (n & (MINBPC(enc) - 1)) {
1321 n &= ~(MINBPC(enc) - 1);
1325 while (HAS_CHAR(enc, ptr, end)) {
1326 switch (BYTE_TYPE(enc, ptr)) {
1327 INVALID_CASES(ptr, nextTokPtr)
1330 REQUIRE_CHAR(enc, ptr, end);
1331 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1333 REQUIRE_CHAR(enc, ptr, end);
1334 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1342 REQUIRE_CHAR(enc, ptr, end);
1343 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1345 REQUIRE_CHAR(enc, ptr, end);
1346 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1350 return XML_TOK_IGNORE_SECT;
1361 return XML_TOK_PARTIAL;
1364 #endif /* XML_DTD */
/* Checks the characters of a public identifier in [ptr, end), one
   MINBPC(enc) unit at a time, classifying each by BYTE_TYPE.
   The checks below single out ASCII_TAB, test whether the character
   lies outside the 7-bit range (BYTE_TO_ASCII(enc, ptr) & ~0x7f), and
   switch on the ASCII value.
   NOTE(review): the accepting and rejecting statements are not shown;
   presumably *badPtr receives the offending position and the result is
   0 or 1 -- confirm. */
1367 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1368 const char **badPtr)
1372 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1373 switch (BYTE_TYPE(enc, ptr)) {
1397 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1404 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1407 switch (BYTE_TO_ASCII(enc, ptr)) {
1421 /* This must only be called for a well-formed start-tag or empty
1422 element tag. Returns the number of attributes. Pointers to the
1423 first attsMax attributes are stored in atts.
1427 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428 int attsMax, ATTRIBUTE *atts)
1430 enum { other, inName, inValue } state = inName;
1432 int open = 0; /* defined when state == inValue;
1433 initialization just to shut up compilers */
/* Walk the tag one MINBPC(enc) unit at a time, starting after ptr.  The
   for statement has no condition of its own, so the loop is left from
   inside the body. */
1435 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436 switch (BYTE_TYPE(enc, ptr)) {
/* START_NAME: when a name begins (state == other) and there is still
   room in atts, record where it starts and assume the value is
   normalized until shown otherwise. */
1437 #define START_NAME \
1438 if (state == other) { \
1439 if (nAtts < attsMax) { \
1440 atts[nAtts].name = ptr; \
1441 atts[nAtts].normalized = 1; \
1445 #define LEAD_CASE(n) \
1446 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
/* A quote seen outside a value marks the start of the value (just past
   the quote); inside a value, a quote of the opening kind marks its end.
   Entries are only written while nAtts < attsMax. */
1456 if (state != inValue) {
1457 if (nAtts < attsMax)
1458 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1462 else if (open == BT_QUOT) {
1464 if (nAtts < attsMax)
1465 atts[nAtts].valueEnd = ptr;
1470 if (state != inValue) {
1471 if (nAtts < attsMax)
1472 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1476 else if (open == BT_APOS) {
1478 if (nAtts < attsMax)
1479 atts[nAtts].valueEnd = ptr;
1484 if (nAtts < attsMax)
1485 atts[nAtts].normalized = 0;
/* Within a value, the normalized flag is dropped when this character is
   the first of the value, is not ASCII_SPACE, is followed by another
   ASCII_SPACE, or is followed by a character of the closing quote type. */
1488 if (state == inName)
1490 else if (state == inValue
1492 && atts[nAtts].normalized
1493 && (ptr == atts[nAtts].valuePtr
1494 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497 atts[nAtts].normalized = 0;
1499 case BT_CR: case BT_LF:
1500 /* This case ensures that the first attribute name is counted
1501 Apart from that we could just change state on the quote. */
1502 if (state == inName)
1504 else if (state == inValue && nAtts < attsMax)
1505 atts[nAtts].normalized = 0;
1509 if (state != inValue)
/* Computes the numeric value of a character reference.
   ptr is first advanced by two MINBPC(enc) units.  If the next character
   is ASCII_x, the following characters up to ASCII_SEMI are accumulated
   as hexadecimal digits (0-9, A-F, a-f); otherwise the characters up to
   ASCII_SEMI are accumulated as decimal digits.  Both loops test
   result >= 0x110000 after each digit (the statement taken on overflow
   is not shown; presumably an error return -- TODO confirm).
   The final value is passed through checkCharRefNumber and its result is
   returned.
   The loops stop only at ASCII_SEMI, so the caller must supply a
   reference that contains one. */
1519 static int PTRFASTCALL
1520 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1524 ptr += 2*MINBPC(enc);
1525 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1526 for (ptr += MINBPC(enc);
1527 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1528 ptr += MINBPC(enc)) {
1529 int c = BYTE_TO_ASCII(enc, ptr);
1531 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1532 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1534 result |= (c - ASCII_0);
1536 case ASCII_A: case ASCII_B: case ASCII_C:
1537 case ASCII_D: case ASCII_E: case ASCII_F:
1539 result += 10 + (c - ASCII_A);
1541 case ASCII_a: case ASCII_b: case ASCII_c:
1542 case ASCII_d: case ASCII_e: case ASCII_f:
1544 result += 10 + (c - ASCII_a);
1547 if (result >= 0x110000)
1552 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1553 int c = BYTE_TO_ASCII(enc, ptr);
1555 result += (c - ASCII_0);
1556 if (result >= 0x110000)
1560 return checkCharRefNumber(result);
/* Recognises a predefined entity name by its length in characters,
   (end - ptr)/MINBPC(enc), and then by comparing characters one by one.
   The comparisons below cover: a two-character name whose second
   character is ASCII_t; the sequence a, m, p; and four-character names
   continuing with u, o, t or with p, o, s after a first character chosen
   by a switch on BYTE_TO_ASCII.
   NOTE(review): the values returned for each match and for "no match"
   are not shown -- confirm against the caller. */
1564 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1567 switch ((end - ptr)/MINBPC(enc)) {
1569 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1570 switch (BYTE_TO_ASCII(enc, ptr)) {
1579 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1581 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1583 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1589 switch (BYTE_TO_ASCII(enc, ptr)) {
1592 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1594 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1596 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1603 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1605 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1607 if (CHAR_MATCHES(enc, ptr, ASCII_s))
/* Compares the names starting at ptr1 and ptr2 byte by byte, driven by
   the byte type of the character at ptr1.
   The local LEAD_CASE is expanded in the order 4, 3, 2 and compares one
   byte per expansion.  For the remaining name characters, one byte is
   compared and then up to three more, depending on whether MINBPC(enc)
   exceeds 1, 2 and 3.
   When ptr1 reaches a non-name character, the test
   "MINBPC(enc) == 1 && *ptr1 == *ptr2" is applied before the byte type
   at ptr2 is examined.
   NOTE(review): the return statements are not shown; presumably the
   result is 1 for equal names and 0 otherwise -- confirm. */
1618 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1621 switch (BYTE_TYPE(enc, ptr1)) {
1622 #define LEAD_CASE(n) \
1623 case BT_LEAD ## n: \
1624 if (*ptr1++ != *ptr2++) \
1626 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1629 if (*ptr1++ != *ptr2++)
1641 if (*ptr2++ != *ptr1++)
1643 if (MINBPC(enc) > 1) {
1644 if (*ptr2++ != *ptr1++)
1646 if (MINBPC(enc) > 2) {
1647 if (*ptr2++ != *ptr1++)
1649 if (MINBPC(enc) > 3) {
1650 if (*ptr2++ != *ptr1++)
1657 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1659 switch (BYTE_TYPE(enc, ptr2)) {
/* Compares the encoded name in [ptr1, end1) with the NUL-terminated
   string ptr2.  ptr1 advances by MINBPC(enc) and ptr2 by one byte per
   character.  The loop checks that at least MINBPC(enc) bytes remain
   before end1 and that the current character matches *ptr2 (the
   statements taken on failure are not shown; presumably "return 0" --
   TODO confirm).  After all of ptr2 has matched, the result is non-zero
   only if ptr1 has reached end1 exactly. */
1682 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1683 const char *end1, const char *ptr2)
1685 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1686 if (end1 - ptr1 < MINBPC(enc))
1688 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1691 return ptr1 == end1;
/* Returns the length of the name starting at ptr as the distance
   (ptr - start), converted to int, after ptr has been advanced over the
   name.  Multi-unit characters advance ptr by n through the local
   LEAD_CASE; the handling of the other byte types is not shown. */
1694 static int PTRFASTCALL
1695 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1697 const char *start = ptr;
1699 switch (BYTE_TYPE(enc, ptr)) {
1700 #define LEAD_CASE(n) \
1701 case BT_LEAD ## n: ptr += n; break;
1702 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1716 return (int)(ptr - start);
/* Returns a pointer into the input, chosen by examining
   BYTE_TYPE(enc, ptr).
   NOTE(review): the switch body is not shown; presumably this skips
   leading whitespace and returns the first non-whitespace position --
   confirm. */
1721 static const char * PTRFASTCALL
1722 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1725 switch (BYTE_TYPE(enc, ptr)) {
/* Updates the position record pos while walking the input up to end.
   Effects that appear below: pos->columnNumber is set to (XML_Size)-1
   in two branches, one of which first tests whether the next character
   is BT_LF, and pos->columnNumber is incremented.
   NOTE(review): the remaining parameters and any line-number update are
   not shown; the (XML_Size)-1 assignments presumably pair with the
   increment so that the column restarts at 0 after a line break --
   confirm. */
1738 PREFIX(updatePosition)(const ENCODING *enc,
1743 while (HAS_CHAR(enc, ptr, end)) {
1744 switch (BYTE_TYPE(enc, ptr)) {
1745 #define LEAD_CASE(n) \
1746 case BT_LEAD ## n: \
1749 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1752 pos->columnNumber = (XML_Size)-1;
1759 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1761 pos->columnNumber = (XML_Size)-1;
1767 pos->columnNumber++;
/* Remove the helper macros so that this file, which is meant to be
   included rather than compiled on its own, leaves none of them behind.
   NOTE(review): MULTIBYTE_CASES has no #define in the text above --
   confirm whether this #undef is still needed. */
1772 #undef MULTIBYTE_CASES
1773 #undef INVALID_CASES
1774 #undef CHECK_NAME_CASE
1775 #undef CHECK_NAME_CASES
1776 #undef CHECK_NMSTRT_CASE
1777 #undef CHECK_NMSTRT_CASES
1779 #endif /* XML_TOK_IMPL_C */