1
/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2
See the file COPYING for copying permission.
5
#ifndef IS_INVALID_CHAR
6
#define IS_INVALID_CHAR(enc, ptr, n) (0)
9
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
12
return XML_TOK_PARTIAL_CHAR; \
13
if (IS_INVALID_CHAR(enc, ptr, n)) { \
14
*(nextTokPtr) = (ptr); \
15
return XML_TOK_INVALID; \
20
#define INVALID_CASES(ptr, nextTokPtr) \
21
INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
22
INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
23
INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
27
*(nextTokPtr) = (ptr); \
28
return XML_TOK_INVALID;
30
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
33
return XML_TOK_PARTIAL_CHAR; \
34
if (!IS_NAME_CHAR(enc, ptr, n)) { \
36
return XML_TOK_INVALID; \
41
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
43
if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
45
return XML_TOK_INVALID; \
54
CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
55
CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
56
CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
58
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
61
return XML_TOK_PARTIAL_CHAR; \
62
if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
64
return XML_TOK_INVALID; \
69
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
71
if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
73
return XML_TOK_INVALID; \
79
CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
80
CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
81
CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
84
#define PREFIX(ident) ident
87
/* ptr points to character following "<!-" */
90
PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
91
const char *end, const char **nextTokPtr)
94
if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
96
return XML_TOK_INVALID;
100
switch (BYTE_TYPE(enc, ptr)) {
101
INVALID_CASES(ptr, nextTokPtr)
103
if ((ptr += MINBPC(enc)) == end)
104
return XML_TOK_PARTIAL;
105
if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
106
if ((ptr += MINBPC(enc)) == end)
107
return XML_TOK_PARTIAL;
108
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
110
return XML_TOK_INVALID;
112
*nextTokPtr = ptr + MINBPC(enc);
113
return XML_TOK_COMMENT;
122
return XML_TOK_PARTIAL;
125
/* ptr points to character following "<!" */
128
PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
129
const char *end, const char **nextTokPtr)
132
return XML_TOK_PARTIAL;
133
switch (BYTE_TYPE(enc, ptr)) {
135
return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
137
*nextTokPtr = ptr + MINBPC(enc);
138
return XML_TOK_COND_SECT_OPEN;
145
return XML_TOK_INVALID;
148
switch (BYTE_TYPE(enc, ptr)) {
150
if (ptr + MINBPC(enc) == end)
151
return XML_TOK_PARTIAL;
152
/* don't allow <!ENTITY% foo "whatever"> */
153
switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
154
case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
156
return XML_TOK_INVALID;
159
case BT_S: case BT_CR: case BT_LF:
161
return XML_TOK_DECL_OPEN;
168
return XML_TOK_INVALID;
171
return XML_TOK_PARTIAL;
175
PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
176
const char *end, int *tokPtr)
179
*tokPtr = XML_TOK_PI;
180
if (end - ptr != MINBPC(enc)*3)
182
switch (BYTE_TO_ASCII(enc, ptr)) {
192
switch (BYTE_TO_ASCII(enc, ptr)) {
202
switch (BYTE_TO_ASCII(enc, ptr)) {
213
*tokPtr = XML_TOK_XML_DECL;
217
/* ptr points to character following "<?" */
220
PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
221
const char *end, const char **nextTokPtr)
224
const char *target = ptr;
226
return XML_TOK_PARTIAL;
227
switch (BYTE_TYPE(enc, ptr)) {
228
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
231
return XML_TOK_INVALID;
234
switch (BYTE_TYPE(enc, ptr)) {
235
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
236
case BT_S: case BT_CR: case BT_LF:
237
if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
239
return XML_TOK_INVALID;
243
switch (BYTE_TYPE(enc, ptr)) {
244
INVALID_CASES(ptr, nextTokPtr)
248
return XML_TOK_PARTIAL;
249
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
250
*nextTokPtr = ptr + MINBPC(enc);
259
return XML_TOK_PARTIAL;
261
if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
263
return XML_TOK_INVALID;
267
return XML_TOK_PARTIAL;
268
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
269
*nextTokPtr = ptr + MINBPC(enc);
275
return XML_TOK_INVALID;
278
return XML_TOK_PARTIAL;
282
PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
283
const char *end, const char **nextTokPtr)
285
static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
286
ASCII_T, ASCII_A, ASCII_LSQB };
289
if (end - ptr < 6 * MINBPC(enc))
290
return XML_TOK_PARTIAL;
291
for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
292
if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
294
return XML_TOK_INVALID;
298
return XML_TOK_CDATA_SECT_OPEN;
302
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
303
const char *end, const char **nextTokPtr)
307
if (MINBPC(enc) > 1) {
308
size_t n = end - ptr;
309
if (n & (MINBPC(enc) - 1)) {
310
n &= ~(MINBPC(enc) - 1);
312
return XML_TOK_PARTIAL;
316
switch (BYTE_TYPE(enc, ptr)) {
320
return XML_TOK_PARTIAL;
321
if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
325
return XML_TOK_PARTIAL;
326
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
330
*nextTokPtr = ptr + MINBPC(enc);
331
return XML_TOK_CDATA_SECT_CLOSE;
335
return XML_TOK_PARTIAL;
336
if (BYTE_TYPE(enc, ptr) == BT_LF)
339
return XML_TOK_DATA_NEWLINE;
341
*nextTokPtr = ptr + MINBPC(enc);
342
return XML_TOK_DATA_NEWLINE;
343
INVALID_CASES(ptr, nextTokPtr)
349
switch (BYTE_TYPE(enc, ptr)) {
350
#define LEAD_CASE(n) \
352
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
354
return XML_TOK_DATA_CHARS; \
358
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
367
return XML_TOK_DATA_CHARS;
374
return XML_TOK_DATA_CHARS;
377
/* ptr points to character following "</" */
380
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
381
const char *end, const char **nextTokPtr)
384
return XML_TOK_PARTIAL;
385
switch (BYTE_TYPE(enc, ptr)) {
386
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
389
return XML_TOK_INVALID;
392
switch (BYTE_TYPE(enc, ptr)) {
393
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
394
case BT_S: case BT_CR: case BT_LF:
395
for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
396
switch (BYTE_TYPE(enc, ptr)) {
397
case BT_S: case BT_CR: case BT_LF:
400
*nextTokPtr = ptr + MINBPC(enc);
401
return XML_TOK_END_TAG;
404
return XML_TOK_INVALID;
407
return XML_TOK_PARTIAL;
410
/* no need to check qname syntax here,
411
since end-tag must match exactly */
416
*nextTokPtr = ptr + MINBPC(enc);
417
return XML_TOK_END_TAG;
420
return XML_TOK_INVALID;
423
return XML_TOK_PARTIAL;
426
/* ptr points to character following "&#x" */
429
PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
430
const char *end, const char **nextTokPtr)
433
switch (BYTE_TYPE(enc, ptr)) {
439
return XML_TOK_INVALID;
441
for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
442
switch (BYTE_TYPE(enc, ptr)) {
447
*nextTokPtr = ptr + MINBPC(enc);
448
return XML_TOK_CHAR_REF;
451
return XML_TOK_INVALID;
455
return XML_TOK_PARTIAL;
458
/* ptr points to character following "&#" */
461
PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
462
const char *end, const char **nextTokPtr)
465
if (CHAR_MATCHES(enc, ptr, ASCII_x))
466
return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
467
switch (BYTE_TYPE(enc, ptr)) {
472
return XML_TOK_INVALID;
474
for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
475
switch (BYTE_TYPE(enc, ptr)) {
479
*nextTokPtr = ptr + MINBPC(enc);
480
return XML_TOK_CHAR_REF;
483
return XML_TOK_INVALID;
487
return XML_TOK_PARTIAL;
490
/* ptr points to character following "&" */
493
PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
494
const char **nextTokPtr)
497
return XML_TOK_PARTIAL;
498
switch (BYTE_TYPE(enc, ptr)) {
499
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
501
return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
504
return XML_TOK_INVALID;
507
switch (BYTE_TYPE(enc, ptr)) {
508
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
510
*nextTokPtr = ptr + MINBPC(enc);
511
return XML_TOK_ENTITY_REF;
514
return XML_TOK_INVALID;
517
return XML_TOK_PARTIAL;
520
/* ptr points to character following first character of attribute name */
523
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
524
const char **nextTokPtr)
530
switch (BYTE_TYPE(enc, ptr)) {
531
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
536
return XML_TOK_INVALID;
541
return XML_TOK_PARTIAL;
542
switch (BYTE_TYPE(enc, ptr)) {
543
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
546
return XML_TOK_INVALID;
550
case BT_S: case BT_CR: case BT_LF:
556
return XML_TOK_PARTIAL;
557
t = BYTE_TYPE(enc, ptr);
567
return XML_TOK_INVALID;
580
return XML_TOK_PARTIAL;
581
open = BYTE_TYPE(enc, ptr);
582
if (open == BT_QUOT || open == BT_APOS)
591
return XML_TOK_INVALID;
595
/* in attribute value */
599
return XML_TOK_PARTIAL;
600
t = BYTE_TYPE(enc, ptr);
604
INVALID_CASES(ptr, nextTokPtr)
607
int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
609
if (tok == XML_TOK_INVALID)
617
return XML_TOK_INVALID;
625
return XML_TOK_PARTIAL;
626
switch (BYTE_TYPE(enc, ptr)) {
637
return XML_TOK_INVALID;
639
/* ptr points to closing quote */
643
return XML_TOK_PARTIAL;
644
switch (BYTE_TYPE(enc, ptr)) {
645
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
646
case BT_S: case BT_CR: case BT_LF:
650
*nextTokPtr = ptr + MINBPC(enc);
651
return XML_TOK_START_TAG_WITH_ATTS;
656
return XML_TOK_PARTIAL;
657
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
659
return XML_TOK_INVALID;
661
*nextTokPtr = ptr + MINBPC(enc);
662
return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
665
return XML_TOK_INVALID;
673
return XML_TOK_INVALID;
676
return XML_TOK_PARTIAL;
679
/* ptr points to character following "<" */
682
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
683
const char **nextTokPtr)
689
return XML_TOK_PARTIAL;
690
switch (BYTE_TYPE(enc, ptr)) {
691
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
693
if ((ptr += MINBPC(enc)) == end)
694
return XML_TOK_PARTIAL;
695
switch (BYTE_TYPE(enc, ptr)) {
697
return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
699
return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
703
return XML_TOK_INVALID;
705
return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
707
return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710
return XML_TOK_INVALID;
715
/* we have a start-tag */
717
switch (BYTE_TYPE(enc, ptr)) {
718
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
723
return XML_TOK_INVALID;
728
return XML_TOK_PARTIAL;
729
switch (BYTE_TYPE(enc, ptr)) {
730
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
733
return XML_TOK_INVALID;
737
case BT_S: case BT_CR: case BT_LF:
741
switch (BYTE_TYPE(enc, ptr)) {
742
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
747
case BT_S: case BT_CR: case BT_LF:
752
return XML_TOK_INVALID;
754
return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
756
return XML_TOK_PARTIAL;
760
*nextTokPtr = ptr + MINBPC(enc);
761
return XML_TOK_START_TAG_NO_ATTS;
766
return XML_TOK_PARTIAL;
767
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
769
return XML_TOK_INVALID;
771
*nextTokPtr = ptr + MINBPC(enc);
772
return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
775
return XML_TOK_INVALID;
778
return XML_TOK_PARTIAL;
782
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783
const char **nextTokPtr)
787
if (MINBPC(enc) > 1) {
788
size_t n = end - ptr;
789
if (n & (MINBPC(enc) - 1)) {
790
n &= ~(MINBPC(enc) - 1);
792
return XML_TOK_PARTIAL;
796
switch (BYTE_TYPE(enc, ptr)) {
798
return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
800
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
804
return XML_TOK_TRAILING_CR;
805
if (BYTE_TYPE(enc, ptr) == BT_LF)
808
return XML_TOK_DATA_NEWLINE;
810
*nextTokPtr = ptr + MINBPC(enc);
811
return XML_TOK_DATA_NEWLINE;
815
return XML_TOK_TRAILING_RSQB;
816
if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
820
return XML_TOK_TRAILING_RSQB;
821
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
826
return XML_TOK_INVALID;
827
INVALID_CASES(ptr, nextTokPtr)
833
switch (BYTE_TYPE(enc, ptr)) {
834
#define LEAD_CASE(n) \
836
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
838
return XML_TOK_DATA_CHARS; \
842
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
845
if (ptr + MINBPC(enc) != end) {
846
if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
850
if (ptr + 2*MINBPC(enc) != end) {
851
if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
855
*nextTokPtr = ptr + 2*MINBPC(enc);
856
return XML_TOK_INVALID;
868
return XML_TOK_DATA_CHARS;
875
return XML_TOK_DATA_CHARS;
878
/* ptr points to character following "%" */
881
PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882
const char **nextTokPtr)
885
return -XML_TOK_PERCENT;
886
switch (BYTE_TYPE(enc, ptr)) {
887
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
888
case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
890
return XML_TOK_PERCENT;
893
return XML_TOK_INVALID;
896
switch (BYTE_TYPE(enc, ptr)) {
897
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
899
*nextTokPtr = ptr + MINBPC(enc);
900
return XML_TOK_PARAM_ENTITY_REF;
903
return XML_TOK_INVALID;
906
return XML_TOK_PARTIAL;
910
PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
911
const char **nextTokPtr)
914
return XML_TOK_PARTIAL;
915
switch (BYTE_TYPE(enc, ptr)) {
916
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
919
return XML_TOK_INVALID;
922
switch (BYTE_TYPE(enc, ptr)) {
923
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
924
case BT_CR: case BT_LF: case BT_S:
925
case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
927
return XML_TOK_POUND_NAME;
930
return XML_TOK_INVALID;
933
return -XML_TOK_POUND_NAME;
937
PREFIX(scanLit)(int open, const ENCODING *enc,
938
const char *ptr, const char *end,
939
const char **nextTokPtr)
942
int t = BYTE_TYPE(enc, ptr);
944
INVALID_CASES(ptr, nextTokPtr)
951
return -XML_TOK_LITERAL;
953
switch (BYTE_TYPE(enc, ptr)) {
954
case BT_S: case BT_CR: case BT_LF:
955
case BT_GT: case BT_PERCNT: case BT_LSQB:
956
return XML_TOK_LITERAL;
958
return XML_TOK_INVALID;
965
return XML_TOK_PARTIAL;
969
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
970
const char **nextTokPtr)
975
if (MINBPC(enc) > 1) {
976
size_t n = end - ptr;
977
if (n & (MINBPC(enc) - 1)) {
978
n &= ~(MINBPC(enc) - 1);
980
return XML_TOK_PARTIAL;
984
switch (BYTE_TYPE(enc, ptr)) {
986
return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
988
return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
993
return XML_TOK_PARTIAL;
994
switch (BYTE_TYPE(enc, ptr)) {
996
return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
998
return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1005
*nextTokPtr = ptr - MINBPC(enc);
1006
return XML_TOK_INSTANCE_START;
1009
return XML_TOK_INVALID;
1012
if (ptr + MINBPC(enc) == end) {
1014
/* indicate that this might be part of a CR/LF pair */
1015
return -XML_TOK_PROLOG_S;
1018
case BT_S: case BT_LF:
1023
switch (BYTE_TYPE(enc, ptr)) {
1024
case BT_S: case BT_LF:
1027
/* don't split CR/LF pair */
1028
if (ptr + MINBPC(enc) != end)
1033
return XML_TOK_PROLOG_S;
1037
return XML_TOK_PROLOG_S;
1039
return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1041
*nextTokPtr = ptr + MINBPC(enc);
1042
return XML_TOK_COMMA;
1044
*nextTokPtr = ptr + MINBPC(enc);
1045
return XML_TOK_OPEN_BRACKET;
1049
return -XML_TOK_CLOSE_BRACKET;
1050
if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1051
if (ptr + MINBPC(enc) == end)
1052
return XML_TOK_PARTIAL;
1053
if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1054
*nextTokPtr = ptr + 2*MINBPC(enc);
1055
return XML_TOK_COND_SECT_CLOSE;
1059
return XML_TOK_CLOSE_BRACKET;
1061
*nextTokPtr = ptr + MINBPC(enc);
1062
return XML_TOK_OPEN_PAREN;
1066
return -XML_TOK_CLOSE_PAREN;
1067
switch (BYTE_TYPE(enc, ptr)) {
1069
*nextTokPtr = ptr + MINBPC(enc);
1070
return XML_TOK_CLOSE_PAREN_ASTERISK;
1072
*nextTokPtr = ptr + MINBPC(enc);
1073
return XML_TOK_CLOSE_PAREN_QUESTION;
1075
*nextTokPtr = ptr + MINBPC(enc);
1076
return XML_TOK_CLOSE_PAREN_PLUS;
1077
case BT_CR: case BT_LF: case BT_S:
1078
case BT_GT: case BT_COMMA: case BT_VERBAR:
1081
return XML_TOK_CLOSE_PAREN;
1084
return XML_TOK_INVALID;
1086
*nextTokPtr = ptr + MINBPC(enc);
1089
*nextTokPtr = ptr + MINBPC(enc);
1090
return XML_TOK_DECL_CLOSE;
1092
return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1093
#define LEAD_CASE(n) \
1094
case BT_LEAD ## n: \
1095
if (end - ptr < n) \
1096
return XML_TOK_PARTIAL_CHAR; \
1097
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1099
tok = XML_TOK_NAME; \
1102
if (IS_NAME_CHAR(enc, ptr, n)) { \
1104
tok = XML_TOK_NMTOKEN; \
1107
*nextTokPtr = ptr; \
1108
return XML_TOK_INVALID;
1109
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1122
tok = XML_TOK_NMTOKEN;
1126
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1131
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1133
tok = XML_TOK_NMTOKEN;
1139
return XML_TOK_INVALID;
1141
while (ptr != end) {
1142
switch (BYTE_TYPE(enc, ptr)) {
1143
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1144
case BT_GT: case BT_RPAR: case BT_COMMA:
1145
case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1146
case BT_S: case BT_CR: case BT_LF:
1155
return XML_TOK_PARTIAL;
1156
tok = XML_TOK_PREFIXED_NAME;
1157
switch (BYTE_TYPE(enc, ptr)) {
1158
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1160
tok = XML_TOK_NMTOKEN;
1164
case XML_TOK_PREFIXED_NAME:
1165
tok = XML_TOK_NMTOKEN;
1171
if (tok == XML_TOK_NMTOKEN) {
1173
return XML_TOK_INVALID;
1175
*nextTokPtr = ptr + MINBPC(enc);
1176
return XML_TOK_NAME_PLUS;
1178
if (tok == XML_TOK_NMTOKEN) {
1180
return XML_TOK_INVALID;
1182
*nextTokPtr = ptr + MINBPC(enc);
1183
return XML_TOK_NAME_ASTERISK;
1185
if (tok == XML_TOK_NMTOKEN) {
1187
return XML_TOK_INVALID;
1189
*nextTokPtr = ptr + MINBPC(enc);
1190
return XML_TOK_NAME_QUESTION;
1193
return XML_TOK_INVALID;
1200
PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1201
const char *end, const char **nextTokPtr)
1205
return XML_TOK_NONE;
1207
while (ptr != end) {
1208
switch (BYTE_TYPE(enc, ptr)) {
1209
#define LEAD_CASE(n) \
1210
case BT_LEAD ## n: ptr += n; break;
1211
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1215
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1217
return XML_TOK_DATA_CHARS;
1219
/* this is for inside entity references */
1221
return XML_TOK_INVALID;
1224
*nextTokPtr = ptr + MINBPC(enc);
1225
return XML_TOK_DATA_NEWLINE;
1228
return XML_TOK_DATA_CHARS;
1233
return XML_TOK_TRAILING_CR;
1234
if (BYTE_TYPE(enc, ptr) == BT_LF)
1237
return XML_TOK_DATA_NEWLINE;
1240
return XML_TOK_DATA_CHARS;
1243
*nextTokPtr = ptr + MINBPC(enc);
1244
return XML_TOK_ATTRIBUTE_VALUE_S;
1247
return XML_TOK_DATA_CHARS;
1254
return XML_TOK_DATA_CHARS;
1258
PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1259
const char *end, const char **nextTokPtr)
1263
return XML_TOK_NONE;
1265
while (ptr != end) {
1266
switch (BYTE_TYPE(enc, ptr)) {
1267
#define LEAD_CASE(n) \
1268
case BT_LEAD ## n: ptr += n; break;
1269
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1273
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1275
return XML_TOK_DATA_CHARS;
1278
int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1280
return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1283
return XML_TOK_DATA_CHARS;
1286
*nextTokPtr = ptr + MINBPC(enc);
1287
return XML_TOK_DATA_NEWLINE;
1290
return XML_TOK_DATA_CHARS;
1295
return XML_TOK_TRAILING_CR;
1296
if (BYTE_TYPE(enc, ptr) == BT_LF)
1299
return XML_TOK_DATA_NEWLINE;
1302
return XML_TOK_DATA_CHARS;
1309
return XML_TOK_DATA_CHARS;
1315
PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1316
const char *end, const char **nextTokPtr)
1319
if (MINBPC(enc) > 1) {
1320
size_t n = end - ptr;
1321
if (n & (MINBPC(enc) - 1)) {
1322
n &= ~(MINBPC(enc) - 1);
1326
while (ptr != end) {
1327
switch (BYTE_TYPE(enc, ptr)) {
1328
INVALID_CASES(ptr, nextTokPtr)
1330
if ((ptr += MINBPC(enc)) == end)
1331
return XML_TOK_PARTIAL;
1332
if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1333
if ((ptr += MINBPC(enc)) == end)
1334
return XML_TOK_PARTIAL;
1335
if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1342
if ((ptr += MINBPC(enc)) == end)
1343
return XML_TOK_PARTIAL;
1344
if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1345
if ((ptr += MINBPC(enc)) == end)
1346
return XML_TOK_PARTIAL;
1347
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1351
return XML_TOK_IGNORE_SECT;
1362
return XML_TOK_PARTIAL;
1365
#endif /* XML_DTD */
1368
PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1369
const char **badPtr)
1373
for (; ptr != end; ptr += MINBPC(enc)) {
1374
switch (BYTE_TYPE(enc, ptr)) {
1398
if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1405
if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1408
switch (BYTE_TO_ASCII(enc, ptr)) {
1422
/* This must only be called for a well-formed start-tag or empty
1423
element tag. Returns the number of attributes. Pointers to the
1424
first attsMax attributes are stored in atts.
1428
PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1429
int attsMax, ATTRIBUTE *atts)
1431
enum { other, inName, inValue } state = inName;
1433
int open = 0; /* defined when state == inValue;
1434
initialization just to shut up compilers */
1436
for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1437
switch (BYTE_TYPE(enc, ptr)) {
1438
#define START_NAME \
1439
if (state == other) { \
1440
if (nAtts < attsMax) { \
1441
atts[nAtts].name = ptr; \
1442
atts[nAtts].normalized = 1; \
1446
#define LEAD_CASE(n) \
1447
case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1448
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1457
if (state != inValue) {
1458
if (nAtts < attsMax)
1459
atts[nAtts].valuePtr = ptr + MINBPC(enc);
1463
else if (open == BT_QUOT) {
1465
if (nAtts < attsMax)
1466
atts[nAtts].valueEnd = ptr;
1471
if (state != inValue) {
1472
if (nAtts < attsMax)
1473
atts[nAtts].valuePtr = ptr + MINBPC(enc);
1477
else if (open == BT_APOS) {
1479
if (nAtts < attsMax)
1480
atts[nAtts].valueEnd = ptr;
1485
if (nAtts < attsMax)
1486
atts[nAtts].normalized = 0;
1489
if (state == inName)
1491
else if (state == inValue
1493
&& atts[nAtts].normalized
1494
&& (ptr == atts[nAtts].valuePtr
1495
|| BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1496
|| BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1497
|| BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1498
atts[nAtts].normalized = 0;
1500
case BT_CR: case BT_LF:
1501
/* This case ensures that the first attribute name is counted.
1502
Apart from that we could just change state on the quote. */
1503
if (state == inName)
1505
else if (state == inValue && nAtts < attsMax)
1506
atts[nAtts].normalized = 0;
1510
if (state != inValue)
1520
static int PTRFASTCALL
1521
PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1525
ptr += 2*MINBPC(enc);
1526
if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1527
for (ptr += MINBPC(enc);
1528
!CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1529
ptr += MINBPC(enc)) {
1530
int c = BYTE_TO_ASCII(enc, ptr);
1532
case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1533
case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1535
result |= (c - ASCII_0);
1537
case ASCII_A: case ASCII_B: case ASCII_C:
1538
case ASCII_D: case ASCII_E: case ASCII_F:
1540
result += 10 + (c - ASCII_A);
1542
case ASCII_a: case ASCII_b: case ASCII_c:
1543
case ASCII_d: case ASCII_e: case ASCII_f:
1545
result += 10 + (c - ASCII_a);
1548
if (result >= 0x110000)
1553
for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1554
int c = BYTE_TO_ASCII(enc, ptr);
1556
result += (c - ASCII_0);
1557
if (result >= 0x110000)
1561
return checkCharRefNumber(result);
1565
PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1568
switch ((end - ptr)/MINBPC(enc)) {
1570
if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1571
switch (BYTE_TO_ASCII(enc, ptr)) {
1580
if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1582
if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1584
if (CHAR_MATCHES(enc, ptr, ASCII_p))
1590
switch (BYTE_TO_ASCII(enc, ptr)) {
1593
if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1595
if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1597
if (CHAR_MATCHES(enc, ptr, ASCII_t))
1604
if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1606
if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1608
if (CHAR_MATCHES(enc, ptr, ASCII_s))
1619
PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1622
switch (BYTE_TYPE(enc, ptr1)) {
1623
#define LEAD_CASE(n) \
1624
case BT_LEAD ## n: \
1625
if (*ptr1++ != *ptr2++) \
1627
LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1630
if (*ptr1++ != *ptr2++)
1642
if (*ptr2++ != *ptr1++)
1644
if (MINBPC(enc) > 1) {
1645
if (*ptr2++ != *ptr1++)
1647
if (MINBPC(enc) > 2) {
1648
if (*ptr2++ != *ptr1++)
1650
if (MINBPC(enc) > 3) {
1651
if (*ptr2++ != *ptr1++)
1658
if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1660
switch (BYTE_TYPE(enc, ptr2)) {
1683
PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1684
const char *end1, const char *ptr2)
1686
for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1689
if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1692
return ptr1 == end1;
1695
static int PTRFASTCALL
1696
PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1698
const char *start = ptr;
1700
switch (BYTE_TYPE(enc, ptr)) {
1701
#define LEAD_CASE(n) \
1702
case BT_LEAD ## n: ptr += n; break;
1703
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1722
static const char * PTRFASTCALL
1723
PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1726
switch (BYTE_TYPE(enc, ptr)) {
1739
PREFIX(updatePosition)(const ENCODING *enc,
1744
while (ptr != end) {
1745
switch (BYTE_TYPE(enc, ptr)) {
1746
#define LEAD_CASE(n) \
1747
case BT_LEAD ## n: \
1750
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1753
pos->columnNumber = (unsigned)-1;
1760
if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1762
pos->columnNumber = (unsigned)-1;
1768
pos->columnNumber++;
1773
#undef MULTIBYTE_CASES
1774
#undef INVALID_CASES
1775
#undef CHECK_NAME_CASE
1776
#undef CHECK_NAME_CASES
1777
#undef CHECK_NMSTRT_CASE
1778
#undef CHECK_NMSTRT_CASES