1
/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2
See the file COPYING for copying permission.
5
/* This file is included! */
8
#ifndef IS_INVALID_CHAR
9
#define IS_INVALID_CHAR(enc, ptr, n) (0)
12
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
15
return XML_TOK_PARTIAL_CHAR; \
16
if (IS_INVALID_CHAR(enc, ptr, n)) { \
17
*(nextTokPtr) = (ptr); \
18
return XML_TOK_INVALID; \
23
#define INVALID_CASES(ptr, nextTokPtr) \
24
INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
25
INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
26
INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
30
*(nextTokPtr) = (ptr); \
31
return XML_TOK_INVALID;
33
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
36
return XML_TOK_PARTIAL_CHAR; \
37
if (!IS_NAME_CHAR(enc, ptr, n)) { \
39
return XML_TOK_INVALID; \
44
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
46
if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
48
return XML_TOK_INVALID; \
57
CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
58
CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
59
CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
61
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
64
return XML_TOK_PARTIAL_CHAR; \
65
if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
67
return XML_TOK_INVALID; \
72
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
74
if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
76
return XML_TOK_INVALID; \
82
CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
83
CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
84
CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
87
#define PREFIX(ident) ident
90
/* ptr points to character following "<!-" */
93
PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
94
const char *end, const char **nextTokPtr)
97
if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
99
return XML_TOK_INVALID;
103
switch (BYTE_TYPE(enc, ptr)) {
104
INVALID_CASES(ptr, nextTokPtr)
106
if ((ptr += MINBPC(enc)) == end)
107
return XML_TOK_PARTIAL;
108
if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
109
if ((ptr += MINBPC(enc)) == end)
110
return XML_TOK_PARTIAL;
111
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
113
return XML_TOK_INVALID;
115
*nextTokPtr = ptr + MINBPC(enc);
116
return XML_TOK_COMMENT;
125
return XML_TOK_PARTIAL;
128
/* ptr points to character following "<!" */
131
PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
132
const char *end, const char **nextTokPtr)
135
return XML_TOK_PARTIAL;
136
switch (BYTE_TYPE(enc, ptr)) {
138
return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
140
*nextTokPtr = ptr + MINBPC(enc);
141
return XML_TOK_COND_SECT_OPEN;
148
return XML_TOK_INVALID;
151
switch (BYTE_TYPE(enc, ptr)) {
153
if (ptr + MINBPC(enc) == end)
154
return XML_TOK_PARTIAL;
155
/* don't allow <!ENTITY% foo "whatever"> */
156
switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
157
case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
159
return XML_TOK_INVALID;
162
case BT_S: case BT_CR: case BT_LF:
164
return XML_TOK_DECL_OPEN;
171
return XML_TOK_INVALID;
174
return XML_TOK_PARTIAL;
178
PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
179
const char *end, int *tokPtr)
182
*tokPtr = XML_TOK_PI;
183
if (end - ptr != MINBPC(enc)*3)
185
switch (BYTE_TO_ASCII(enc, ptr)) {
195
switch (BYTE_TO_ASCII(enc, ptr)) {
205
switch (BYTE_TO_ASCII(enc, ptr)) {
216
*tokPtr = XML_TOK_XML_DECL;
220
/* ptr points to character following "<?" */
223
PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
224
const char *end, const char **nextTokPtr)
227
const char *target = ptr;
229
return XML_TOK_PARTIAL;
230
switch (BYTE_TYPE(enc, ptr)) {
231
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
234
return XML_TOK_INVALID;
237
switch (BYTE_TYPE(enc, ptr)) {
238
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
239
case BT_S: case BT_CR: case BT_LF:
240
if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
242
return XML_TOK_INVALID;
246
switch (BYTE_TYPE(enc, ptr)) {
247
INVALID_CASES(ptr, nextTokPtr)
251
return XML_TOK_PARTIAL;
252
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
253
*nextTokPtr = ptr + MINBPC(enc);
262
return XML_TOK_PARTIAL;
264
if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
266
return XML_TOK_INVALID;
270
return XML_TOK_PARTIAL;
271
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
272
*nextTokPtr = ptr + MINBPC(enc);
278
return XML_TOK_INVALID;
281
return XML_TOK_PARTIAL;
285
PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
286
const char *end, const char **nextTokPtr)
288
static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
289
ASCII_T, ASCII_A, ASCII_LSQB };
292
if (end - ptr < 6 * MINBPC(enc))
293
return XML_TOK_PARTIAL;
294
for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
295
if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
297
return XML_TOK_INVALID;
301
return XML_TOK_CDATA_SECT_OPEN;
305
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
306
const char *end, const char **nextTokPtr)
310
if (MINBPC(enc) > 1) {
311
size_t n = end - ptr;
312
if (n & (MINBPC(enc) - 1)) {
313
n &= ~(MINBPC(enc) - 1);
315
return XML_TOK_PARTIAL;
319
switch (BYTE_TYPE(enc, ptr)) {
323
return XML_TOK_PARTIAL;
324
if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
328
return XML_TOK_PARTIAL;
329
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
333
*nextTokPtr = ptr + MINBPC(enc);
334
return XML_TOK_CDATA_SECT_CLOSE;
338
return XML_TOK_PARTIAL;
339
if (BYTE_TYPE(enc, ptr) == BT_LF)
342
return XML_TOK_DATA_NEWLINE;
344
*nextTokPtr = ptr + MINBPC(enc);
345
return XML_TOK_DATA_NEWLINE;
346
INVALID_CASES(ptr, nextTokPtr)
352
switch (BYTE_TYPE(enc, ptr)) {
353
#define LEAD_CASE(n) \
355
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
357
return XML_TOK_DATA_CHARS; \
361
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
370
return XML_TOK_DATA_CHARS;
377
return XML_TOK_DATA_CHARS;
380
/* ptr points to character following "</" */
383
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
384
const char *end, const char **nextTokPtr)
387
return XML_TOK_PARTIAL;
388
switch (BYTE_TYPE(enc, ptr)) {
389
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
392
return XML_TOK_INVALID;
395
switch (BYTE_TYPE(enc, ptr)) {
396
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
397
case BT_S: case BT_CR: case BT_LF:
398
for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
399
switch (BYTE_TYPE(enc, ptr)) {
400
case BT_S: case BT_CR: case BT_LF:
403
*nextTokPtr = ptr + MINBPC(enc);
404
return XML_TOK_END_TAG;
407
return XML_TOK_INVALID;
410
return XML_TOK_PARTIAL;
413
/* no need to check qname syntax here,
414
since end-tag must match exactly */
419
*nextTokPtr = ptr + MINBPC(enc);
420
return XML_TOK_END_TAG;
423
return XML_TOK_INVALID;
426
return XML_TOK_PARTIAL;
429
/* ptr points to character following "&#X" */
432
PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
433
const char *end, const char **nextTokPtr)
436
switch (BYTE_TYPE(enc, ptr)) {
442
return XML_TOK_INVALID;
444
for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
445
switch (BYTE_TYPE(enc, ptr)) {
450
*nextTokPtr = ptr + MINBPC(enc);
451
return XML_TOK_CHAR_REF;
454
return XML_TOK_INVALID;
458
return XML_TOK_PARTIAL;
461
/* ptr points to character following "&#" */
464
PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
465
const char *end, const char **nextTokPtr)
468
if (CHAR_MATCHES(enc, ptr, ASCII_x))
469
return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
470
switch (BYTE_TYPE(enc, ptr)) {
475
return XML_TOK_INVALID;
477
for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
478
switch (BYTE_TYPE(enc, ptr)) {
482
*nextTokPtr = ptr + MINBPC(enc);
483
return XML_TOK_CHAR_REF;
486
return XML_TOK_INVALID;
490
return XML_TOK_PARTIAL;
493
/* ptr points to character following "&" */
496
PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
497
const char **nextTokPtr)
500
return XML_TOK_PARTIAL;
501
switch (BYTE_TYPE(enc, ptr)) {
502
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
504
return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
507
return XML_TOK_INVALID;
510
switch (BYTE_TYPE(enc, ptr)) {
511
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
513
*nextTokPtr = ptr + MINBPC(enc);
514
return XML_TOK_ENTITY_REF;
517
return XML_TOK_INVALID;
520
return XML_TOK_PARTIAL;
523
/* ptr points to character following first character of attribute name */
526
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
527
const char **nextTokPtr)
533
switch (BYTE_TYPE(enc, ptr)) {
534
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
539
return XML_TOK_INVALID;
544
return XML_TOK_PARTIAL;
545
switch (BYTE_TYPE(enc, ptr)) {
546
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
549
return XML_TOK_INVALID;
553
case BT_S: case BT_CR: case BT_LF:
559
return XML_TOK_PARTIAL;
560
t = BYTE_TYPE(enc, ptr);
570
return XML_TOK_INVALID;
583
return XML_TOK_PARTIAL;
584
open = BYTE_TYPE(enc, ptr);
585
if (open == BT_QUOT || open == BT_APOS)
594
return XML_TOK_INVALID;
598
/* in attribute value */
602
return XML_TOK_PARTIAL;
603
t = BYTE_TYPE(enc, ptr);
607
INVALID_CASES(ptr, nextTokPtr)
610
int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
612
if (tok == XML_TOK_INVALID)
620
return XML_TOK_INVALID;
628
return XML_TOK_PARTIAL;
629
switch (BYTE_TYPE(enc, ptr)) {
640
return XML_TOK_INVALID;
642
/* ptr points to closing quote */
646
return XML_TOK_PARTIAL;
647
switch (BYTE_TYPE(enc, ptr)) {
648
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
649
case BT_S: case BT_CR: case BT_LF:
653
*nextTokPtr = ptr + MINBPC(enc);
654
return XML_TOK_START_TAG_WITH_ATTS;
659
return XML_TOK_PARTIAL;
660
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
662
return XML_TOK_INVALID;
664
*nextTokPtr = ptr + MINBPC(enc);
665
return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
668
return XML_TOK_INVALID;
676
return XML_TOK_INVALID;
679
return XML_TOK_PARTIAL;
682
/* ptr points to character following "<" */
685
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686
const char **nextTokPtr)
692
return XML_TOK_PARTIAL;
693
switch (BYTE_TYPE(enc, ptr)) {
694
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
696
if ((ptr += MINBPC(enc)) == end)
697
return XML_TOK_PARTIAL;
698
switch (BYTE_TYPE(enc, ptr)) {
700
return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
702
return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
706
return XML_TOK_INVALID;
708
return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710
return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
713
return XML_TOK_INVALID;
718
/* we have a start-tag */
720
switch (BYTE_TYPE(enc, ptr)) {
721
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
726
return XML_TOK_INVALID;
731
return XML_TOK_PARTIAL;
732
switch (BYTE_TYPE(enc, ptr)) {
733
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
736
return XML_TOK_INVALID;
740
case BT_S: case BT_CR: case BT_LF:
744
switch (BYTE_TYPE(enc, ptr)) {
745
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
750
case BT_S: case BT_CR: case BT_LF:
755
return XML_TOK_INVALID;
757
return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
759
return XML_TOK_PARTIAL;
763
*nextTokPtr = ptr + MINBPC(enc);
764
return XML_TOK_START_TAG_NO_ATTS;
769
return XML_TOK_PARTIAL;
770
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
772
return XML_TOK_INVALID;
774
*nextTokPtr = ptr + MINBPC(enc);
775
return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
778
return XML_TOK_INVALID;
781
return XML_TOK_PARTIAL;
785
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
786
const char **nextTokPtr)
790
if (MINBPC(enc) > 1) {
791
size_t n = end - ptr;
792
if (n & (MINBPC(enc) - 1)) {
793
n &= ~(MINBPC(enc) - 1);
795
return XML_TOK_PARTIAL;
799
switch (BYTE_TYPE(enc, ptr)) {
801
return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
803
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
807
return XML_TOK_TRAILING_CR;
808
if (BYTE_TYPE(enc, ptr) == BT_LF)
811
return XML_TOK_DATA_NEWLINE;
813
*nextTokPtr = ptr + MINBPC(enc);
814
return XML_TOK_DATA_NEWLINE;
818
return XML_TOK_TRAILING_RSQB;
819
if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
823
return XML_TOK_TRAILING_RSQB;
824
if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
829
return XML_TOK_INVALID;
830
INVALID_CASES(ptr, nextTokPtr)
836
switch (BYTE_TYPE(enc, ptr)) {
837
#define LEAD_CASE(n) \
839
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
841
return XML_TOK_DATA_CHARS; \
845
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
848
if (ptr + MINBPC(enc) != end) {
849
if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
853
if (ptr + 2*MINBPC(enc) != end) {
854
if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
858
*nextTokPtr = ptr + 2*MINBPC(enc);
859
return XML_TOK_INVALID;
871
return XML_TOK_DATA_CHARS;
878
return XML_TOK_DATA_CHARS;
881
/* ptr points to character following "%" */
884
PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
885
const char **nextTokPtr)
888
return -XML_TOK_PERCENT;
889
switch (BYTE_TYPE(enc, ptr)) {
890
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
891
case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
893
return XML_TOK_PERCENT;
896
return XML_TOK_INVALID;
899
switch (BYTE_TYPE(enc, ptr)) {
900
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
902
*nextTokPtr = ptr + MINBPC(enc);
903
return XML_TOK_PARAM_ENTITY_REF;
906
return XML_TOK_INVALID;
909
return XML_TOK_PARTIAL;
913
PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
914
const char **nextTokPtr)
917
return XML_TOK_PARTIAL;
918
switch (BYTE_TYPE(enc, ptr)) {
919
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
922
return XML_TOK_INVALID;
925
switch (BYTE_TYPE(enc, ptr)) {
926
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
927
case BT_CR: case BT_LF: case BT_S:
928
case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
930
return XML_TOK_POUND_NAME;
933
return XML_TOK_INVALID;
936
return -XML_TOK_POUND_NAME;
940
PREFIX(scanLit)(int open, const ENCODING *enc,
941
const char *ptr, const char *end,
942
const char **nextTokPtr)
945
int t = BYTE_TYPE(enc, ptr);
947
INVALID_CASES(ptr, nextTokPtr)
954
return -XML_TOK_LITERAL;
956
switch (BYTE_TYPE(enc, ptr)) {
957
case BT_S: case BT_CR: case BT_LF:
958
case BT_GT: case BT_PERCNT: case BT_LSQB:
959
return XML_TOK_LITERAL;
961
return XML_TOK_INVALID;
968
return XML_TOK_PARTIAL;
972
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
973
const char **nextTokPtr)
978
if (MINBPC(enc) > 1) {
979
size_t n = end - ptr;
980
if (n & (MINBPC(enc) - 1)) {
981
n &= ~(MINBPC(enc) - 1);
983
return XML_TOK_PARTIAL;
987
switch (BYTE_TYPE(enc, ptr)) {
989
return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
991
return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
996
return XML_TOK_PARTIAL;
997
switch (BYTE_TYPE(enc, ptr)) {
999
return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1001
return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1008
*nextTokPtr = ptr - MINBPC(enc);
1009
return XML_TOK_INSTANCE_START;
1012
return XML_TOK_INVALID;
1015
if (ptr + MINBPC(enc) == end) {
1017
/* indicate that this might be part of a CR/LF pair */
1018
return -XML_TOK_PROLOG_S;
1021
case BT_S: case BT_LF:
1026
switch (BYTE_TYPE(enc, ptr)) {
1027
case BT_S: case BT_LF:
1030
/* don't split CR/LF pair */
1031
if (ptr + MINBPC(enc) != end)
1036
return XML_TOK_PROLOG_S;
1040
return XML_TOK_PROLOG_S;
1042
return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1044
*nextTokPtr = ptr + MINBPC(enc);
1045
return XML_TOK_COMMA;
1047
*nextTokPtr = ptr + MINBPC(enc);
1048
return XML_TOK_OPEN_BRACKET;
1052
return -XML_TOK_CLOSE_BRACKET;
1053
if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1054
if (ptr + MINBPC(enc) == end)
1055
return XML_TOK_PARTIAL;
1056
if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1057
*nextTokPtr = ptr + 2*MINBPC(enc);
1058
return XML_TOK_COND_SECT_CLOSE;
1062
return XML_TOK_CLOSE_BRACKET;
1064
*nextTokPtr = ptr + MINBPC(enc);
1065
return XML_TOK_OPEN_PAREN;
1069
return -XML_TOK_CLOSE_PAREN;
1070
switch (BYTE_TYPE(enc, ptr)) {
1072
*nextTokPtr = ptr + MINBPC(enc);
1073
return XML_TOK_CLOSE_PAREN_ASTERISK;
1075
*nextTokPtr = ptr + MINBPC(enc);
1076
return XML_TOK_CLOSE_PAREN_QUESTION;
1078
*nextTokPtr = ptr + MINBPC(enc);
1079
return XML_TOK_CLOSE_PAREN_PLUS;
1080
case BT_CR: case BT_LF: case BT_S:
1081
case BT_GT: case BT_COMMA: case BT_VERBAR:
1084
return XML_TOK_CLOSE_PAREN;
1087
return XML_TOK_INVALID;
1089
*nextTokPtr = ptr + MINBPC(enc);
1092
*nextTokPtr = ptr + MINBPC(enc);
1093
return XML_TOK_DECL_CLOSE;
1095
return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1096
#define LEAD_CASE(n) \
1097
case BT_LEAD ## n: \
1098
if (end - ptr < n) \
1099
return XML_TOK_PARTIAL_CHAR; \
1100
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1102
tok = XML_TOK_NAME; \
1105
if (IS_NAME_CHAR(enc, ptr, n)) { \
1107
tok = XML_TOK_NMTOKEN; \
1110
*nextTokPtr = ptr; \
1111
return XML_TOK_INVALID;
1112
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1125
tok = XML_TOK_NMTOKEN;
1129
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1134
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1136
tok = XML_TOK_NMTOKEN;
1142
return XML_TOK_INVALID;
1144
while (ptr != end) {
1145
switch (BYTE_TYPE(enc, ptr)) {
1146
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1147
case BT_GT: case BT_RPAR: case BT_COMMA:
1148
case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1149
case BT_S: case BT_CR: case BT_LF:
1158
return XML_TOK_PARTIAL;
1159
tok = XML_TOK_PREFIXED_NAME;
1160
switch (BYTE_TYPE(enc, ptr)) {
1161
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1163
tok = XML_TOK_NMTOKEN;
1167
case XML_TOK_PREFIXED_NAME:
1168
tok = XML_TOK_NMTOKEN;
1174
if (tok == XML_TOK_NMTOKEN) {
1176
return XML_TOK_INVALID;
1178
*nextTokPtr = ptr + MINBPC(enc);
1179
return XML_TOK_NAME_PLUS;
1181
if (tok == XML_TOK_NMTOKEN) {
1183
return XML_TOK_INVALID;
1185
*nextTokPtr = ptr + MINBPC(enc);
1186
return XML_TOK_NAME_ASTERISK;
1188
if (tok == XML_TOK_NMTOKEN) {
1190
return XML_TOK_INVALID;
1192
*nextTokPtr = ptr + MINBPC(enc);
1193
return XML_TOK_NAME_QUESTION;
1196
return XML_TOK_INVALID;
1203
PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1204
const char *end, const char **nextTokPtr)
1208
return XML_TOK_NONE;
1210
while (ptr != end) {
1211
switch (BYTE_TYPE(enc, ptr)) {
1212
#define LEAD_CASE(n) \
1213
case BT_LEAD ## n: ptr += n; break;
1214
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1218
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1220
return XML_TOK_DATA_CHARS;
1222
/* this is for inside entity references */
1224
return XML_TOK_INVALID;
1227
*nextTokPtr = ptr + MINBPC(enc);
1228
return XML_TOK_DATA_NEWLINE;
1231
return XML_TOK_DATA_CHARS;
1236
return XML_TOK_TRAILING_CR;
1237
if (BYTE_TYPE(enc, ptr) == BT_LF)
1240
return XML_TOK_DATA_NEWLINE;
1243
return XML_TOK_DATA_CHARS;
1246
*nextTokPtr = ptr + MINBPC(enc);
1247
return XML_TOK_ATTRIBUTE_VALUE_S;
1250
return XML_TOK_DATA_CHARS;
1257
return XML_TOK_DATA_CHARS;
1261
PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1262
const char *end, const char **nextTokPtr)
1266
return XML_TOK_NONE;
1268
while (ptr != end) {
1269
switch (BYTE_TYPE(enc, ptr)) {
1270
#define LEAD_CASE(n) \
1271
case BT_LEAD ## n: ptr += n; break;
1272
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1276
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1278
return XML_TOK_DATA_CHARS;
1281
int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1283
return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1286
return XML_TOK_DATA_CHARS;
1289
*nextTokPtr = ptr + MINBPC(enc);
1290
return XML_TOK_DATA_NEWLINE;
1293
return XML_TOK_DATA_CHARS;
1298
return XML_TOK_TRAILING_CR;
1299
if (BYTE_TYPE(enc, ptr) == BT_LF)
1302
return XML_TOK_DATA_NEWLINE;
1305
return XML_TOK_DATA_CHARS;
1312
return XML_TOK_DATA_CHARS;
1318
PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1319
const char *end, const char **nextTokPtr)
1322
if (MINBPC(enc) > 1) {
1323
size_t n = end - ptr;
1324
if (n & (MINBPC(enc) - 1)) {
1325
n &= ~(MINBPC(enc) - 1);
1329
while (ptr != end) {
1330
switch (BYTE_TYPE(enc, ptr)) {
1331
INVALID_CASES(ptr, nextTokPtr)
1333
if ((ptr += MINBPC(enc)) == end)
1334
return XML_TOK_PARTIAL;
1335
if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1336
if ((ptr += MINBPC(enc)) == end)
1337
return XML_TOK_PARTIAL;
1338
if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1345
if ((ptr += MINBPC(enc)) == end)
1346
return XML_TOK_PARTIAL;
1347
if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1348
if ((ptr += MINBPC(enc)) == end)
1349
return XML_TOK_PARTIAL;
1350
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1354
return XML_TOK_IGNORE_SECT;
1365
return XML_TOK_PARTIAL;
1368
#endif /* XML_DTD */
1371
PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1372
const char **badPtr)
1376
for (; ptr != end; ptr += MINBPC(enc)) {
1377
switch (BYTE_TYPE(enc, ptr)) {
1401
if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1408
if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1411
switch (BYTE_TO_ASCII(enc, ptr)) {
1425
/* This must only be called for a well-formed start-tag or empty
1426
element tag. Returns the number of attributes. Pointers to the
1427
first attsMax attributes are stored in atts.
1431
PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1432
int attsMax, ATTRIBUTE *atts)
1434
enum { other, inName, inValue } state = inName;
1436
int open = 0; /* defined when state == inValue;
1437
initialization just to shut up compilers */
1439
for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1440
switch (BYTE_TYPE(enc, ptr)) {
1441
#define START_NAME \
1442
if (state == other) { \
1443
if (nAtts < attsMax) { \
1444
atts[nAtts].name = ptr; \
1445
atts[nAtts].normalized = 1; \
1449
#define LEAD_CASE(n) \
1450
case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1451
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1460
if (state != inValue) {
1461
if (nAtts < attsMax)
1462
atts[nAtts].valuePtr = ptr + MINBPC(enc);
1466
else if (open == BT_QUOT) {
1468
if (nAtts < attsMax)
1469
atts[nAtts].valueEnd = ptr;
1474
if (state != inValue) {
1475
if (nAtts < attsMax)
1476
atts[nAtts].valuePtr = ptr + MINBPC(enc);
1480
else if (open == BT_APOS) {
1482
if (nAtts < attsMax)
1483
atts[nAtts].valueEnd = ptr;
1488
if (nAtts < attsMax)
1489
atts[nAtts].normalized = 0;
1492
if (state == inName)
1494
else if (state == inValue
1496
&& atts[nAtts].normalized
1497
&& (ptr == atts[nAtts].valuePtr
1498
|| BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1499
|| BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1500
|| BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1501
atts[nAtts].normalized = 0;
1503
case BT_CR: case BT_LF:
1504
/* This case ensures that the first attribute name is counted
1505
Apart from that we could just change state on the quote. */
1506
if (state == inName)
1508
else if (state == inValue && nAtts < attsMax)
1509
atts[nAtts].normalized = 0;
1513
if (state != inValue)
1523
static int PTRFASTCALL
1524
PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1528
ptr += 2*MINBPC(enc);
1529
if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1530
for (ptr += MINBPC(enc);
1531
!CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1532
ptr += MINBPC(enc)) {
1533
int c = BYTE_TO_ASCII(enc, ptr);
1535
case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1536
case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1538
result |= (c - ASCII_0);
1540
case ASCII_A: case ASCII_B: case ASCII_C:
1541
case ASCII_D: case ASCII_E: case ASCII_F:
1543
result += 10 + (c - ASCII_A);
1545
case ASCII_a: case ASCII_b: case ASCII_c:
1546
case ASCII_d: case ASCII_e: case ASCII_f:
1548
result += 10 + (c - ASCII_a);
1551
if (result >= 0x110000)
1556
for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1557
int c = BYTE_TO_ASCII(enc, ptr);
1559
result += (c - ASCII_0);
1560
if (result >= 0x110000)
1564
return checkCharRefNumber(result);
1568
PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1571
switch ((end - ptr)/MINBPC(enc)) {
1573
if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1574
switch (BYTE_TO_ASCII(enc, ptr)) {
1583
if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1585
if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1587
if (CHAR_MATCHES(enc, ptr, ASCII_p))
1593
switch (BYTE_TO_ASCII(enc, ptr)) {
1596
if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1598
if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1600
if (CHAR_MATCHES(enc, ptr, ASCII_t))
1607
if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1609
if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1611
if (CHAR_MATCHES(enc, ptr, ASCII_s))
1622
PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1625
switch (BYTE_TYPE(enc, ptr1)) {
1626
#define LEAD_CASE(n) \
1627
case BT_LEAD ## n: \
1628
if (*ptr1++ != *ptr2++) \
1630
LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1633
if (*ptr1++ != *ptr2++)
1645
if (*ptr2++ != *ptr1++)
1647
if (MINBPC(enc) > 1) {
1648
if (*ptr2++ != *ptr1++)
1650
if (MINBPC(enc) > 2) {
1651
if (*ptr2++ != *ptr1++)
1653
if (MINBPC(enc) > 3) {
1654
if (*ptr2++ != *ptr1++)
1661
if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1663
switch (BYTE_TYPE(enc, ptr2)) {
1686
PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1687
const char *end1, const char *ptr2)
1689
for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1692
if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1695
return ptr1 == end1;
1698
static int PTRFASTCALL
1699
PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1701
const char *start = ptr;
1703
switch (BYTE_TYPE(enc, ptr)) {
1704
#define LEAD_CASE(n) \
1705
case BT_LEAD ## n: ptr += n; break;
1706
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1720
return (int)(ptr - start);
1725
static const char * PTRFASTCALL
1726
PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1729
switch (BYTE_TYPE(enc, ptr)) {
1742
PREFIX(updatePosition)(const ENCODING *enc,
1747
while (ptr != end) {
1748
switch (BYTE_TYPE(enc, ptr)) {
1749
#define LEAD_CASE(n) \
1750
case BT_LEAD ## n: \
1753
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1756
pos->columnNumber = (XML_Size)-1;
1763
if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1765
pos->columnNumber = (XML_Size)-1;
1771
pos->columnNumber++;
1776
#undef MULTIBYTE_CASES
1777
#undef INVALID_CASES
1778
#undef CHECK_NAME_CASE
1779
#undef CHECK_NAME_CASES
1780
#undef CHECK_NMSTRT_CASE
1781
#undef CHECK_NMSTRT_CASES
1783
#endif /* XML_TOK_IMPL_C */