1
/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2
See the file COPYING for copying permission.
7
#ifdef COMPILED_FROM_DSP
9
#elif defined(MACOS_CLASSIC)
10
#include "macconfig.h"
11
#elif defined(__amigaos__)
12
#include "amigaconfig.h"
13
#elif defined(__WATCOMC__)
14
#include "watcomconfig.h"
16
#ifdef HAVE_EXPAT_CONFIG_H
17
#include <expat_config.h>
19
#endif /* ndef COMPILED_FROM_DSP */
21
#include "expat_external.h"
27
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
29
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
33
{ PREFIX(prologTok), PREFIX(contentTok), \
34
PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
35
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
37
PREFIX(nameMatchesAscii), \
41
PREFIX(charRefNumber), \
42
PREFIX(predefinedEntityName), \
43
PREFIX(updatePosition), \
46
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
48
/* Test one bit of namingBitmap for the 16-bit character (hi << 8 | lo).
   pages[hi] selects a group of 8 bitmap words (hence << 3), the top 3 bits
   of lo select the word inside that group, and the low 5 bits of lo select
   the bit.  The result is nonzero when the bit is set.
   The mask is built from 1u so that selecting bit 31 does not shift a
   signed int into its sign bit, which is undefined behaviour in C.
   `pages` and `hi` are parenthesized so expression arguments expand safely. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
51
/* A 2 byte UTF-8 representation splits the character's 11 bits between
52
the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
53
pages, 3 bits to add to that index and 5 bits to generate the mask.
55
/* Naming-bitmap lookup for a 2 byte UTF-8 sequence at `byte`.
   Bits 2..4 of byte[0] index `pages`; the low 2 bits of byte[0] together
   with bit 5 of byte[1] pick the word within the selected group of 8; the
   low 5 bits of byte[1] pick the bit.  Nonzero when the bit is set.
   The mask uses 1u so that bit 31 can be selected without shifting a
   signed int into its sign bit (undefined behaviour). */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
61
/* A 3 byte UTF-8 representation splits the character's 16 bits between
62
the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
63
into pages, 3 bits to add to that index and 5 bits to generate the
66
#define UTF8_GET_NAMING3(pages, byte) \
67
(namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
68
+ ((((byte)[1]) >> 2) & 0xF)] \
70
+ ((((byte)[1]) & 3) << 1) \
71
+ ((((byte)[2]) >> 5) & 1)] \
72
& (1 << (((byte)[2]) & 0x1F)))
74
#define UTF8_GET_NAMING(pages, p, n) \
76
? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
78
? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
81
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
82
of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
83
with the additional restriction of not allowing the Unicode
84
code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
85
Implementation details:
86
(A & 0x80) == 0 means A < 0x80
88
(A & 0xC0) == 0xC0 means A > 0xBF
91
/* Nonzero when the 2 byte sequence at p (unsigned bytes) is not valid:
   the lead byte is below 0xC2, or the second byte is not a continuation
   byte (it is < 0x80 or > 0xBF).
   The lead-byte dereference is written *(p) so that an expression argument
   such as `s + 1` is dereferenced as a whole instead of expanding to
   `*s + 1`. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
94
#define UTF8_INVALID3(p) \
95
(((p)[2] & 0x80) == 0 \
97
((*p) == 0xEF && (p)[1] == 0xBF \
101
((p)[2] & 0xC0) == 0xC0) \
105
(p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
107
((p)[1] & 0x80) == 0 \
109
((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
111
#define UTF8_INVALID4(p) \
112
(((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
114
((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
118
(p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
120
((p)[1] & 0x80) == 0 \
122
((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
124
static int PTRFASTCALL
125
isNever(const ENCODING *enc, const char *p)
130
static int PTRFASTCALL
131
utf8_isName2(const ENCODING *enc, const char *p)
133
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
136
static int PTRFASTCALL
137
utf8_isName3(const ENCODING *enc, const char *p)
139
return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
142
#define utf8_isName4 isNever
144
static int PTRFASTCALL
145
utf8_isNmstrt2(const ENCODING *enc, const char *p)
147
return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
150
static int PTRFASTCALL
151
utf8_isNmstrt3(const ENCODING *enc, const char *p)
153
return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
156
#define utf8_isNmstrt4 isNever
158
static int PTRFASTCALL
159
utf8_isInvalid2(const ENCODING *enc, const char *p)
161
return UTF8_INVALID2((const unsigned char *)p);
164
static int PTRFASTCALL
165
utf8_isInvalid3(const ENCODING *enc, const char *p)
167
return UTF8_INVALID3((const unsigned char *)p);
170
static int PTRFASTCALL
171
utf8_isInvalid4(const ENCODING *enc, const char *p)
173
return UTF8_INVALID4((const unsigned char *)p);
176
struct normal_encoding {
178
unsigned char type[256];
180
int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
181
int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
182
int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
183
int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
184
int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
185
#endif /* XML_MIN_SIZE */
186
int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
187
int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
188
int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
189
int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
190
int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
191
int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
192
int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
193
int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
194
int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
197
/* View an encoding pointer as a pointer to const struct normal_encoding.
   NOTE(review): this presumably relies on the ENCODING object being the
   first member of struct normal_encoding -- confirm against the struct
   definition. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
201
#define STANDARD_VTABLE(E) \
210
#define STANDARD_VTABLE(E) /* as nothing */
214
#define NORMAL_VTABLE(E) \
225
static int FASTCALL checkCharRefNumber(int);
227
#include "xmltok_impl.h"
231
#define sb_isNameMin isNever
232
#define sb_isNmstrtMin isNever
236
#define MINBPC(enc) ((enc)->minBytesPerChar)
238
/* minimum bytes per character */
239
#define MINBPC(enc) 1
242
/* Byte type of the single byte at p: index the encoding's 256-entry type[]
   table with the byte taken as unsigned char (so bytes >= 0x80 do not
   produce a negative index where char is signed).
   The cast keeps the const qualifier, matching AS_NORMAL_ENCODING; the
   table is only read here. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
246
static int PTRFASTCALL
247
sb_byteType(const ENCODING *enc, const char *p)
249
return SB_BYTE_TYPE(enc, p);
251
#define BYTE_TYPE(enc, p) \
252
(AS_NORMAL_ENCODING(enc)->byteType(enc, p))
254
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
258
#define BYTE_TO_ASCII(enc, p) \
259
(AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
260
static int PTRFASTCALL
261
sb_byteToAscii(const ENCODING *enc, const char *p)
266
#define BYTE_TO_ASCII(enc, p) (*(p))
269
#define IS_NAME_CHAR(enc, p, n) \
270
(AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
271
#define IS_NMSTRT_CHAR(enc, p, n) \
272
(AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
273
#define IS_INVALID_CHAR(enc, p, n) \
274
(AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
277
#define IS_NAME_CHAR_MINBPC(enc, p) \
278
(AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
279
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
280
(AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
282
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
283
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
287
#define CHAR_MATCHES(enc, p, c) \
288
(AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
290
sb_charMatches(const ENCODING *enc, const char *p, int c)
295
/* c is an ASCII character */
296
/* True when the byte at p equals c.  `enc` is unused in this variant.
   c is parenthesized so that an expression argument (for example a
   conditional expression) is compared as a whole. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
299
#define PREFIX(ident) normal_ ## ident
300
#define XML_TOK_IMPL_C
301
#include "xmltok_impl.c"
302
#undef XML_TOK_IMPL_C
309
#undef IS_NAME_CHAR_MINBPC
310
#undef IS_NMSTRT_CHAR
311
#undef IS_NMSTRT_CHAR_MINBPC
312
#undef IS_INVALID_CHAR
314
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
322
utf8_toUtf8(const ENCODING *enc,
323
const char **fromP, const char *fromLim,
324
char **toP, const char *toLim)
328
if (fromLim - *fromP > toLim - *toP) {
329
/* Avoid copying partial characters. */
330
for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
331
if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
334
for (to = *toP, from = *fromP; from != fromLim; from++, to++)
341
utf8_toUtf16(const ENCODING *enc,
342
const char **fromP, const char *fromLim,
343
unsigned short **toP, const unsigned short *toLim)
345
unsigned short *to = *toP;
346
const char *from = *fromP;
347
while (from != fromLim && to != toLim) {
348
switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
350
*to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
354
*to++ = (unsigned short)(((from[0] & 0xf) << 12)
355
| ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
363
n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
364
| ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
366
to[0] = (unsigned short)((n >> 10) | 0xD800);
367
to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
383
static const struct normal_encoding utf8_encoding_ns = {
384
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
386
#include "asciitab.h"
389
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
393
static const struct normal_encoding utf8_encoding = {
394
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
396
#define BT_COLON BT_NMSTRT
397
#include "asciitab.h"
401
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
406
static const struct normal_encoding internal_utf8_encoding_ns = {
407
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
409
#include "iasciitab.h"
412
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
417
static const struct normal_encoding internal_utf8_encoding = {
418
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
420
#define BT_COLON BT_NMSTRT
421
#include "iasciitab.h"
425
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
429
latin1_toUtf8(const ENCODING *enc,
430
const char **fromP, const char *fromLim,
431
char **toP, const char *toLim)
435
if (*fromP == fromLim)
437
c = (unsigned char)**fromP;
439
if (toLim - *toP < 2)
441
*(*toP)++ = (char)((c >> 6) | UTF8_cval2);
442
*(*toP)++ = (char)((c & 0x3f) | 0x80);
448
*(*toP)++ = *(*fromP)++;
454
latin1_toUtf16(const ENCODING *enc,
455
const char **fromP, const char *fromLim,
456
unsigned short **toP, const unsigned short *toLim)
458
while (*fromP != fromLim && *toP != toLim)
459
*(*toP)++ = (unsigned char)*(*fromP)++;
464
static const struct normal_encoding latin1_encoding_ns = {
465
{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
467
#include "asciitab.h"
468
#include "latin1tab.h"
475
static const struct normal_encoding latin1_encoding = {
476
{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
478
#define BT_COLON BT_NMSTRT
479
#include "asciitab.h"
481
#include "latin1tab.h"
487
ascii_toUtf8(const ENCODING *enc,
488
const char **fromP, const char *fromLim,
489
char **toP, const char *toLim)
491
while (*fromP != fromLim && *toP != toLim)
492
*(*toP)++ = *(*fromP)++;
497
static const struct normal_encoding ascii_encoding_ns = {
498
{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
500
#include "asciitab.h"
508
static const struct normal_encoding ascii_encoding = {
509
{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
511
#define BT_COLON BT_NMSTRT
512
#include "asciitab.h"
519
static int PTRFASTCALL
520
unicode_byte_type(char hi, char lo)
522
switch ((unsigned char)hi) {
523
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
525
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
528
switch ((unsigned char)lo) {
538
#define DEFINE_UTF16_TO_UTF8(E) \
539
static void PTRCALL \
540
E ## toUtf8(const ENCODING *enc, \
541
const char **fromP, const char *fromLim, \
542
char **toP, const char *toLim) \
545
for (from = *fromP; from != fromLim; from += 2) { \
548
unsigned char lo = GET_LO(from); \
549
unsigned char hi = GET_HI(from); \
553
if (*toP == toLim) { \
561
case 0x1: case 0x2: case 0x3: \
562
case 0x4: case 0x5: case 0x6: case 0x7: \
563
if (toLim - *toP < 2) { \
567
*(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
568
*(*toP)++ = ((lo & 0x3f) | 0x80); \
571
if (toLim - *toP < 3) { \
575
/* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
576
*(*toP)++ = ((hi >> 4) | UTF8_cval3); \
577
*(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
578
*(*toP)++ = ((lo & 0x3f) | 0x80); \
580
case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
581
if (toLim - *toP < 4) { \
585
plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
586
*(*toP)++ = ((plane >> 2) | UTF8_cval4); \
587
*(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
589
lo2 = GET_LO(from); \
590
*(*toP)++ = (((lo & 0x3) << 4) \
591
| ((GET_HI(from) & 0x3) << 2) \
594
*(*toP)++ = ((lo2 & 0x3f) | 0x80); \
601
#define DEFINE_UTF16_TO_UTF16(E) \
602
static void PTRCALL \
603
E ## toUtf16(const ENCODING *enc, \
604
const char **fromP, const char *fromLim, \
605
unsigned short **toP, const unsigned short *toLim) \
607
/* Avoid copying first half only of surrogate */ \
608
if (fromLim - *fromP > ((toLim - *toP) << 1) \
609
&& (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
611
for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
612
*(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
615
#define SET2(ptr, ch) \
616
(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
617
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
618
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
620
DEFINE_UTF16_TO_UTF8(little2_)
621
DEFINE_UTF16_TO_UTF16(little2_)
627
#define SET2(ptr, ch) \
628
(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
629
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
630
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
632
DEFINE_UTF16_TO_UTF8(big2_)
633
DEFINE_UTF16_TO_UTF16(big2_)
639
#define LITTLE2_BYTE_TYPE(enc, p) \
641
? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
642
: unicode_byte_type((p)[1], (p)[0]))
643
/* For the 2-byte unit at p: yields p[0] when p[1] is zero, otherwise -1.
   `enc` is not used. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
644
/* True when the 2-byte unit at p has p[1] == 0 and p[0] equal to c.
   `enc` is not used.  c is parenthesized so that an expression argument
   does not re-associate with the surrounding && and ==. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
645
/* Name-character test for the 2-byte unit at p: passes p[1] as the high
   byte and p[0] as the low byte to UCS2_GET_NAMING against namePages.
   `enc` is not used.  p is parenthesized so that a pointer expression such
   as `s + 1` is indexed as a whole. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
647
/* Name-start-character test for the 2-byte unit at p: passes p[1] as the
   high byte and p[0] as the low byte to UCS2_GET_NAMING against
   nmstrtPages.  `enc` is not used.  p is parenthesized so that a pointer
   expression such as `s + 1` is indexed as a whole. */
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
652
static int PTRFASTCALL
653
little2_byteType(const ENCODING *enc, const char *p)
655
return LITTLE2_BYTE_TYPE(enc, p);
658
static int PTRFASTCALL
659
little2_byteToAscii(const ENCODING *enc, const char *p)
661
return LITTLE2_BYTE_TO_ASCII(enc, p);
665
little2_charMatches(const ENCODING *enc, const char *p, int c)
667
return LITTLE2_CHAR_MATCHES(enc, p, c);
670
static int PTRFASTCALL
671
little2_isNameMin(const ENCODING *enc, const char *p)
673
return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
676
static int PTRFASTCALL
677
little2_isNmstrtMin(const ENCODING *enc, const char *p)
679
return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
683
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
685
#else /* not XML_MIN_SIZE */
688
#define PREFIX(ident) little2_ ## ident
689
#define MINBPC(enc) 2
690
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
691
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
692
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
693
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
694
#define IS_NAME_CHAR(enc, p, n) 0
695
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
696
#define IS_NMSTRT_CHAR(enc, p, n) (0)
697
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
699
#define XML_TOK_IMPL_C
700
#include "xmltok_impl.c"
701
#undef XML_TOK_IMPL_C
708
#undef IS_NAME_CHAR_MINBPC
709
#undef IS_NMSTRT_CHAR
710
#undef IS_NMSTRT_CHAR_MINBPC
711
#undef IS_INVALID_CHAR
713
#endif /* not XML_MIN_SIZE */
717
static const struct normal_encoding little2_encoding_ns = {
719
#if BYTEORDER == 1234
726
#include "asciitab.h"
727
#include "latin1tab.h"
729
STANDARD_VTABLE(little2_)
734
static const struct normal_encoding little2_encoding = {
736
#if BYTEORDER == 1234
743
#define BT_COLON BT_NMSTRT
744
#include "asciitab.h"
746
#include "latin1tab.h"
748
STANDARD_VTABLE(little2_)
751
#if BYTEORDER != 4321
755
static const struct normal_encoding internal_little2_encoding_ns = {
758
#include "iasciitab.h"
759
#include "latin1tab.h"
761
STANDARD_VTABLE(little2_)
766
static const struct normal_encoding internal_little2_encoding = {
769
#define BT_COLON BT_NMSTRT
770
#include "iasciitab.h"
772
#include "latin1tab.h"
774
STANDARD_VTABLE(little2_)
780
#define BIG2_BYTE_TYPE(enc, p) \
782
? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
783
: unicode_byte_type((p)[0], (p)[1]))
784
/* For the 2-byte unit at p: yields p[1] when p[0] is zero, otherwise -1.
   `enc` is not used. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
785
/* True when the 2-byte unit at p has p[0] == 0 and p[1] equal to c.
   `enc` is not used.  c is parenthesized so that an expression argument
   does not re-associate with the surrounding && and ==. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
786
/* Name-character test for the 2-byte unit at p: passes p[0] as the high
   byte and p[1] as the low byte to UCS2_GET_NAMING against namePages.
   `enc` is not used.  p is parenthesized so that a pointer expression such
   as `s + 1` is indexed as a whole. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
788
/* Name-start-character test for the 2-byte unit at p: passes p[0] as the
   high byte and p[1] as the low byte to UCS2_GET_NAMING against
   nmstrtPages.  `enc` is not used.  p is parenthesized so that a pointer
   expression such as `s + 1` is indexed as a whole. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
793
static int PTRFASTCALL
794
big2_byteType(const ENCODING *enc, const char *p)
796
return BIG2_BYTE_TYPE(enc, p);
799
static int PTRFASTCALL
800
big2_byteToAscii(const ENCODING *enc, const char *p)
802
return BIG2_BYTE_TO_ASCII(enc, p);
806
big2_charMatches(const ENCODING *enc, const char *p, int c)
808
return BIG2_CHAR_MATCHES(enc, p, c);
811
static int PTRFASTCALL
812
big2_isNameMin(const ENCODING *enc, const char *p)
814
return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
817
static int PTRFASTCALL
818
big2_isNmstrtMin(const ENCODING *enc, const char *p)
820
return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
824
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
826
#else /* not XML_MIN_SIZE */
829
#define PREFIX(ident) big2_ ## ident
830
#define MINBPC(enc) 2
831
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
832
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
833
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
834
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
835
#define IS_NAME_CHAR(enc, p, n) 0
836
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
837
#define IS_NMSTRT_CHAR(enc, p, n) (0)
838
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
840
#define XML_TOK_IMPL_C
841
#include "xmltok_impl.c"
842
#undef XML_TOK_IMPL_C
849
#undef IS_NAME_CHAR_MINBPC
850
#undef IS_NMSTRT_CHAR
851
#undef IS_NMSTRT_CHAR_MINBPC
852
#undef IS_INVALID_CHAR
854
#endif /* not XML_MIN_SIZE */
858
static const struct normal_encoding big2_encoding_ns = {
860
#if BYTEORDER == 4321
867
#include "asciitab.h"
868
#include "latin1tab.h"
870
STANDARD_VTABLE(big2_)
875
static const struct normal_encoding big2_encoding = {
877
#if BYTEORDER == 4321
884
#define BT_COLON BT_NMSTRT
885
#include "asciitab.h"
887
#include "latin1tab.h"
889
STANDARD_VTABLE(big2_)
892
#if BYTEORDER != 1234
896
static const struct normal_encoding internal_big2_encoding_ns = {
899
#include "iasciitab.h"
900
#include "latin1tab.h"
902
STANDARD_VTABLE(big2_)
907
static const struct normal_encoding internal_big2_encoding = {
910
#define BT_COLON BT_NMSTRT
911
#include "iasciitab.h"
913
#include "latin1tab.h"
915
STANDARD_VTABLE(big2_)
923
streqci(const char *s1, const char *s2)
928
if (ASCII_a <= c1 && c1 <= ASCII_z)
929
c1 += ASCII_A - ASCII_a;
930
if (ASCII_a <= c2 && c2 <= ASCII_z)
931
c2 += ASCII_A - ASCII_a;
941
initUpdatePosition(const ENCODING *enc, const char *ptr,
942
const char *end, POSITION *pos)
944
normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
948
toAscii(const ENCODING *enc, const char *ptr, const char *end)
952
XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
972
/* Return 1 if there's just optional white space or there's an S
973
followed by name=val.
976
parsePseudoAttribute(const ENCODING *enc,
979
const char **namePtr,
980
const char **nameEndPtr,
982
const char **nextTokPtr)
990
if (!isSpace(toAscii(enc, ptr, end))) {
995
ptr += enc->minBytesPerChar;
996
} while (isSpace(toAscii(enc, ptr, end)));
1003
c = toAscii(enc, ptr, end);
1008
if (c == ASCII_EQUALS) {
1015
ptr += enc->minBytesPerChar;
1016
} while (isSpace(c = toAscii(enc, ptr, end)));
1017
if (c != ASCII_EQUALS) {
1023
ptr += enc->minBytesPerChar;
1025
if (ptr == *namePtr) {
1029
ptr += enc->minBytesPerChar;
1030
c = toAscii(enc, ptr, end);
1031
while (isSpace(c)) {
1032
ptr += enc->minBytesPerChar;
1033
c = toAscii(enc, ptr, end);
1035
if (c != ASCII_QUOT && c != ASCII_APOS) {
1040
ptr += enc->minBytesPerChar;
1042
for (;; ptr += enc->minBytesPerChar) {
1043
c = toAscii(enc, ptr, end);
1046
if (!(ASCII_a <= c && c <= ASCII_z)
1047
&& !(ASCII_A <= c && c <= ASCII_Z)
1048
&& !(ASCII_0 <= c && c <= ASCII_9)
1049
&& c != ASCII_PERIOD
1051
&& c != ASCII_UNDERSCORE) {
1056
*nextTokPtr = ptr + enc->minBytesPerChar;
1060
static const char KW_version[] = {
1061
ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1064
static const char KW_encoding[] = {
1065
ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1068
static const char KW_standalone[] = {
1069
ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1070
ASCII_n, ASCII_e, '\0'
1073
static const char KW_yes[] = {
1074
ASCII_y, ASCII_e, ASCII_s, '\0'
1077
static const char KW_no[] = {
1078
ASCII_n, ASCII_o, '\0'
1082
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1085
int isGeneralTextEntity,
1086
const ENCODING *enc,
1089
const char **badPtr,
1090
const char **versionPtr,
1091
const char **versionEndPtr,
1092
const char **encodingName,
1093
const ENCODING **encoding,
1096
const char *val = NULL;
1097
const char *name = NULL;
1098
const char *nameEnd = NULL;
1099
ptr += 5 * enc->minBytesPerChar;
1100
end -= 2 * enc->minBytesPerChar;
1101
if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1106
if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1107
if (!isGeneralTextEntity) {
1116
*versionEndPtr = ptr;
1117
if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1122
if (isGeneralTextEntity) {
1123
/* a TextDecl must have an EncodingDecl */
1130
if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1131
int c = toAscii(enc, val, end);
1132
if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1137
*encodingName = val;
1139
*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1140
if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1147
if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1148
|| isGeneralTextEntity) {
1152
if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1156
else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1164
while (isSpace(toAscii(enc, ptr, end)))
1165
ptr += enc->minBytesPerChar;
1174
checkCharRefNumber(int result)
1176
switch (result >> 8) {
1177
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1178
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1181
if (latin1_encoding.type[result] == BT_NONXML)
1185
if (result == 0xFFFE || result == 0xFFFF)
1193
XmlUtf8Encode(int c, char *buf)
1196
/* minN is minimum legal resulting value for N byte sequence */
1205
buf[0] = (char)(c | UTF8_cval1);
1209
buf[0] = (char)((c >> 6) | UTF8_cval2);
1210
buf[1] = (char)((c & 0x3f) | 0x80);
1214
buf[0] = (char)((c >> 12) | UTF8_cval3);
1215
buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1216
buf[2] = (char)((c & 0x3f) | 0x80);
1220
buf[0] = (char)((c >> 18) | UTF8_cval4);
1221
buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1222
buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1223
buf[3] = (char)((c & 0x3f) | 0x80);
1230
XmlUtf16Encode(int charNum, unsigned short *buf)
1234
if (charNum < 0x10000) {
1235
buf[0] = (unsigned short)charNum;
1238
if (charNum < 0x110000) {
1240
buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1241
buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1247
struct unknown_encoding {
1248
struct normal_encoding normal;
1251
unsigned short utf16[256];
1255
/* View an encoding pointer as a pointer to const struct unknown_encoding.
   NOTE(review): presumably valid only for encodings that were set up by
   XmlInitUnknownEncoding, with `normal` as the struct's leading member --
   confirm against the struct definition and callers. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1258
XmlSizeOfUnknownEncoding(void)
1260
return sizeof(struct unknown_encoding);
1263
static int PTRFASTCALL
1264
unknown_isName(const ENCODING *enc, const char *p)
1266
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1267
int c = uenc->convert(uenc->userData, p);
1270
return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1273
static int PTRFASTCALL
1274
unknown_isNmstrt(const ENCODING *enc, const char *p)
1276
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1277
int c = uenc->convert(uenc->userData, p);
1280
return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1283
static int PTRFASTCALL
1284
unknown_isInvalid(const ENCODING *enc, const char *p)
1286
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1287
int c = uenc->convert(uenc->userData, p);
1288
return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1292
unknown_toUtf8(const ENCODING *enc,
1293
const char **fromP, const char *fromLim,
1294
char **toP, const char *toLim)
1296
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1297
char buf[XML_UTF8_ENCODE_MAX];
1301
if (*fromP == fromLim)
1303
utf8 = uenc->utf8[(unsigned char)**fromP];
1306
int c = uenc->convert(uenc->userData, *fromP);
1307
n = XmlUtf8Encode(c, buf);
1308
if (n > toLim - *toP)
1311
*fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1315
if (n > toLim - *toP)
1320
*(*toP)++ = *utf8++;
1326
unknown_toUtf16(const ENCODING *enc,
1327
const char **fromP, const char *fromLim,
1328
unsigned short **toP, const unsigned short *toLim)
1330
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331
while (*fromP != fromLim && *toP != toLim) {
1332
unsigned short c = uenc->utf16[(unsigned char)**fromP];
1334
c = (unsigned short)
1335
uenc->convert(uenc->userData, *fromP);
1336
*fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1346
XmlInitUnknownEncoding(void *mem,
1352
struct unknown_encoding *e = (struct unknown_encoding *)mem;
1353
for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1354
((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1355
for (i = 0; i < 128; i++)
1356
if (latin1_encoding.type[i] != BT_OTHER
1357
&& latin1_encoding.type[i] != BT_NONXML
1360
for (i = 0; i < 256; i++) {
1363
e->normal.type[i] = BT_MALFORM;
1364
/* This shouldn't really get used. */
1365
e->utf16[i] = 0xFFFF;
1372
e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1376
else if (c < 0x80) {
1377
if (latin1_encoding.type[c] != BT_OTHER
1378
&& latin1_encoding.type[c] != BT_NONXML
1381
e->normal.type[i] = latin1_encoding.type[c];
1383
e->utf8[i][1] = (char)c;
1384
e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1386
else if (checkCharRefNumber(c) < 0) {
1387
e->normal.type[i] = BT_NONXML;
1388
/* This shouldn't really get used. */
1389
e->utf16[i] = 0xFFFF;
1396
if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1397
e->normal.type[i] = BT_NMSTRT;
1398
else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1399
e->normal.type[i] = BT_NAME;
1401
e->normal.type[i] = BT_OTHER;
1402
e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1403
e->utf16[i] = (unsigned short)c;
1406
e->userData = userData;
1407
e->convert = convert;
1409
e->normal.isName2 = unknown_isName;
1410
e->normal.isName3 = unknown_isName;
1411
e->normal.isName4 = unknown_isName;
1412
e->normal.isNmstrt2 = unknown_isNmstrt;
1413
e->normal.isNmstrt3 = unknown_isNmstrt;
1414
e->normal.isNmstrt4 = unknown_isNmstrt;
1415
e->normal.isInvalid2 = unknown_isInvalid;
1416
e->normal.isInvalid3 = unknown_isInvalid;
1417
e->normal.isInvalid4 = unknown_isInvalid;
1419
e->normal.enc.utf8Convert = unknown_toUtf8;
1420
e->normal.enc.utf16Convert = unknown_toUtf16;
1421
return &(e->normal.enc);
1424
/* If this enumeration is changed, getEncodingIndex and encodings
1425
must also be changed. */
1434
/* must match encodingNames up to here */
1438
static const char KW_ISO_8859_1[] = {
1439
ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1440
ASCII_MINUS, ASCII_1, '\0'
1442
static const char KW_US_ASCII[] = {
1443
ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1446
static const char KW_UTF_8[] = {
1447
ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1449
static const char KW_UTF_16[] = {
1450
ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1452
static const char KW_UTF_16BE[] = {
1453
ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1456
static const char KW_UTF_16LE[] = {
1457
ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1462
getEncodingIndex(const char *name)
1464
static const char * const encodingNames[] = {
1475
for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1476
if (streqci(name, encodingNames[i]))
1481
/* For binary compatibility, we store the index of the encoding
1482
specified at initialization in the isUtf16 member.
1485
/* Read back, as an int, the value held in the isUtf16 member of
   enc->initEnc; that member is where SET_INIT_ENC_INDEX stores the
   encoding index. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
1486
/* Store encoding index i, narrowed to char, in enc->initEnc.isUtf16.
   i is parenthesized so the cast applies to the whole argument expression
   rather than only to its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1488
/* This is what detects the encoding. encodingTable maps from
1489
encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1490
the external (protocol) specified encoding; state is
1491
XML_CONTENT_STATE if we're parsing an external text entity, and
1492
XML_PROLOG_STATE otherwise.
1497
initScan(const ENCODING * const *encodingTable,
1498
const INIT_ENCODING *enc,
1502
const char **nextTokPtr)
1504
const ENCODING **encPtr;
1507
return XML_TOK_NONE;
1508
encPtr = enc->encPtr;
1509
if (ptr + 1 == end) {
1510
/* only a single byte available for auto-detection */
1511
#ifndef XML_DTD /* FIXME */
1512
/* a well-formed document entity must have more than one byte */
1513
if (state != XML_CONTENT_STATE)
1514
return XML_TOK_PARTIAL;
1516
/* so we're parsing an external text entity... */
1517
/* if UTF-16 was externally specified, then we need at least 2 bytes */
1518
switch (INIT_ENC_INDEX(enc)) {
1522
return XML_TOK_PARTIAL;
1524
switch ((unsigned char)*ptr) {
1527
case 0xEF: /* possibly first byte of UTF-8 BOM */
1528
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1529
&& state == XML_CONTENT_STATE)
1534
return XML_TOK_PARTIAL;
1538
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1540
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1541
&& state == XML_CONTENT_STATE)
1543
*nextTokPtr = ptr + 2;
1544
*encPtr = encodingTable[UTF_16BE_ENC];
1546
/* 00 3C is handled in the default case */
1548
if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1549
|| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1550
&& state == XML_CONTENT_STATE)
1552
*encPtr = encodingTable[UTF_16LE_ENC];
1553
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1555
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1556
&& state == XML_CONTENT_STATE)
1558
*nextTokPtr = ptr + 2;
1559
*encPtr = encodingTable[UTF_16LE_ENC];
1562
/* Maybe a UTF-8 BOM (EF BB BF) */
1563
/* If there's an explicitly specified (external) encoding
1564
of ISO-8859-1 or some flavour of UTF-16
1565
and this is an external text entity,
1566
don't look for the BOM,
1567
because it might be legal data.
1569
if (state == XML_CONTENT_STATE) {
1570
int e = INIT_ENC_INDEX(enc);
1571
if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1572
|| e == UTF_16LE_ENC || e == UTF_16_ENC)
1576
return XML_TOK_PARTIAL;
1577
if ((unsigned char)ptr[2] == 0xBF) {
1578
*nextTokPtr = ptr + 3;
1579
*encPtr = encodingTable[UTF_8_ENC];
1584
if (ptr[0] == '\0') {
1585
/* 0 isn't a legal data character. Furthermore a document
1586
entity can only start with ASCII characters. So the only
1587
way this can fail to be big-endian UTF-16 is if it is an
1588
external parsed general entity that's labelled as
1591
if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1593
*encPtr = encodingTable[UTF_16BE_ENC];
1594
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1596
else if (ptr[1] == '\0') {
1597
/* We could recover here in the case:
1598
- parsing an external entity
1600
- no externally specified encoding
1601
- no encoding declaration
1602
by assuming UTF-16LE. But we don't, because this would mean when
1603
presented just with a single byte, we couldn't reliably determine
1604
whether we needed further bytes.
1606
if (state == XML_CONTENT_STATE)
1608
*encPtr = encodingTable[UTF_16LE_ENC];
1609
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1614
*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1615
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621
#define XML_TOK_NS_C
1622
#include "xmltok_ns.c"
1629
#define NS(x) x ## NS
1630
#define ns(x) x ## _ns
1632
#define XML_TOK_NS_C
1633
#include "xmltok_ns.c"
1640
XmlInitUnknownEncodingNS(void *mem,
1645
ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1647
((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;