133
// Lookup table with one entry per possible byte value (0..255).
// UTF8BytesOfLeadInitialise() stores BytesFromLead(i) into entry i.
int UTF8BytesOfLead[256];
134
// Guard flag for UTF8BytesOfLeadInitialise(): the table is filled only while
// this is false, and it is set to true once the table has been filled.
static bool initialisedBytesOfLead = false;
136
static int BytesFromLead(int leadByte) {
137
if (leadByte < 0xC2) {
138
// Single byte or invalid
140
} else if (leadByte < 0xE0) {
142
} else if (leadByte < 0xF0) {
144
} else if (leadByte < 0xF5) {
147
// Characters longer than 4 bytes not possible in current UTF-8
152
void UTF8BytesOfLeadInitialise() {
153
if (!initialisedBytesOfLead) {
154
for (int i=0;i<256;i++) {
155
UTF8BytesOfLead[i] = BytesFromLead(i);
157
initialisedBytesOfLead = true;
161
// Return both the width of the first character in the string and a status
162
// saying whether it is valid or invalid.
163
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
164
// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
165
// reasonably treated as code points in some circumstances. They will, however,
166
// not have associated glyphs.
167
int UTF8Classify(const unsigned char *us, int len) {
168
// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
172
} else if (*us > 0xf4) {
173
// Characters longer than 4 bytes not possible in current UTF-8
174
return UTF8MaskInvalid | 1;
175
} else if (*us >= 0xf0) {
178
return UTF8MaskInvalid | 1;
179
if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
180
if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
181
// *FFFE or *FFFF non-character
182
return UTF8MaskInvalid | 4;
185
// Check if encoding a value beyond the last Unicode character 10FFFF
187
return UTF8MaskInvalid | 1;
188
} else if (us[1] == 0x8f) {
190
return UTF8MaskInvalid | 1;
191
} else if (us[2] == 0xbf) {
193
return UTF8MaskInvalid | 1;
197
} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
199
return UTF8MaskInvalid | 1;
203
return UTF8MaskInvalid | 1;
205
} else if (*us >= 0xe0) {
208
return UTF8MaskInvalid | 1;
209
if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
210
if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
212
return UTF8MaskInvalid | 1;
214
if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
216
return UTF8MaskInvalid | 1;
218
if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
219
// U+FFFE non-character - 3 bytes long
220
return UTF8MaskInvalid | 3;
222
if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
223
// U+FFFF non-character - 3 bytes long
224
return UTF8MaskInvalid | 3;
226
if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
228
return UTF8MaskInvalid | 3;
232
return UTF8MaskInvalid | 1;
234
} else if (*us >= 0xc2) {
237
return UTF8MaskInvalid | 1;
238
if (UTF8IsTrailByte(us[1])) {
241
return UTF8MaskInvalid | 1;
244
// 0xc0 .. 0xc1 is overlong encoding
245
// 0x80 .. 0xbf is trail byte
246
return UTF8MaskInvalid | 1;