~ubuntu-branches/ubuntu/quantal/libxml2/quantal-updates

1 by Daniel Holbach
Import upstream version 2.6.22
1
/*
2
 * string.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home). 
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from:
11
 * William Brack <wbrack@mmm.com.hk>
12
 *
13
 * daniel@veillard.com
14
 */
15
16
#define IN_LIBXML
17
#include "libxml.h"
18
19
#include <stdlib.h>
20
#include <string.h>
21
#include <libxml/xmlmemory.h>
22
#include <libxml/parserInternals.h>
23
#include <libxml/xmlstring.h>
24
25
/************************************************************************
26
 *                                                                      *
27
 *                Commodity functions to handle xmlChars                *
28
 *                                                                      *
29
 ************************************************************************/
30
31
/**
32
 * xmlStrndup:
33
 * @cur:  the input xmlChar *
34
 * @len:  the len of @cur
35
 *
36
 * a strndup for array of xmlChar's
37
 *
38
 * Returns a new xmlChar * or NULL
39
 */
40
xmlChar *
41
xmlStrndup(const xmlChar *cur, int len) {
42
    xmlChar *ret;
43
    
44
    if ((cur == NULL) || (len < 0)) return(NULL);
45
    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46
    if (ret == NULL) {
47
        xmlErrMemory(NULL, NULL);
48
        return(NULL);
49
    }
50
    memcpy(ret, cur, len * sizeof(xmlChar));
51
    ret[len] = 0;
52
    return(ret);
53
}
54
55
/**
56
 * xmlStrdup:
57
 * @cur:  the input xmlChar *
58
 *
59
 * a strdup for array of xmlChar's. Since they are supposed to be
60
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61
 * a termination mark of '0'.
62
 *
63
 * Returns a new xmlChar * or NULL
64
 */
65
xmlChar *
66
xmlStrdup(const xmlChar *cur) {
67
    const xmlChar *p = cur;
68
69
    if (cur == NULL) return(NULL);
70
    while (*p != 0) p++; /* non input consuming */
71
    return(xmlStrndup(cur, p - cur));
72
}
73
74
/**
75
 * xmlCharStrndup:
76
 * @cur:  the input char *
77
 * @len:  the len of @cur
78
 *
79
 * a strndup for char's to xmlChar's
80
 *
81
 * Returns a new xmlChar * or NULL
82
 */
83
84
xmlChar *
85
xmlCharStrndup(const char *cur, int len) {
86
    int i;
87
    xmlChar *ret;
88
    
89
    if ((cur == NULL) || (len < 0)) return(NULL);
90
    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91
    if (ret == NULL) {
92
        xmlErrMemory(NULL, NULL);
93
        return(NULL);
94
    }
95
    for (i = 0;i < len;i++) {
96
        ret[i] = (xmlChar) cur[i];
97
        if (ret[i] == 0) return(ret);
98
    }
99
    ret[len] = 0;
100
    return(ret);
101
}
102
103
/**
104
 * xmlCharStrdup:
105
 * @cur:  the input char *
106
 *
107
 * a strdup for char's to xmlChar's
108
 *
109
 * Returns a new xmlChar * or NULL
110
 */
111
112
xmlChar *
113
xmlCharStrdup(const char *cur) {
114
    const char *p = cur;
115
116
    if (cur == NULL) return(NULL);
117
    while (*p != '\0') p++; /* non input consuming */
118
    return(xmlCharStrndup(cur, p - cur));
119
}
120
121
/**
122
 * xmlStrcmp:
123
 * @str1:  the first xmlChar *
124
 * @str2:  the second xmlChar *
125
 *
126
 * a strcmp for xmlChar's
127
 *
128
 * Returns the integer result of the comparison
129
 */
130
131
int
132
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133
    register int tmp;
134
135
    if (str1 == str2) return(0);
136
    if (str1 == NULL) return(-1);
137
    if (str2 == NULL) return(1);
138
    do {
139
        tmp = *str1++ - *str2;
140
        if (tmp != 0) return(tmp);
141
    } while (*str2++ != 0);
142
    return 0;
143
}
144
145
/**
146
 * xmlStrEqual:
147
 * @str1:  the first xmlChar *
148
 * @str2:  the second xmlChar *
149
 *
1.1.1 by Daniel Holbach
Import upstream version 2.6.23
150
 * Check if both strings are equal of have same content.
151
 * Should be a bit more readable and faster than xmlStrcmp()
1 by Daniel Holbach
Import upstream version 2.6.22
152
 *
153
 * Returns 1 if they are equal, 0 if they are different
154
 */
155
156
int
157
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158
    if (str1 == str2) return(1);
159
    if (str1 == NULL) return(0);
160
    if (str2 == NULL) return(0);
161
    do {
162
        if (*str1++ != *str2) return(0);
163
    } while (*str2++);
164
    return(1);
165
}
166
167
/**
168
 * xmlStrQEqual:
169
 * @pref:  the prefix of the QName
170
 * @name:  the localname of the QName
171
 * @str:  the second xmlChar *
172
 *
173
 * Check if a QName is Equal to a given string 
174
 *
175
 * Returns 1 if they are equal, 0 if they are different
176
 */
177
178
int
179
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180
    if (pref == NULL) return(xmlStrEqual(name, str));
181
    if (name == NULL) return(0);
182
    if (str == NULL) return(0);
183
184
    do {
185
        if (*pref++ != *str) return(0);
186
    } while ((*str++) && (*pref));
187
    if (*str++ != ':') return(0);
188
    do {
189
        if (*name++ != *str) return(0);
190
    } while (*str++);
191
    return(1);
192
}
193
194
/**
195
 * xmlStrncmp:
196
 * @str1:  the first xmlChar *
197
 * @str2:  the second xmlChar *
198
 * @len:  the max comparison length
199
 *
200
 * a strncmp for xmlChar's
201
 *
202
 * Returns the integer result of the comparison
203
 */
204
205
int
206
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207
    register int tmp;
208
209
    if (len <= 0) return(0);
210
    if (str1 == str2) return(0);
211
    if (str1 == NULL) return(-1);
212
    if (str2 == NULL) return(1);
213
#ifdef __GNUC__
214
    tmp = strncmp((const char *)str1, (const char *)str2, len);
215
    return tmp;
216
#else
217
    do {
218
        tmp = *str1++ - *str2;
219
        if (tmp != 0 || --len == 0) return(tmp);
220
    } while (*str2++ != 0);
221
    return 0;
222
#endif
223
}
224
225
/*
 * Case folding table used by the case-insensitive comparisons: the
 * ASCII upper case letters 0x41..0x5A ('A'..'Z') map to their lower
 * case counterparts 0x61..0x7A, every other byte maps to itself.
 * In particular 0x5B ('[') maps to 0x5B, it must not be folded onto
 * 0x7B ('{').
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
259
260
/**
261
 * xmlStrcasecmp:
262
 * @str1:  the first xmlChar *
263
 * @str2:  the second xmlChar *
264
 *
265
 * a strcasecmp for xmlChar's
266
 *
267
 * Returns the integer result of the comparison
268
 */
269
270
int
271
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272
    register int tmp;
273
274
    if (str1 == str2) return(0);
275
    if (str1 == NULL) return(-1);
276
    if (str2 == NULL) return(1);
277
    do {
278
        tmp = casemap[*str1++] - casemap[*str2];
279
        if (tmp != 0) return(tmp);
280
    } while (*str2++ != 0);
281
    return 0;
282
}
283
284
/**
285
 * xmlStrncasecmp:
286
 * @str1:  the first xmlChar *
287
 * @str2:  the second xmlChar *
288
 * @len:  the max comparison length
289
 *
290
 * a strncasecmp for xmlChar's
291
 *
292
 * Returns the integer result of the comparison
293
 */
294
295
int
296
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297
    register int tmp;
298
299
    if (len <= 0) return(0);
300
    if (str1 == str2) return(0);
301
    if (str1 == NULL) return(-1);
302
    if (str2 == NULL) return(1);
303
    do {
304
        tmp = casemap[*str1++] - casemap[*str2];
305
        if (tmp != 0 || --len == 0) return(tmp);
306
    } while (*str2++ != 0);
307
    return 0;
308
}
309
310
/**
311
 * xmlStrchr:
312
 * @str:  the xmlChar * array
313
 * @val:  the xmlChar to search
314
 *
315
 * a strchr for xmlChar's
316
 *
317
 * Returns the xmlChar * for the first occurrence or NULL.
318
 */
319
320
const xmlChar *
321
xmlStrchr(const xmlChar *str, xmlChar val) {
322
    if (str == NULL) return(NULL);
323
    while (*str != 0) { /* non input consuming */
324
        if (*str == val) return((xmlChar *) str);
325
        str++;
326
    }
327
    return(NULL);
328
}
329
330
/**
331
 * xmlStrstr:
332
 * @str:  the xmlChar * array (haystack)
333
 * @val:  the xmlChar to search (needle)
334
 *
335
 * a strstr for xmlChar's
336
 *
337
 * Returns the xmlChar * for the first occurrence or NULL.
338
 */
339
340
const xmlChar *
341
xmlStrstr(const xmlChar *str, const xmlChar *val) {
342
    int n;
343
    
344
    if (str == NULL) return(NULL);
345
    if (val == NULL) return(NULL);
346
    n = xmlStrlen(val);
347
348
    if (n == 0) return(str);
349
    while (*str != 0) { /* non input consuming */
350
        if (*str == *val) {
351
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352
        }
353
        str++;
354
    }
355
    return(NULL);
356
}
357
358
/**
359
 * xmlStrcasestr:
360
 * @str:  the xmlChar * array (haystack)
361
 * @val:  the xmlChar to search (needle)
362
 *
363
 * a case-ignoring strstr for xmlChar's
364
 *
365
 * Returns the xmlChar * for the first occurrence or NULL.
366
 */
367
368
const xmlChar *
1.1.11 by Mike Hommey
Import upstream version 2.7.4.dfsg
369
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
1 by Daniel Holbach
Import upstream version 2.6.22
370
    int n;
371
    
372
    if (str == NULL) return(NULL);
373
    if (val == NULL) return(NULL);
374
    n = xmlStrlen(val);
375
376
    if (n == 0) return(str);
377
    while (*str != 0) { /* non input consuming */
378
        if (casemap[*str] == casemap[*val])
379
            if (!xmlStrncasecmp(str, val, n)) return(str);
380
        str++;
381
    }
382
    return(NULL);
383
}
384
385
/**
386
 * xmlStrsub:
387
 * @str:  the xmlChar * array (haystack)
388
 * @start:  the index of the first char (zero based)
389
 * @len:  the length of the substring
390
 *
391
 * Extract a substring of a given string
392
 *
393
 * Returns the xmlChar * for the first occurrence or NULL.
394
 */
395
396
xmlChar *
397
xmlStrsub(const xmlChar *str, int start, int len) {
398
    int i;
399
    
400
    if (str == NULL) return(NULL);
401
    if (start < 0) return(NULL);
402
    if (len < 0) return(NULL);
403
404
    for (i = 0;i < start;i++) {
405
        if (*str == 0) return(NULL);
406
        str++;
407
    }
408
    if (*str == 0) return(NULL);
409
    return(xmlStrndup(str, len));
410
}
411
412
/**
413
 * xmlStrlen:
414
 * @str:  the xmlChar * array
415
 *
416
 * length of a xmlChar's string
417
 *
418
 * Returns the number of xmlChar contained in the ARRAY.
419
 */
420
421
int
422
xmlStrlen(const xmlChar *str) {
423
    int len = 0;
424
425
    if (str == NULL) return(0);
426
    while (*str != 0) { /* non input consuming */
427
        str++;
428
        len++;
429
    }
430
    return(len);
431
}
432
433
/**
434
 * xmlStrncat:
435
 * @cur:  the original xmlChar * array
436
 * @add:  the xmlChar * array added
437
 * @len:  the length of @add
438
 *
439
 * a strncat for array of xmlChar's, it will extend @cur with the len
1.1.1 by Daniel Holbach
Import upstream version 2.6.23
440
 * first bytes of @add. Note that if @len < 0 then this is an API error
441
 * and NULL will be returned.
1 by Daniel Holbach
Import upstream version 2.6.22
442
 *
443
 * Returns a new xmlChar *, the original @cur is reallocated if needed
444
 * and should not be freed
445
 */
446
447
xmlChar *
448
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449
    int size;
450
    xmlChar *ret;
451
452
    if ((add == NULL) || (len == 0))
453
        return(cur);
1.1.1 by Daniel Holbach
Import upstream version 2.6.23
454
    if (len < 0)
455
	return(NULL);
1 by Daniel Holbach
Import upstream version 2.6.22
456
    if (cur == NULL)
457
        return(xmlStrndup(add, len));
458
459
    size = xmlStrlen(cur);
460
    ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
461
    if (ret == NULL) {
462
        xmlErrMemory(NULL, NULL);
463
        return(cur);
464
    }
465
    memcpy(&ret[size], add, len * sizeof(xmlChar));
466
    ret[size + len] = 0;
467
    return(ret);
468
}
469
470
/**
471
 * xmlStrncatNew:
472
 * @str1:  first xmlChar string
473
 * @str2:  second xmlChar string
1.1.1 by Daniel Holbach
Import upstream version 2.6.23
474
 * @len:  the len of @str2 or < 0
1 by Daniel Holbach
Import upstream version 2.6.22
475
 *
476
 * same as xmlStrncat, but creates a new string.  The original
1.1.1 by Daniel Holbach
Import upstream version 2.6.23
477
 * two strings are not freed. If @len is < 0 then the length
478
 * will be calculated automatically.
1 by Daniel Holbach
Import upstream version 2.6.22
479
 *
480
 * Returns a new xmlChar * or NULL
481
 */
482
xmlChar *
483
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
484
    int size;
485
    xmlChar *ret;
486
487
    if (len < 0)
488
        len = xmlStrlen(str2);
489
    if ((str2 == NULL) || (len == 0))
490
        return(xmlStrdup(str1));
491
    if (str1 == NULL)
492
        return(xmlStrndup(str2, len));
493
494
    size = xmlStrlen(str1);
495
    ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
496
    if (ret == NULL) {
497
        xmlErrMemory(NULL, NULL);
498
        return(xmlStrndup(str1, size));
499
    }
500
    memcpy(ret, str1, size * sizeof(xmlChar));
501
    memcpy(&ret[size], str2, len * sizeof(xmlChar));
502
    ret[size + len] = 0;
503
    return(ret);
504
}
505
506
/**
507
 * xmlStrcat:
508
 * @cur:  the original xmlChar * array
509
 * @add:  the xmlChar * array added
510
 *
511
 * a strcat for array of xmlChar's. Since they are supposed to be
512
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
513
 * a termination mark of '0'.
514
 *
515
 * Returns a new xmlChar * containing the concatenated string.
516
 */
517
xmlChar *
518
xmlStrcat(xmlChar *cur, const xmlChar *add) {
519
    const xmlChar *p = add;
520
521
    if (add == NULL) return(cur);
522
    if (cur == NULL) 
523
        return(xmlStrdup(add));
524
525
    while (*p != 0) p++; /* non input consuming */
526
    return(xmlStrncat(cur, add, p - add));
527
}
528
529
/**
530
 * xmlStrPrintf:
531
 * @buf:   the result buffer.
532
 * @len:   the result buffer length.
533
 * @msg:   the message with printf formatting.
534
 * @...:   extra parameters for the message.
535
 *
536
 * Formats @msg and places result into @buf.
537
 *
538
 * Returns the number of characters written to @buf or -1 if an error occurs.
539
 */
540
int XMLCDECL 
541
xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
542
    va_list args;
543
    int ret;
544
    
545
    if((buf == NULL) || (msg == NULL)) {
546
        return(-1);
547
    }
548
    
549
    va_start(args, msg);
550
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
551
    va_end(args);
552
    buf[len - 1] = 0; /* be safe ! */
553
    
554
    return(ret);
555
}
556
557
/**
558
 * xmlStrVPrintf:
559
 * @buf:   the result buffer.
560
 * @len:   the result buffer length.
561
 * @msg:   the message with printf formatting.
562
 * @ap:    extra parameters for the message.
563
 *
564
 * Formats @msg and places result into @buf.
565
 *
566
 * Returns the number of characters written to @buf or -1 if an error occurs.
567
 */
568
int 
569
xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
570
    int ret;
571
    
572
    if((buf == NULL) || (msg == NULL)) {
573
        return(-1);
574
    }
575
    
576
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
577
    buf[len - 1] = 0; /* be safe ! */
578
    
579
    return(ret);
580
}
581
582
/************************************************************************
583
 *                                                                      *
584
 *              Generic UTF8 handling routines                          *
585
 *                                                                      *
586
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
587
 *                                                                      *
588
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
589
 * 0000 0000-0000 007F   0xxxxxxx                                       *
590
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
591
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
592
 *                                                                      *
593
 * I hope we won't use values > 0xFFFF anytime soon !                   *
594
 *                                                                      *
595
 ************************************************************************/
596
597
598
/**
599
 * xmlUTF8Size:
600
 * @utf: pointer to the UTF8 character
601
 *
602
 * calculates the internal size of a UTF8 character
603
 *
604
 * returns the numbers of bytes in the character, -1 on format error
605
 */
606
int
607
xmlUTF8Size(const xmlChar *utf) {
608
    xmlChar mask;
609
    int len;
610
611
    if (utf == NULL)
612
        return -1;
613
    if (*utf < 0x80)
614
        return 1;
615
    /* check valid UTF8 character */
616
    if (!(*utf & 0x40))
617
        return -1;
618
    /* determine number of bytes in char */
619
    len = 2;
620
    for (mask=0x20; mask != 0; mask>>=1) {
621
        if (!(*utf & mask))
622
            return len;
623
        len++;
624
    }
625
    return -1;
626
}
627
628
/**
629
 * xmlUTF8Charcmp:
630
 * @utf1: pointer to first UTF8 char
631
 * @utf2: pointer to second UTF8 char
632
 *
633
 * compares the two UCS4 values
634
 *
635
 * returns result of the compare as with xmlStrncmp
636
 */
637
int
638
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
639
640
    if (utf1 == NULL ) {
641
        if (utf2 == NULL)
642
            return 0;
643
        return -1;
644
    }
645
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
646
}
647
648
/**
649
 * xmlUTF8Strlen:
650
 * @utf:  a sequence of UTF-8 encoded bytes
651
 *
652
 * compute the length of an UTF8 string, it doesn't do a full UTF8
653
 * checking of the content of the string.
654
 *
655
 * Returns the number of characters in the string or -1 in case of error
656
 */
657
int
658
xmlUTF8Strlen(const xmlChar *utf) {
659
    int ret = 0;
660
661
    if (utf == NULL)
662
        return(-1);
663
664
    while (*utf != 0) {
665
        if (utf[0] & 0x80) {
666
            if ((utf[1] & 0xc0) != 0x80)
667
                return(-1);
668
            if ((utf[0] & 0xe0) == 0xe0) {
669
                if ((utf[2] & 0xc0) != 0x80)
670
                    return(-1);
671
                if ((utf[0] & 0xf0) == 0xf0) {
672
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
673
                        return(-1);
674
                    utf += 4;
675
                } else {
676
                    utf += 3;
677
                }
678
            } else {
679
                utf += 2;
680
            }
681
        } else {
682
            utf++;
683
        }
684
        ret++;
685
    }
686
    return(ret);
687
}
688
689
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. A first byte of the form
 * 10xxxxxx (a continuation byte) is rejected since it cannot start a
 * character. Over-long encodings are not detected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a multi-byte character has to start with 11xxxxxx */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* first byte must be exactly 11110xxx */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
758
759
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cursor;
    unsigned char lead;
    int trailing;

    if (utf == NULL)
        return(0);

    /*
     * Each character is a first byte followed by 0 to 3 continuation
     * bytes (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cursor = utf;
    while ((lead = *cursor) != 0) {     /* string is 0-terminated */
        cursor++;
        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                            /* unknown encoding */
            return(0);

        /* a terminator fails this test, so the scan stays in bounds */
        for (; trailing > 0; trailing--) {
            if ((*cursor & 0xc0) != 0x80)
                return(0);
            cursor++;
        }
    }
    return(1);
}
811
812
/**
813
 * xmlUTF8Strsize:
814
 * @utf:  a sequence of UTF-8 encoded bytes
815
 * @len:  the number of characters in the array
816
 *
817
 * storage size of an UTF8 string
818
 * the behaviour is not garanteed if the input string is not UTF-8
819
 *
820
 * Returns the storage size of
821
 * the first 'len' characters of ARRAY
822
 */
823
824
int
825
xmlUTF8Strsize(const xmlChar *utf, int len) {
826
    const xmlChar   *ptr=utf;
827
    xmlChar         ch;
828
829
    if (utf == NULL)
830
        return(0);
831
832
    if (len <= 0)
833
        return(0);
834
835
    while ( len-- > 0) {
836
        if ( !*ptr )
837
            break;
838
        if ( (ch = *ptr++) & 0x80)
839
            while ((ch<<=1) & 0x80 ) {
840
                ptr++;
841
		if (*ptr == 0) break;
842
	    }
843
    }
844
    return (ptr - utf);
845
}
846
847
848
/**
849
 * xmlUTF8Strndup:
850
 * @utf:  the input UTF8 *
851
 * @len:  the len of @utf (in chars)
852
 *
853
 * a strndup for array of UTF8's
854
 *
855
 * Returns a new UTF8 * or NULL
856
 */
857
xmlChar *
858
xmlUTF8Strndup(const xmlChar *utf, int len) {
859
    xmlChar *ret;
860
    int i;
861
    
862
    if ((utf == NULL) || (len < 0)) return(NULL);
863
    i = xmlUTF8Strsize(utf, len);
864
    ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
865
    if (ret == NULL) {
866
        xmlGenericError(xmlGenericErrorContext,
867
                "malloc of %ld byte failed\n",
868
                (len + 1) * (long)sizeof(xmlChar));
869
        return(NULL);
870
    }
871
    memcpy(ret, utf, i * sizeof(xmlChar));
872
    ret[i] = 0;
873
    return(ret);
874
}
875
876
/**
877
 * xmlUTF8Strpos:
878
 * @utf:  the input UTF8 *
879
 * @pos:  the position of the desired UTF8 char (in chars)
880
 *
881
 * a function to provide the equivalent of fetching a
882
 * character from a string array
883
 *
884
 * Returns a pointer to the UTF8 character or NULL
885
 */
886
const xmlChar *
887
xmlUTF8Strpos(const xmlChar *utf, int pos) {
888
    xmlChar ch;
889
890
    if (utf == NULL) return(NULL);
891
    if (pos < 0)
892
        return(NULL);
893
    while (pos--) {
894
        if ((ch=*utf++) == 0) return(NULL);
895
        if ( ch & 0x80 ) {
896
            /* if not simple ascii, verify proper format */
897
            if ( (ch & 0xc0) != 0xc0 )
898
                return(NULL);
899
            /* then skip over remaining bytes for this char */
900
            while ( (ch <<= 1) & 0x80 )
901
                if ( (*utf++ & 0xc0) != 0x80 )
902
                    return(NULL);
903
        }
904
    }
905
    return((xmlChar *)utf);
906
}
907
908
/**
909
 * xmlUTF8Strloc:
910
 * @utf:  the input UTF8 *
911
 * @utfchar:  the UTF8 character to be found
912
 *
913
 * a function to provide the relative location of a UTF8 char
914
 *
915
 * Returns the relative character position of the desired char
916
 * or -1 if not found
917
 */
918
int
919
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
920
    int i, size;
921
    xmlChar ch;
922
923
    if (utf==NULL || utfchar==NULL) return -1;
924
    size = xmlUTF8Strsize(utfchar, 1);
925
        for(i=0; (ch=*utf) != 0; i++) {
926
            if (xmlStrncmp(utf, utfchar, size)==0)
927
                return(i);
928
            utf++;
929
            if ( ch & 0x80 ) {
930
                /* if not simple ascii, verify proper format */
931
                if ( (ch & 0xc0) != 0xc0 )
932
                    return(-1);
933
                /* then skip over remaining bytes for this char */
934
                while ( (ch <<= 1) & 0x80 )
935
                    if ( (*utf++ & 0xc0) != 0x80 )
936
                        return(-1);
937
            }
938
        }
939
940
    return(-1);
941
}
942
/**
943
 * xmlUTF8Strsub:
944
 * @utf:  a sequence of UTF-8 encoded bytes
945
 * @start: relative pos of first char
946
 * @len:   total number to copy
947
 *
948
 * Create a substring from a given UTF-8 string
949
 * Note:  positions are given in units of UTF-8 chars
950
 *
951
 * Returns a pointer to a newly created string
952
 * or NULL if any problem
953
 */
954
955
xmlChar *
956
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
957
    int            i;
958
    xmlChar ch;
959
960
    if (utf == NULL) return(NULL);
961
    if (start < 0) return(NULL);
962
    if (len < 0) return(NULL);
963
964
    /*
965
     * Skip over any leading chars
966
     */
967
    for (i = 0;i < start;i++) {
968
        if ((ch=*utf++) == 0) return(NULL);
969
        if ( ch & 0x80 ) {
970
            /* if not simple ascii, verify proper format */
971
            if ( (ch & 0xc0) != 0xc0 )
972
                return(NULL);
973
            /* then skip over remaining bytes for this char */
974
            while ( (ch <<= 1) & 0x80 )
975
                if ( (*utf++ & 0xc0) != 0x80 )
976
                    return(NULL);
977
        }
978
    }
979
980
    return(xmlUTF8Strndup(utf, len));
981
}
982
983
#define bottom_xmlstring
984
#include "elfgcchack.h"