/*
 * string.c : an XML string utilities module
 *
 * This module provides various utility functions for manipulating
 * the xmlChar* type. All functions named xmlStr* have been moved here
 * from the parser.c file (their original home).
 *
 * See Copyright for the status of this software.
 *
 * UTF8 string routines from:
 * William Brack <wbrack@mmm.com.hk>
 */
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
/************************************************************************
 *                                                                      *
 *                Commodity functions to handle xmlChars                *
 *                                                                      *
 ************************************************************************/
33
* @cur: the input xmlChar *
34
* @len: the len of @cur
36
* a strndup for array of xmlChar's
38
* Returns a new xmlChar * or NULL
41
xmlStrndup(const xmlChar *cur, int len) {
44
if ((cur == NULL) || (len < 0)) return(NULL);
45
ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
47
xmlErrMemory(NULL, NULL);
50
memcpy(ret, cur, len * sizeof(xmlChar));
57
* @cur: the input xmlChar *
59
* a strdup for array of xmlChar's. Since they are supposed to be
60
* encoded in UTF-8 or an encoding with 8bit based chars, we assume
61
* a termination mark of '0'.
63
* Returns a new xmlChar * or NULL
66
xmlStrdup(const xmlChar *cur) {
67
const xmlChar *p = cur;
69
if (cur == NULL) return(NULL);
70
while (*p != 0) p++; /* non input consuming */
71
return(xmlStrndup(cur, p - cur));
76
* @cur: the input char *
77
* @len: the len of @cur
79
* a strndup for char's to xmlChar's
81
* Returns a new xmlChar * or NULL
85
xmlCharStrndup(const char *cur, int len) {
89
if ((cur == NULL) || (len < 0)) return(NULL);
90
ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
92
xmlErrMemory(NULL, NULL);
95
for (i = 0;i < len;i++) {
96
ret[i] = (xmlChar) cur[i];
97
if (ret[i] == 0) return(ret);
105
* @cur: the input char *
107
* a strdup for char's to xmlChar's
109
* Returns a new xmlChar * or NULL
113
xmlCharStrdup(const char *cur) {
116
if (cur == NULL) return(NULL);
117
while (*p != '\0') p++; /* non input consuming */
118
return(xmlCharStrndup(cur, p - cur));
123
* @str1: the first xmlChar *
124
* @str2: the second xmlChar *
126
* a strcmp for xmlChar's
128
* Returns the integer result of the comparison
132
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
135
if (str1 == str2) return(0);
136
if (str1 == NULL) return(-1);
137
if (str2 == NULL) return(1);
139
tmp = *str1++ - *str2;
140
if (tmp != 0) return(tmp);
141
} while (*str2++ != 0);
147
* @str1: the first xmlChar *
148
* @str2: the second xmlChar *
150
* Check if both strings are equal of have same content.
151
* Should be a bit more readable and faster than xmlStrcmp()
153
* Returns 1 if they are equal, 0 if they are different
157
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158
if (str1 == str2) return(1);
159
if (str1 == NULL) return(0);
160
if (str2 == NULL) return(0);
162
if (*str1++ != *str2) return(0);
169
* @pref: the prefix of the QName
170
* @name: the localname of the QName
171
* @str: the second xmlChar *
173
* Check if a QName is Equal to a given string
175
* Returns 1 if they are equal, 0 if they are different
179
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180
if (pref == NULL) return(xmlStrEqual(name, str));
181
if (name == NULL) return(0);
182
if (str == NULL) return(0);
185
if (*pref++ != *str) return(0);
186
} while ((*str++) && (*pref));
187
if (*str++ != ':') return(0);
189
if (*name++ != *str) return(0);
196
* @str1: the first xmlChar *
197
* @str2: the second xmlChar *
198
* @len: the max comparison length
200
* a strncmp for xmlChar's
202
* Returns the integer result of the comparison
206
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
209
if (len <= 0) return(0);
210
if (str1 == str2) return(0);
211
if (str1 == NULL) return(-1);
212
if (str2 == NULL) return(1);
214
tmp = strncmp((const char *)str1, (const char *)str2, len);
218
tmp = *str1++ - *str2;
219
if (tmp != 0 || --len == 0) return(tmp);
220
} while (*str2++ != 0);
225
static const xmlChar casemap[256] = {
226
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227
0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229
0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231
0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233
0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234
0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235
0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237
0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239
0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241
0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243
0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245
0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246
0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247
0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248
0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249
0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250
0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251
0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252
0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253
0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254
0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255
0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256
0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257
0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
262
* @str1: the first xmlChar *
263
* @str2: the second xmlChar *
265
* a strcasecmp for xmlChar's
267
* Returns the integer result of the comparison
271
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
274
if (str1 == str2) return(0);
275
if (str1 == NULL) return(-1);
276
if (str2 == NULL) return(1);
278
tmp = casemap[*str1++] - casemap[*str2];
279
if (tmp != 0) return(tmp);
280
} while (*str2++ != 0);
286
* @str1: the first xmlChar *
287
* @str2: the second xmlChar *
288
* @len: the max comparison length
290
* a strncasecmp for xmlChar's
292
* Returns the integer result of the comparison
296
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
299
if (len <= 0) return(0);
300
if (str1 == str2) return(0);
301
if (str1 == NULL) return(-1);
302
if (str2 == NULL) return(1);
304
tmp = casemap[*str1++] - casemap[*str2];
305
if (tmp != 0 || --len == 0) return(tmp);
306
} while (*str2++ != 0);
312
* @str: the xmlChar * array
313
* @val: the xmlChar to search
315
* a strchr for xmlChar's
317
* Returns the xmlChar * for the first occurrence or NULL.
321
xmlStrchr(const xmlChar *str, xmlChar val) {
322
if (str == NULL) return(NULL);
323
while (*str != 0) { /* non input consuming */
324
if (*str == val) return((xmlChar *) str);
332
* @str: the xmlChar * array (haystack)
333
* @val: the xmlChar to search (needle)
335
* a strstr for xmlChar's
337
* Returns the xmlChar * for the first occurrence or NULL.
341
xmlStrstr(const xmlChar *str, const xmlChar *val) {
344
if (str == NULL) return(NULL);
345
if (val == NULL) return(NULL);
348
if (n == 0) return(str);
349
while (*str != 0) { /* non input consuming */
351
if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
360
* @str: the xmlChar * array (haystack)
361
* @val: the xmlChar to search (needle)
363
* a case-ignoring strstr for xmlChar's
365
* Returns the xmlChar * for the first occurrence or NULL.
369
xmlStrcasestr(const xmlChar *str, xmlChar *val) {
372
if (str == NULL) return(NULL);
373
if (val == NULL) return(NULL);
376
if (n == 0) return(str);
377
while (*str != 0) { /* non input consuming */
378
if (casemap[*str] == casemap[*val])
379
if (!xmlStrncasecmp(str, val, n)) return(str);
387
* @str: the xmlChar * array (haystack)
388
* @start: the index of the first char (zero based)
389
* @len: the length of the substring
391
* Extract a substring of a given string
393
* Returns the xmlChar * for the first occurrence or NULL.
397
xmlStrsub(const xmlChar *str, int start, int len) {
400
if (str == NULL) return(NULL);
401
if (start < 0) return(NULL);
402
if (len < 0) return(NULL);
404
for (i = 0;i < start;i++) {
405
if (*str == 0) return(NULL);
408
if (*str == 0) return(NULL);
409
return(xmlStrndup(str, len));
414
* @str: the xmlChar * array
416
* length of a xmlChar's string
418
* Returns the number of xmlChar contained in the ARRAY.
422
xmlStrlen(const xmlChar *str) {
425
if (str == NULL) return(0);
426
while (*str != 0) { /* non input consuming */
435
* @cur: the original xmlChar * array
436
* @add: the xmlChar * array added
437
* @len: the length of @add
439
* a strncat for array of xmlChar's, it will extend @cur with the len
440
* first bytes of @add. Note that if @len < 0 then this is an API error
441
* and NULL will be returned.
443
* Returns a new xmlChar *, the original @cur is reallocated if needed
444
* and should not be freed
448
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
452
if ((add == NULL) || (len == 0))
457
return(xmlStrndup(add, len));
459
size = xmlStrlen(cur);
460
ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
462
xmlErrMemory(NULL, NULL);
465
memcpy(&ret[size], add, len * sizeof(xmlChar));
472
* @str1: first xmlChar string
473
* @str2: second xmlChar string
474
* @len: the len of @str2 or < 0
476
* same as xmlStrncat, but creates a new string. The original
477
* two strings are not freed. If @len is < 0 then the length
478
* will be calculated automatically.
480
* Returns a new xmlChar * or NULL
483
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
488
len = xmlStrlen(str2);
489
if ((str2 == NULL) || (len == 0))
490
return(xmlStrdup(str1));
492
return(xmlStrndup(str2, len));
494
size = xmlStrlen(str1);
495
ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
497
xmlErrMemory(NULL, NULL);
498
return(xmlStrndup(str1, size));
500
memcpy(ret, str1, size * sizeof(xmlChar));
501
memcpy(&ret[size], str2, len * sizeof(xmlChar));
508
* @cur: the original xmlChar * array
509
* @add: the xmlChar * array added
511
* a strcat for array of xmlChar's. Since they are supposed to be
512
* encoded in UTF-8 or an encoding with 8bit based chars, we assume
513
* a termination mark of '0'.
515
* Returns a new xmlChar * containing the concatenated string.
518
xmlStrcat(xmlChar *cur, const xmlChar *add) {
519
const xmlChar *p = add;
521
if (add == NULL) return(cur);
523
return(xmlStrdup(add));
525
while (*p != 0) p++; /* non input consuming */
526
return(xmlStrncat(cur, add, p - add));
531
* @buf: the result buffer.
532
* @len: the result buffer length.
533
* @msg: the message with printf formatting.
534
* @...: extra parameters for the message.
536
* Formats @msg and places result into @buf.
538
* Returns the number of characters written to @buf or -1 if an error occurs.
541
xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
545
if((buf == NULL) || (msg == NULL)) {
550
ret = vsnprintf((char *) buf, len, (const char *) msg, args);
552
buf[len - 1] = 0; /* be safe ! */
559
* @buf: the result buffer.
560
* @len: the result buffer length.
561
* @msg: the message with printf formatting.
562
* @ap: extra parameters for the message.
564
* Formats @msg and places result into @buf.
566
* Returns the number of characters written to @buf or -1 if an error occurs.
569
xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
572
if((buf == NULL) || (msg == NULL)) {
576
ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
577
buf[len - 1] = 0; /* be safe ! */
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 *                                                                      *
 ************************************************************************/
600
* @utf: pointer to the UTF8 character
602
* calculates the internal size of a UTF8 character
604
* returns the numbers of bytes in the character, -1 on format error
607
xmlUTF8Size(const xmlChar *utf) {
615
/* check valid UTF8 character */
618
/* determine number of bytes in char */
620
for (mask=0x20; mask != 0; mask>>=1) {
630
* @utf1: pointer to first UTF8 char
631
* @utf2: pointer to second UTF8 char
633
* compares the two UCS4 values
635
* returns result of the compare as with xmlStrncmp
638
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
645
return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
650
* @utf: a sequence of UTF-8 encoded bytes
652
* compute the length of an UTF8 string, it doesn't do a full UTF8
653
* checking of the content of the string.
655
* Returns the number of characters in the string or -1 in case of error
658
xmlUTF8Strlen(const xmlChar *utf) {
666
if ((utf[1] & 0xc0) != 0x80)
668
if ((utf[0] & 0xe0) == 0xe0) {
669
if ((utf[2] & 0xc0) != 0x80)
671
if ((utf[0] & 0xf0) == 0xf0) {
672
if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a continuation byte (10xxxxxx) can never start a character */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 */
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((utf[ix+1] & 0xc0 ) != 0x80)
                return 0;
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            /* || short-circuits, so nothing past a terminator is read */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80))
                    return 0;
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80) ||
                ((utf[ix+3] & 0xc0) != 0x80))
                    return 0;
            ix += 4;
        } else                          /* unknown encoding */
            return 0;
    }
    return(1);
}
814
* @utf: a sequence of UTF-8 encoded bytes
815
* @len: the number of characters in the array
817
* storage size of an UTF8 string
818
* the behaviour is not garanteed if the input string is not UTF-8
820
* Returns the storage size of
821
* the first 'len' characters of ARRAY
825
xmlUTF8Strsize(const xmlChar *utf, int len) {
826
const xmlChar *ptr=utf;
838
if ( (ch = *ptr++) & 0x80)
839
while ((ch<<=1) & 0x80 ) {
841
if (*ptr == 0) break;
850
* @utf: the input UTF8 *
851
* @len: the len of @utf (in chars)
853
* a strndup for array of UTF8's
855
* Returns a new UTF8 * or NULL
858
xmlUTF8Strndup(const xmlChar *utf, int len) {
862
if ((utf == NULL) || (len < 0)) return(NULL);
863
i = xmlUTF8Strsize(utf, len);
864
ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
866
xmlGenericError(xmlGenericErrorContext,
867
"malloc of %ld byte failed\n",
868
(len + 1) * (long)sizeof(xmlChar));
871
memcpy(ret, utf, i * sizeof(xmlChar));
878
* @utf: the input UTF8 *
879
* @pos: the position of the desired UTF8 char (in chars)
881
* a function to provide the equivalent of fetching a
882
* character from a string array
884
* Returns a pointer to the UTF8 character or NULL
887
xmlUTF8Strpos(const xmlChar *utf, int pos) {
890
if (utf == NULL) return(NULL);
894
if ((ch=*utf++) == 0) return(NULL);
896
/* if not simple ascii, verify proper format */
897
if ( (ch & 0xc0) != 0xc0 )
899
/* then skip over remaining bytes for this char */
900
while ( (ch <<= 1) & 0x80 )
901
if ( (*utf++ & 0xc0) != 0x80 )
905
return((xmlChar *)utf);
910
* @utf: the input UTF8 *
911
* @utfchar: the UTF8 character to be found
913
* a function to provide the relative location of a UTF8 char
915
* Returns the relative character position of the desired char
919
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
923
if (utf==NULL || utfchar==NULL) return -1;
924
size = xmlUTF8Strsize(utfchar, 1);
925
for(i=0; (ch=*utf) != 0; i++) {
926
if (xmlStrncmp(utf, utfchar, size)==0)
930
/* if not simple ascii, verify proper format */
931
if ( (ch & 0xc0) != 0xc0 )
933
/* then skip over remaining bytes for this char */
934
while ( (ch <<= 1) & 0x80 )
935
if ( (*utf++ & 0xc0) != 0x80 )
944
* @utf: a sequence of UTF-8 encoded bytes
945
* @start: relative pos of first char
946
* @len: total number to copy
948
* Create a substring from a given UTF-8 string
949
* Note: positions are given in units of UTF-8 chars
951
* Returns a pointer to a newly created string
952
* or NULL if any problem
956
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
960
if (utf == NULL) return(NULL);
961
if (start < 0) return(NULL);
962
if (len < 0) return(NULL);
965
* Skip over any leading chars
967
for (i = 0;i < start;i++) {
968
if ((ch=*utf++) == 0) return(NULL);
970
/* if not simple ascii, verify proper format */
971
if ( (ch & 0xc0) != 0xc0 )
973
/* then skip over remaining bytes for this char */
974
while ( (ch <<= 1) & 0x80 )
975
if ( (*utf++ & 0xc0) != 0x80 )
980
return(xmlUTF8Strndup(utf, len));
/* NOTE(review): the marker is presumably consumed by elfgcchack.h - confirm. */
#define bottom_xmlstring
#include "elfgcchack.h"