/******************************************************************************
* DESCRIPTION: Properties of characters and strings
* COPYRIGHT : (C) 1999 Joris van der Hoeven
*******************************************************************************
* This software falls under the GNU general public license and comes WITHOUT
* ANY WARRANTY WHATSOEVER. See the file $TEXMACS_PATH/LICENSE for more details.
* If you don't have this file, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
******************************************************************************/
15
/******************************************************************************
17
******************************************************************************/
/* Tests whether c is an ASCII letter ('a'..'z' or 'A'..'Z'). */
bool
is_alpha (char c) {
  if ((c>='a') && (c<='z')) return true;
  return (c>='A') && (c<='Z');
}
/* Tests whether c is a letter in the 8-bit sense used by this file:
   an ASCII letter, or any byte value >= 128 other than 159, 189, 190, 191. */
bool
is_iso_alpha (char c) {
  /* go through unsigned char so that bytes >= 128 are not seen as negative */
  int code= (int) ((unsigned char) c);
  if (((c>='a') && (c<='z')) || ((c>='A') && (c<='Z'))) return true;
  return
    (code >= 128) && (code != 159) &&
    (code != 189) && (code != 190) && (code != 191);
}
/* Tests whether c is a lower-case letter: 'a'..'z', a byte in 160..188,
   or a byte in 224..255.
   NOTE(review): the 224..255 range is the counterpart (offset +32) of the
   upper-case range 192..223 accepted by is_upcase; confirm against the
   character table in use. */
bool
is_locase (char c) {
  int code= (int) ((unsigned char) c);
  return
    ((c>='a') && (c<='z')) ||
    ((code >= 160) && (code < 189)) ||
    (code >= 224);
}
/* Tests whether c is an upper-case letter: 'A'..'Z', a byte in 128..158,
   or a byte in 192..223. */
bool
is_upcase (char c) {
  int code= (int) ((unsigned char) c);
  if ((c>='A') && (c<='Z')) return true;
  if ((code >= 128) && (code < 159)) return true;
  return (code >= 192) && (code < 224);
}
/* Tests whether c is a decimal digit '0'..'9'. */
bool
is_digit (char c) {
  return (c>='0') && (c<='9');
}
/* Tests whether c may occur in an unsigned decimal number:
   a digit '0'..'9' or the decimal point '.'. */
bool
is_numeric (char c) {
  if (c=='.') return true;
  return (c>='0') && (c<='9');
}
/* Tests whether c is one of the punctuation characters
   . , : ' ` ; ! ?                                        */
bool
is_ponctuation (char c) {
  switch (c) {
  case '.': case ',': case ':': case '\'':
  case '`': case ';': case '!': case '?':
    return true;
  default:
    return false;
  }
}
/* Tests whether c is white space: blank, horizontal tab, line feed or
   carriage return. */
bool
is_space (char c) {
  return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r');
}
73
/******************************************************************************
75
******************************************************************************/
80
if (N(s)==0) return false;
81
for (i=0; i<N(s); i++)
82
if (!is_alpha (s[i])) return false;
87
is_iso_alpha (string s) {
89
if (N(s)==0) return false;
90
for (i=0; i<N(s); i++)
91
if (!is_iso_alpha (s[i])) return false;
96
is_numeric (string s) {
98
if (N(s)==0) return false;
99
for (i=0; i<N(s); i++)
100
if (!is_numeric (s[i])) return false;
104
/******************************************************************************
106
******************************************************************************/
109
upcase_first (string s) {
110
if ((N(s)==0) || (!is_locase (s[0]))) return s;
111
return string ((char) (((int) ((unsigned char) s[0]))-32)) * s (1, N(s));
115
locase_first (string s) {
116
if ((N(s)==0) || (!is_upcase (s[0]))) return s;
117
return string ((char) (((int) ((unsigned char) s[0]))+32)) * s (1, N(s));
121
upcase_all (string s) {
124
for (i=0; i<N(s); i++)
125
if (!is_locase (s[i])) r[i]= s[i];
126
else r[i]= (char) (((int) ((unsigned char) s[i]))-32);
131
locase_all (string s) {
134
for (i=0; i<N(s); i++)
135
if (!is_upcase (s[i])) r[i]= s[i];
136
else r[i]= (char) (((int) ((unsigned char) s[i]))+32);
140
/******************************************************************************
141
* Spanish in relation with ispell
142
******************************************************************************/
145
ispanish_to_spanish (string s) {
149
if ((s[i] == '\'') && ((i+1)<n)) {
151
case 'A': r << 'ļæ½'; break;
152
case 'E': r << 'ļæ½'; break;
153
case 'I': r << 'ļæ½'; break;
154
case 'N': r << 'ļæ½'; break;
155
case 'O': r << 'ļæ½'; break;
156
case 'U': r << 'ļæ½'; break;
157
case 'Y': r << 'ļæ½'; break;
158
case 'a': r << 'ļæ½'; break;
159
case 'e': r << 'ļæ½'; break;
160
case 'i': r << 'ļæ½'; break;
161
case 'n': r << 'ļæ½'; break;
162
case 'o': r << 'ļæ½'; break;
163
case 'u': r << 'ļæ½'; break;
164
case 'y': r << 'ļæ½'; break;
165
default : r << '\'' << s[i+1];
174
spanish_to_ispanish (string s) {
179
case 'ļæ½': r << "'A"; break;
180
case 'ļæ½': r << "'E"; break;
181
case 'ļæ½': r << "'I"; break;
182
case 'ļæ½': r << "'N"; break;
183
case 'ļæ½': r << "'O"; break;
184
case 'ļæ½': r << "'U"; break;
185
case 'ļæ½': r << "'Y"; break;
186
case 'ļæ½': r << "'a"; break;
187
case 'ļæ½': r << "'e"; break;
188
case 'ļæ½': r << "'i"; break;
189
case 'ļæ½': r << "'n"; break;
190
case 'ļæ½': r << "'o"; break;
191
case 'ļæ½': r << "'u"; break;
192
case 'ļæ½': r << "'y"; break;
199
igerman_to_german (string s) {
203
if (s[i] == 'ļæ½') r << 'ļæ½';
209
german_to_igerman (string s) {
213
if (s[i] == 'ļæ½') r << 'ļæ½';
218
/******************************************************************************
219
* Iso latin 2 encoding for polish and czech
220
******************************************************************************/
222
static string il2_to_cork_string=
223
"ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ ļæ½ļæ½ ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ĀÄļæ½ĒÉĖ
ļæ½ĪŠļæ½ļæ½Ōļæ½.ļæ½ļæ½Śļæ½Żļæ½ļæ½ļæ½ļæ½äØ¢ļæ½ļæ½ļæ½ļæ½ī¤ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½/ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ ";
224
static string cork_to_il2_string=
225
"Ć”ļæ½ļæ½ļæ½ļæ½ļæ½GÅ„ļæ½ļæ½ļæ½ ļæ½ļæ½Ų¦ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½Yļæ½ļæ½ļæ½IIļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½gåµ³ļæ½ļæ½ ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½yļæ½ļæ½ļæ½i!?LAļæ½ļæ½Aļæ½AAļæ½Eļæ½Eļæ½Iļæ½ļæ½Iļæ½NOļæ½ļæ½Oļæ½OOUļæ½Uļæ½ļæ½ Saļæ½ļæ½aļæ½aaļæ½eļæ½eļæ½iļæ½ļæ½iļæ½noļæ½ļæ½oļæ½oouļæ½uļæ½ļæ½ ļæ½";
228
il2_to_cork (char c) {
229
int i= (int) ((unsigned char) c);
231
return il2_to_cork_string [i-128];
235
cork_to_il2 (char c) {
236
int i= (int) ((unsigned char) c);
238
return cork_to_il2_string [i-128];
242
il2_to_cork (string s) {
246
r[i]= il2_to_cork (s[i]);
251
cork_to_il2 (string s) {
255
r[i]= cork_to_il2 (s[i]);
259
/******************************************************************************
260
* Koi8 encoding for russian
261
******************************************************************************/
263
static string koi8_to_iso_string=
264
"ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½";
265
static string iso_to_koi8_string=
266
"ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½ļæ½";
269
koi8_to_iso (char c, bool ukrainian) {
270
int i= (int) ((unsigned char) c);
271
if (i==156) return 'ļæ½';
272
if (i==188) return 'ļæ½';
277
case 'I':return 'ļæ½';
278
case 'ļæ½':return 'ļæ½';
279
case 'ļæ½':return 'ļæ½';
280
case 'ļæ½':return 'ļæ½';
281
case 'i':return 'ļæ½';
282
case 'ļæ½':return 'ļæ½';
283
case 'ļæ½':return 'ļæ½';
284
case 'ļæ½':return 'ļæ½';
288
return koi8_to_iso_string [i-192];
292
iso_to_koi8 (char c, bool ukrainian) {
293
int i= (int) ((unsigned char) c);
294
if (c=='ļæ½') return (char) 156;
295
if (c=='ļæ½') return (char) 188;
300
case 'ļæ½':return 'I';
301
case 'ļæ½':return 'ļæ½';
302
case 'ļæ½':return 'ļæ½';
303
case 'ļæ½':return 'ļæ½';
304
case 'ļæ½':return 'i';
305
case 'ļæ½':return 'ļæ½';
306
case 'ļæ½':return 'ļæ½';
307
case 'ļæ½':return 'ļæ½';
311
return iso_to_koi8_string [i-192];
315
koi8_to_iso (string s) {
319
r[i]= koi8_to_iso (s[i], false);
324
iso_to_koi8 (string s) {
328
r[i]= iso_to_koi8 (s[i], false);
333
koi8uk_to_iso (string s) {
337
r[i]= koi8_to_iso (s[i], true);
342
iso_to_koi8uk (string s) {
346
r[i]= iso_to_koi8 (s[i], true);
350
/******************************************************************************
351
* Convert between TeXmacs and XML strings
352
******************************************************************************/
355
is_xml_name (char c) {
357
is_alpha (c) || is_numeric (c) ||
358
(c == '.') || (c == '-') || (c == ':');
362
tm_to_xml_name (string s) {
366
if (is_xml_name (s[i])) r << s[i];
367
else r << "_" << as_string ((int) ((unsigned char) s[i])) << "_";
372
xml_name_to_tm (string s) {
376
if (s[i] != '_') r << s[i];
379
while ((i<n) && (s[i]!='_')) i++;
380
r << (char) ((unsigned char) as_int (s (start, i)));
386
tm_to_xml_cdata (string s) {
390
if (s[i] == '&') r << "&";
391
else if (s[i] == '>') r << ">";
392
else if (s[i] != '<') r << s[i];
395
while ((i<n) && (s[i]!='>')) i++;
396
r << "&" << tm_to_xml_name (s (start, i)) << ";";
402
xml_cdata_to_tm (string s) {
406
if (s[i] == '<') r << "<less>";
407
else if (s[i] == '>') r << "<gtr>";
408
else if (s[i] != '&') r << s[i];
411
while ((i<n) && (s[i]!=';')) i++;
412
string x= "<" * xml_name_to_tm (s (start, i)) * ">";
413
if (x == "<amp>") r << "&";
420
xml_unspace (string s, bool first, bool last) {
423
if (first) while ((i<n) && is_space (s[i])) i++;
425
if (!is_space (s[i])) r << s[i++];
427
while ((i<n) && is_space (s[i])) i++;
428
if ((i<n) || (!last)) r << ' ';
433
/******************************************************************************
434
* Roman and alpha numbers
435
******************************************************************************/
437
static string ones[10]= {
438
"", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" };
439
static string tens[10]= {
440
"", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" };
441
static string hundreds[10]= {
442
"", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" };
446
if (nr<0) return "-" * roman_nr (nr);
447
if (nr==0) return "o";
448
if (nr>1000) return "m" * roman_nr (nr-1000);
449
if (nr==1000) return "m";
450
if (nr==999) return "im";
451
if (nr==499) return "id";
452
if ((nr%100)==99) return hundreds[nr/100] * "ic";
453
if ((nr%100)==49) return hundreds[nr/100] * "il";
454
return hundreds[nr/100] * tens[(nr%100)/10] * ones[nr%10];
459
return upcase_all (roman_nr (nr));
464
if (nr<0) return "-" * alpha_nr (nr);
465
if (nr==0) return "0";
466
if (nr<=26) return string ((char) (((int) 'a')+ nr-1));
467
return alpha_nr ((nr-1)/26) * alpha_nr (((nr-1)%26)+1);
472
return upcase_all (alpha_nr (nr));
475
/******************************************************************************
476
* Conversions to and from hexadecimal
477
******************************************************************************/
/* Upper-case hexadecimal digits, indexed by their value 0..15.
   Declared const: a string literal must not be reached through char*. */
static const char* hex_string= "0123456789ABCDEF";
482
as_hexadecimal (int i) {
483
if (i<0) return "-" * as_hexadecimal (-i);
484
if (i<16) return hex_string [i & 15];
485
return as_hexadecimal (i >> 4) * hex_string [i & 15];
489
as_hexadecimal (int i, int len) {
490
if (len==1) return hex_string [i & 15];
491
else return as_hexadecimal (i >> 4, len-1) * hex_string [i & 15];
495
from_hexadecimal (string s) {
496
int i, n= N(s), res= 0;
497
if ((n>0) && (s[0]=='-'))
498
return -from_hexadecimal (s (1, n));
499
for (i=0; i<n; i++) {
501
if ((s[i] >= '0') && (s[i] <= '9')) res += (int) (s[i] - '0');
502
if ((s[i] >= 'A') && (s[i] <= 'F')) res += (int) (s[i] + 10 - 'A');
503
if ((s[i] >= 'a') && (s[i] <= 'f')) res += (int) (s[i] + 10 - 'a');
508
/******************************************************************************
509
* Convert between verbatim and TeXmacs encoding
510
******************************************************************************/
513
tm_encode (string s) {
516
for (i=0; i<N(s); i++) {
517
if (s[i]=='<') r << "<less>";
518
else if (s[i]=='>') r << "<gtr>";
525
tm_decode (string s) {
528
for (i=0; i<N(s); i++) {
531
for (j=i+1; j<N(s); j++)
532
if (s[j]=='>') break;
534
if (s(i,j) == "<less>") r << "<";
535
else if (s(i,j) == "<gtr>") r << ">";
537
if (s[i]!='>') return r;
539
else if (s[i]!='>') r << s[i];
545
tm_correct (string s) {
548
for (i=0; i<N(s); i++) {
550
register bool flag= true;
552
for (j=i+1; j<N(s); j++)
553
if (s[j]=='>') break;
554
if (j==N(s)) return r;
555
for (k=i+1; k<j; k++)
556
if (s[k]=='<') flag= false;
557
if (flag) r << s(i,j+1);
560
else if (s[i]!='>') r << s[i];
565
/******************************************************************************
566
* Handling escape characters
567
******************************************************************************/
570
escape_quotes (string s) {
573
for (i=0; i<n; i++) {
574
if ((s[i] == '\\') || (s[i] == '\"')) r << '\\';
581
escape_generic (string s) {
584
for (i=0; i<n; i++) {
585
if ((s[i] == '\2') || (s[i] == '\5') || (s[i] == '\33')) r << '\33';
592
escape_verbatim (string s) {
595
for (i=0; i<n; i++) {
596
unsigned char c= (unsigned char) s[i];
597
if ((c == '\n') || (c == '\t')) r << ' ';
598
else if (((int) c) >= 32) r << s[i];
604
dos_to_better (string s) {
613
/******************************************************************************
614
* Reading input from a string
615
******************************************************************************/
618
test (string s, int i, const char* test) {
620
while (test[j]!='\0') {
621
if (i>=n) return false;
622
if (s[i]!=test[j]) return false;
629
test (string s, int i, string test) {
630
int n= N(s), m= N(test), j=0;
632
if (i>=n) return false;
633
if (s[i]!=test[j]) return false;
640
starts (string s, const char* what) {
641
return test (s, 0, what);
645
starts (string s, const string what) {
646
return test (s, 0, what);
650
ends (string s, const char* what) {
651
string r ((char*) what);
652
if (N(r) > N(s)) return false;
653
return s (N(s)-N(r), N(s)) == r;
657
ends (string s, const string r) {
658
if (N(r) > N(s)) return false;
659
return s (N(s)-N(r), N(s)) == r;
663
read (string s, int& i, const char* test) {
664
int n= N(s), j=0, k=i;
665
while (test[j]!='\0') {
666
if (k>=n) return false;
667
if (s[k]!=test[j]) return false;
675
read (string s, int& i, string test) {
676
int n= N(s), m= N(test), j=0, k=i;
678
if (k>=n) return false;
679
if (s[k]!=test[j]) return false;
687
read_line (string s, int& i, string& result) {
689
for (; i<N(s); i++) {
691
result= s(start,i++);
700
read_int (string s, int& i, int& result) {
701
int n= N(s), start= i;
703
if (i==n) return false;
705
if (i+1==n) return false;
706
if (!is_digit (s[i+1])) return false;
709
else if (!is_digit (s[i])) return false;
710
while ((i<n) && is_digit (s[i])) i++;
711
result= as_int (s(start,i));
716
read_double (string s, int& i, double& result) {
717
int n= N(s), start= i;
719
if (i==n) return false;
721
if (i+1==n) return false;
722
if (!is_numeric (s[i+1])) return false;
725
else if (!is_numeric (s[i])) return false;
726
while ((i<n) && is_digit (s[i])) i++;
727
if ((i<n) && (s[i]=='.')) i++;
728
while ((i<n) && is_digit (s[i])) i++;
729
if ((i<n) && ((s[i]=='e') || (s[i]=='E'))) {
731
if ((i<n) && (s[i]=='-')) i++;
732
if ((i==n) || (!is_digit (s[i]))) { i=start; return false; }
733
while ((i<n) && is_digit (s[i])) i++;
735
result= as_double (s(start,i));
740
skip_spaces (string s, int& i) {
742
while ((i<n) && ((s[i]==' ') || (s[i]=='\t'))) i++;
746
skip_line (string s, int& i) {
748
while ((i<n) && (s[i]!='\n')) i++;
753
skip_symbol (string s, int& i) {
758
if (s[i-1]=='>') break;
764
/******************************************************************************
765
* Parsing binary data
766
******************************************************************************/
769
parse (string s, int& pos, QI& ret) {
774
parse (string s, int& pos, QN& ret) {
779
parse (string s, int& pos, HI& ret) {
780
QI c1= (QI) s[pos++];
781
QN c2= (QN) s[pos++];
782
ret= (((HI) c1)<<8)+ c2;
786
parse (string s, int& pos, HN& ret) {
787
QN c1= (QN) s[pos++];
788
QN c2= (QN) s[pos++];
789
ret= (((HN) c1)<<8)+ c2;
793
parse (string s, int& pos, SI& ret) {
794
QI c1= (QI) s[pos++];
795
QN c2= (QN) s[pos++];
796
QN c3= (QN) s[pos++];
797
QN c4= (QN) s[pos++];
798
ret= (((((((SI) c1)<<8)+ ((SI) c2))<<8)+ ((SI) c3))<<8)+ c4;
802
parse (string s, int& pos, SI*& a, int len) {
805
for (i=0; i<len; i++) parse (s, pos, a[i]);
808
/******************************************************************************
809
* Searching, replacing and pattern matching
810
******************************************************************************/
813
search_forwards (string s, int pos, string in) {
814
int k= N(s), n= N(in);
816
if (test (in, pos, s)) return pos;
823
search_forwards (string s, string in) {
824
return search_forwards (s, 0, in);
828
search_backwards (string s, int pos, string in) {
830
if (test (in, pos, s)) return pos;
837
search_backwards (string s, string in) {
838
return search_backwards (s, N(in)-N(s), in);
842
replace (string s, string what, string by) {
846
if (test (s, i, what)) {
858
match_wildcard (string s, int spos, string w, int wpos) {
859
if (wpos == N(w)) return spos == N(s);
861
return (spos < N(s)) && (s[spos] == w[wpos]) &&
862
match_wildcard (s, spos+1, w, wpos+1);
863
while ((wpos<N(w)) && (w[wpos]=='*')) wpos++;
864
while (spos <= N(s)) {
865
if (match_wildcard (s, spos, w, wpos)) return true;
872
match_wildcard (string s, string w) {
873
return match_wildcard (s, 0, w, 0);