2
* Format text, establish line breaks, manage whitespace.
3
* Copyright (c) Karl Dahlke, 2008
4
* This file is part of the edbrowse project, released under GPL.
10
/*********************************************************************
11
Prepare html for text processing.
12
Change nulls to spaces.
13
Make sure it doesn't already contain my magic code,
14
The one I use to indicate a tag.
15
If it does, well, change them to something else.
16
I can only hope this doesn't screw up some embedded javascript.
17
*********************************************************************/
20
prepareForBrowse(char *h, int h_len)
24
for(i = j = 0; i < h_len; ++i) {
28
if(i && !strchr("\n\b<>'\"&", h[i - 1]))
32
if(h[i] == (char)0xe2 && i < h_len - 1 && h[i + 1] == (char)0x80) {
36
if(h[i] == InternalCodeChar)
37
h[i] = InternalCodeCharAlternate;
40
h[j] = 0; /* now it's a string */
43
for(i = j = 0; h[i]; ++i) {
44
if(h[i] == '\r' && h[i + 1] == '\n')
49
} /* prepareForBrowse */
52
/*********************************************************************
53
Skip past an html comment.
54
Parse an html tag <tag foo=bar>
55
*********************************************************************/
58
skipHtmlComment(const char *h, int *lines)
61
bool comm = h[2] == '-' && h[3] == '-';
62
bool php = memEqualCI(h + 1, "?php", 4);
66
if(php) { /* special type of comment */
67
if(*h == '?' && h[1] == '>') {
75
if(!comm && *h == '>') {
80
if(comm && h[0] == '-' && h[1] == '-') {
84
while(isspaceByte(*h)) {
107
} /* skipHtmlComment */
109
/* an attribute character */
113
return (c > ' ' && c != '=' && c != '<' && c != '>');
116
/*********************************************************************
118
e is a pointer to the beginning of the element (*e must be '<').
119
eof is pointer to the end of the html page.
121
parsed tag name is stored in name, its length is namelen.
122
first attribute is stored in attr.
123
end points to first character past the html tag.
124
lines records the number of newlines consumed by the tag.
125
*********************************************************************/
127
bool htmlAttrVal_nl; /* allow nl in attribute values */
131
const char **name, int *namelen, const char **attr, const char **end,
143
while(isA(*e) || *e == '=')
145
if(!isspaceByte(*e) && *e != '>' && *e != '<' && *e != '/' && *e != ':')
147
/* Note that name includes the leading / */
149
*namelen = e - *name;
150
/* skip past space colon slash */
151
while(isspaceByte(*e) || *e == '/' || *e == ':') {
156
/* should be the start of the first attribute, or < or > */
157
if(!atchr(*e) && *e != '>' && *e != '<')
162
if(*e == '>' || *e == '<')
168
while(isspaceByte(*e)) {
176
while(isspaceByte(*e)) {
182
unsigned char uu = *e;
185
while(*e != uu && *e) {
194
/* lots of tags end with an extra quote */
201
while(!isspaceByte(*e) && *e != '>' && *e != '<' && *e)
204
while(isspaceByte(*e)) {
211
/* could be < or > */
213
*end = e + (*e == '>');
219
/* Don't know why he didn't use the stringAndChar() functions, but he
220
* invented something new here, so on we go. */
222
valChar(char **sp, int *lp, char c)
227
*sp = s = reallocMem(s, l + ALLOC_GR);
232
/*********************************************************************
233
Find an attribute in an html tag.
234
e is the attr pointer previously gotten from parseTag, DON'T PASS ANY OTHER VALUE HERE!!!
235
name is the sought attribute.
236
returns allocated string containing the attribute, or NULL on failure.
237
*********************************************************************/
240
htmlAttrVal(const char *e, const char *name)
243
char *a = EMPTYSTRING; /* holds the value */
245
int l = 0; /* length */
250
while(isspaceByte(*e))
254
if(*e == '>' || *e == '<')
257
while(*n && !((*e ^ *n) & 0xdf))
262
while(isspaceByte(*e))
267
while(isspaceByte(*e))
270
while(*e && !isspaceByte(*e) && *e != '>' && *e != '<') {
284
if(!f && *e != '\r') {
285
if(*e != '\t' && *e != '\n')
287
else if(!htmlAttrVal_nl)
288
valChar(&a, &l, ' ');
301
goto top; /* no match, next attribute */
303
valChar(&a, &l, 0); /* null terminate */
306
a = andTranslate(b, true);
309
/* strip leading and trailing spaces.
310
* Are we really supposed to do this? */
311
for(b = a; *b == ' '; b++) ;
314
for(b = a + strlen(a) - 1; b >= a && *b == ' '; b--)
320
/*********************************************************************
321
Jump straight to the </script>, and don't look at anything in between.
323
end of the script, the extracted script, and the number of newlines.
324
*********************************************************************/
327
findEndScript(const char *h, const char *tagname,
328
bool is_js, char **end_p, char **new_p, int *lines)
336
sprintf(look, "</%s>", tagname);
339
end = strstrCI(s, look);
342
browseError(MSG_CloseTag, look);
343
end = (char *)h + strlen(h);
345
/* Check for document.write("</script>");
346
* This isn't legal javascript, but it happens all the time!
347
* This is a really stupid check.
348
* Scan forward 30 chars, on the same line, looking
349
* for a quote, and ) ; or + */
352
s = end + strlen(look);
353
for(j = 0; j < 30; ++j, ++s) {
359
if(c != '"' && c != '\'')
373
*new_p = pullString1(h, end);
374
/* count the newlines */
383
} /* findEndScript */
386
/*********************************************************************
387
The primary goal of this routine is to turn
388
Hey,{ click here } for more information
390
Hey, {click here} for more information
391
But of course we won't do that if the section is preformatted.
392
Nor can we muck with the whitespace that might be present in an input field <>.
393
State variables remember:
394
Whether we are in a preformatted section
395
Whether we have seen any visible text in the document
396
Whether we have seen any visible text in the current hyperlink,
398
Whether we are stepping through a span of whitespace.
399
A tag and adjacent whitespace might be swapped, depending on state.
400
If a change is made, the procedure is run again,
401
kinda like bubble sort.
402
It has the potential to be terribly inefficient,
403
but that's not likely.
404
Use cnt to count the iterations, just for debugging.
405
*********************************************************************/
408
anchorSwap(char *buf)
410
char c, d, *s, *ss, *w, *a;
411
bool premode, pretag, state_braces, state_text, state_atext;
412
bool strong, change, slash;
416
static const char from[] =
417
"\x1b\x95\x99\x9c\x9d\x91\x92\x93\x94\xa0\xad\x96\x97\x85\xa6\xc2";
418
static const char becomes[] = "_*'`'`'`' ----- ";
420
/* Transliterate a few characters. One of them is 0xa0 to space,
421
* so we need to do this now, before the anchors swap with whitespace.
422
* Watch out for utf8 - don't translate the a0 in c3a0. That is a grave.
423
* But a0 by itself is breakspace; turn it into space.
424
* And c2a0 is the utf8 encoding of a0, so it is also breakspace.
425
* Then get rid of hyperlinks with absolutely nothing to click on. */
427
for(s = w = buf; c = *s; ++s) {
430
if((c & 0xc0) == 0xc0 && (d & 0xc0) == 0x80) {
431
unsigned int uni = 0;
432
if((c & 0x3c) == 0) {
434
uni = ((uchar) c << 6) | (d & 0x3f);
435
ss = strchr(from, (char)uni);
437
c = becomes[ss - from];
442
/* copy the utf8 sequence */
446
while((c & 0x80) && ((d = *s) & 0xc0) == 0x80) {
454
ss = strchr(from, c);
456
c = becomes[ss - from];
458
if(c != InternalCodeChar)
460
if(!isdigitByte(s[1]))
462
for(a = s + 2; isdigitByte(*a); ++a) ;
465
for(++a; *a == ' '; ++a) ;
466
if(a[0] != InternalCodeChar || a[1] != '0' || a[2] != '}')
481
premode = state_text = state_atext = state_braces = false;
482
/* w represents the state of whitespace */
484
/* a represents the state of being in an anchor */
487
for(s = buf; c = *s; ++s) {
494
/* end of white space, should we swap it with prior tag? */
495
if(w && a && !premode &&
496
((state_braces & !state_atext) ||
497
((!state_braces) & !state_text))) {
499
memcpy(a + (s - w), tag, n);
504
/* prior anchor has no significance */
507
if(c == InternalCodeChar) {
508
if(!isdigitByte(s[1]))
510
n = strtol(s + 1, &ss, 10);
511
preFormatCheck(n, &pretag, &slash);
513
/* the following should never happen */
514
if(!strchr("{}<>*", d))
527
/* We have a tag, should we swap it with prior whitespace? */
531
((state_braces & state_atext) ||
532
((!state_braces) & state_text)))) {
533
memmove(w + n, w, s - w);
538
state_braces = false;
543
/* prior whitespace doesn't matter any more */
547
state_braces = state_text = true;
555
state_braces = false;
570
/* The remaining tags are <>, for an input field. */
573
/* end of tag processing */
577
w = 0; /* no more whitespace */
582
/* end of loop over the chars in the buffer */
584
/* end of loop making changes */
586
debugPrint(3, "anchorSwap %d", cnt);
588
/* Framing characters like [] around an anchor are unnecessary here,
589
* because we already frame it in braces.
590
* Get rid of these characters, even in premode.
591
* Also, remove trailing pipes on a line. */
592
ss = 0; /* remember location of first pipe */
593
for(s = w = buf; c = *s; ++s) {
594
char open, close, linkchar;
595
if(!strchr("{[(<", c))
597
if(s[1] != InternalCodeChar)
599
if(!isdigitByte(s[2]))
601
for(a = s + 3; isdigitByte(*a); ++a) ;
624
if(d != InternalCodeChar)
626
while(isdigitByte(a[n]))
630
break; /* should never happen */
631
if(strchr("{}<>", d))
650
if(strchr("\r\n\f", c) && ss)
652
if(!isspaceByte(c) && c != '|')
655
} /* loop over buffer */
657
debugPrint(3, "anchors unframed");
659
/* Now compress the implied linebreaks into one. */
661
for(s = buf; c = *s; ++s) {
662
if(c == InternalCodeChar && isdigitByte(s[1])) {
663
n = strtol(s + 1, &s, 10);
665
preFormatCheck(n, &pretag, &slash);
674
for(w = s; isspaceByte(*w); ++w) {
675
if(*w == '\n' || *w == '\f')
686
for(w = ss; w <= s; ++w)
691
for(w = ss; w <= s; ++w)
692
if(*w == '\r' && w != a)
694
} /* loop over buffer */
695
debugPrint(3, "whitespace combined");
699
/*********************************************************************
700
Format text, and break lines at sentence/phrase boundaries.
701
The prefix bl means breakline.
702
*********************************************************************/
704
static char *bl_start, *bl_cursor, *bl_end;
705
static bool bl_overflow;
706
static int colno; /* column number */
707
static const int optimalLine = 80; /* optimal line length */
708
static const int cutLineAfter = 36; /* cut sentence after this column */
709
static const int paraLine = 120; /* paragraph in a line */
710
static int longcut, pre_cr;
711
static int lspace; /* last space value, 3 = paragraph */
712
/* Location of period comma rightparen or any word.
713
* Question mark is equivalent to period etc.
714
* Other things being equal, we break at period, rather than comma, etc.
715
* First the column numbers, then the index into the string. */
716
static int lperiod, lcomma, lright, lany;
717
static int idxperiod, idxcomma, idxright, idxany;
720
debugChunk(const char *chunk, int len)
726
for(i = 0; i < len; ++i) {
750
printf(">%d.%d\n", colno, lspace);
754
appendOneChar(char c)
756
if(bl_cursor == bl_end)
760
} /* appendOneChar */
763
spaceNotInInput(void)
767
for(--t; t >= bl_start; --t) {
769
if(c == '\n' || c == '\r')
771
if(c == '>' && t >= bl_start + 2 &&
772
t[-1] == '0' && t[-2] == InternalCodeChar)
776
while(t > bl_start && isdigitByte(t[-1]))
780
if(t > bl_start && t[-1] == InternalCodeChar)
784
} /* spaceNotInInput */
787
appendSpaceChunk(const char *chunk, int len, bool premode)
789
int nlc = pre_cr; /* newline count */
790
int spc = 0; /* space count */
796
for(i = 0; i < len; ++i) {
798
if(c == '\n' || c == '\r') {
809
if(!premode && spaceNotInInput()) {
810
int l = bl_cursor - bl_start;
817
if(strchr(")\"|}", d))
819
if(strchr(".?!:", e)) {
821
/* Check for Mr. Mrs. and others. */
822
if(e == '.' && bl_cursor - bl_start > 10) {
823
static const char *const prefix[] =
824
{ "mr.", "mrs.", "sis.", "ms.", 0 };
826
for(i = 0; i < 6; ++i) {
827
c = bl_cursor[i - 6];
833
for(i = 0; prefix[i]; ++i)
834
if(strstr(trailing, prefix[i]))
836
/* Check for John C. Calhoon */
837
if(isupperByte(bl_cursor[-2]) && isspaceByte(bl_cursor[-3]))
841
lperiod = colno, idxperiod = l;
844
if(strchr(")\"|", d))
847
lcomma = colno, idxcomma = l;
848
if(strchr(")\"|", d))
849
lright = colno, idxright = l;
850
lany = colno, idxany = l;
851
/* tack a short fragment onto the previous line. */
852
if(longcut && colno <= 15 && (nlc || lperiod == colno)) {
853
bl_start[longcut] = ' ';
855
len = spc = 0, nlc = 1;
856
} /* pasting small fragment onto previous line */
857
} /* allowing line breaks */
867
longcut = lperiod = lcomma = lright = lany = 0;
868
if(lspace >= 2 || nlc > 1)
878
/* if the first char of the text to be reformatted is space,
879
* then we will wind up here, with lspace = 3. */
888
for(i = 0; i < len; ++i) {
890
if(c == '\n' || c == '\r' || c == '\f')
896
for(; i < len; ++i) {
907
} /* appendSpaceChunk */
910
appendPrintableChunk(const char *chunk, int len, bool premode)
913
for(i = 0; i < len; ++i)
914
appendOneChar(chunk[i]);
919
if(colno <= optimalLine)
921
/* Oops, line is getting long. Let's see where we can cut it. */
923
if(lperiod > cutLineAfter)
924
i = lperiod, j = idxperiod;
925
else if(lcomma > cutLineAfter)
926
i = lcomma, j = idxcomma;
927
else if(lright > cutLineAfter)
928
i = lright, j = idxright;
929
else if(lany > cutLineAfter)
930
i = lany, j = idxany;
932
return; /* nothing we can do about it */
942
} /* appendPrintableChunk */
944
/* Break up a line using the above routines.
945
* The buffer for the new text must be supplied.
946
* Return false (fail) if we ran out of room.
947
* This function is called from bufsup.c, implementing the bl command,
948
* and is only in this file because it shares the above routines and variables
949
* with the html reformatting, which really has to be here. */
951
breakLine(const char *line, int len, int *newlen)
953
char c, state, newstate;
957
if(len && line[len - 1] == '\r')
960
/* special continuation code from the previous invocation */
968
lspace = 2; /* should never happen */
971
bl_start = bl_cursor = replaceLine;
972
bl_end = replaceLine + REPLACELINELEN - 8;
975
longcut = lperiod = lcomma = lright = lany = 0;
981
for(i = 0; i < len; ++i) {
984
if(!c || strchr(" \t\n\r\f", c))
986
if(state == newstate)
993
/* state change here */
994
debugChunk(line + last, i - last);
996
appendSpaceChunk(line + last, i - last, false);
998
appendPrintableChunk(line + last, i - last, false);
1004
if(state) { /* last token */
1005
debugChunk(line + last, len - last);
1007
appendSpaceChunk(line + last, len - last, false);
1009
appendPrintableChunk(line + last, len - last, false);
1012
if(lspace < 2) { /* line didn't have a \r at the end */
1013
appendSpaceChunk("\n", 1, false);
1015
if(bl_cursor - bl_start > paraLine)
1017
debugPrint(7, "chunk<EOL>%d.%d", colno, lspace);
1018
*newlen = bl_cursor - bl_start;
1019
return !bl_overflow;
1023
breakLineSetup(void)
1029
htmlReformat(const char *buf)
1031
const char *h, *nh, *s;
1033
bool premode = false;
1038
longcut = lperiod = lcomma = lright = lany = 0;
1042
bl_start = bl_cursor = replaceLine;
1043
bl_end = replaceLine + REPLACELINELEN - 8;
1044
bl_overflow = false;
1045
new = initString(&l);
1047
for(h = buf; (c = *h); h = nh) {
1048
if(isspaceByte(c)) {
1049
for(s = h + 1; isspaceByte(*s); ++s) ;
1051
appendSpaceChunk(h, nh - h, premode);
1052
if(lspace == 3 || lspace == 2 &&
1053
(bl_cursor - bl_start) >= (bl_end - bl_start) * 2 / 3) {
1054
if(bl_cursor > bl_start)
1055
stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);
1056
bl_cursor = bl_start;
1058
longcut = lperiod = lcomma = lright = lany = 0;
1064
if(c != InternalCodeChar) {
1065
for(s = h + 1; *s; ++s)
1066
if(isspaceByte(*s) || *s == InternalCodeChar)
1069
appendPrintableChunk(h, nh - h, premode);
1075
tagno = strtol(h + 1, (char **)&nh, 10);
1077
if(!c || !strchr("{}<>*", c))
1078
i_printfExit(MSG_BadTagCode, tagno, c);
1079
appendPrintableChunk(h, nh - h, premode);
1080
preFormatCheck(tagno, &pretag, &slash);
1084
/* Insert newlines between adjacent hyperlinks. */
1085
if(c != '}' || premode)
1087
for(h = nh; c = *h; ++h)
1088
if(!strchr(" \t,:-|;", c))
1090
if(!c || strchr("\r\n\f", c)) {
1094
if(c != InternalCodeChar)
1096
/* Does this start a new hyperlink? */
1097
for(s = h + 1; isdigitByte(*s); ++s) ;
1100
appendSpaceChunk("\n", 1, false);
1102
} /* loop over text */
1104
/* close off the last line */
1106
appendSpaceChunk("\n", 1, true);
1107
if(bl_cursor > bl_start)
1108
stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);
1109
/* Get rid of last space. */
1110
if(l >= 2 && new[l - 1] == '\n' && new[l - 2] == ' ')
1111
new[l - 2] = '\n', new[--l] = 0;
1112
/* Don't need empty lines at the end. */
1113
while(l > 1 && new[l - 1] == '\n' && new[l - 2] == '\n')
1116
/* Don't allow an empty buffer */
1118
stringAndChar(&new, &l, '\n');
1121
} /* htmlReformat */
1124
/*********************************************************************
1125
And-convert the string; you know, &lt; etc.
1126
This is the routine that makes it possible for me to read, and write,
1127
my math site. http://www.mathreference.com/accessible.html
1128
In the invisible mode, graphics characters are not rendered at all.
1129
This is used when translating attributes inside tags,
1130
such as HREF, in an anchor.
1131
The original string is not disturbed.
1132
The new string is allocated.
1133
*********************************************************************/
1136
andTranslate(const char *s, bool invisible)
1141
uchar alnum = 0; /* was last char an alphanumeric */
1145
static const char *const andwords[] = {
1257
"laquo\0left arrow",
1326
"#8592\0left arrow",
1329
"#8595\0down arrow",
1330
"#8660\0double arrow",
1337
"#8713\0not a member of",
1345
"#8773\0congruent to",
1349
"#8834\0proper subset of",
1350
"#8835\0proper superset of",
1351
"#8836\0not a subset of",
1353
"#8839\0superset of",
1360
if(s == EMPTYSTRING)
1362
new = initString(&l);
1365
if(c == InternalCodeChar && !invisible) {
1366
const char *t = s + 1;
1367
while(isdigitByte(*t))
1369
if(t > s + 1 && *t && strchr("{}<>*", *t)) { /* it's a tag */
1370
bool separate, pretag, slash;
1372
preFormatCheck(n, &pretag, &slash);
1373
separate = (*t != '*');
1376
debugPrint(7, "tag %d%c separate %d", n, *t, separate);
1380
stringAndBytes(&new, &l, s, t - s);
1390
for(j = 0; j < sizeof (andbuf); ++j) {
1392
if(d == '&' || d == ';' || d <= ' ')
1395
if(j == sizeof (andbuf))
1396
goto putc; /* too long, no match */
1397
strncpy(andbuf, s + 1, j);
1402
/* remove leading zeros */
1403
if(andbuf[0] == '#')
1404
while(andbuf[1] == '0')
1405
strcpy(andbuf + 1, andbuf + 2);
1408
debugPrint(6, "meta %s", andbuf);
1409
n = stringInList(andwords, andbuf);
1410
if(n >= 0) { /* match */
1411
const char *r = andwords[n] + strlen(andwords[n]) + 1; /* replacement string */
1413
if(!r[1]) { /* replace with a single character */
1415
if(c & 0x80 && cons_utf8) {
1416
static char utfbuf[4];
1417
utfbuf[0] = (0xc0 | ((uchar) c >> 6));
1418
utfbuf[1] = (0x80 | (c & 0x3f));
1430
/* We're replacing with a word */
1431
if(!invisible && isalnumByte(*r)) {
1432
/* insert spaces either side */
1434
stringAndChar(&new, &l, ' ');
1439
stringAndString(&new, &l, r);
1443
if(andbuf[0] != '#')
1445
n = stringIsNum(andbuf + 1);
1451
/* don't allow nulls */
1454
if(strchr("\r\n\f", c) && !premode)
1456
if(c == InternalCodeChar)
1462
if(isalnumByte(c)) {
1464
stringAndChar(&new, &l, ' ');
1468
stringAndChar(&new, &l, c);
1470
} /* loop over input string */
1473
} /* andTranslate */
1475
/*********************************************************************
1476
Crunch a to-list or a copy-to-list down to its email addresses.
1477
Delimit them with newlines.
1478
"Smith, John" <jsmith@whatever.com>
1481
*********************************************************************/
1484
extractEmailAddresses(char *line)
1487
char *mark; /* start of current entry */
1490
for(s = t = mark = line; c = *s; ++s) {
1491
if(c == ',' && !quote) {
1502
/* don't think you can quote in an email address */
1525
if(c == ' ' && quote == '<')
1533
spaceCrunch(line, true, false);
1534
for(s = line; c = *s; ++s)
1539
} /* extractEmailAddresses */
1542
cutDuplicateEmail(char *line, const char *dup, int duplen)
1546
s = strchr(line, ',');
1548
return; /* should never happen */
1549
if(duplen == s - line && memEqualCI(line, dup, duplen)) {
1556
} /* cutDuplicateEmail */
1559
cutDuplicateEmails(char *tolist, char *cclist, const char *reply)
1564
len = strlen(reply);
1566
cutDuplicateEmail(tolist, reply, len);
1567
cutDuplicateEmail(cclist, reply, len);
1574
break; /* should never happen */
1577
cutDuplicateEmail(t, s, len);
1578
cutDuplicateEmail(cclist, s, len);
1586
break; /* should never happen */
1589
cutDuplicateEmail(t, s, len);
1593
/* If your email address is on the to or cc list, drop it.
1594
* But retain it if it is the reply, in case you sent mail to yourself. */
1596
struct MACCOUNT *m = accounts;
1598
for(i = 0; i < maxAccount; ++i, ++m) {
1599
const char *r = m->reply;
1603
cutDuplicateEmail(tolist, r, len);
1604
cutDuplicateEmail(cclist, r, len);
1607
} /* cutDuplicateEmails */
1609
/*********************************************************************
1610
We got some data, from a file or from the internet.
1611
Count the binary characters and decide if this is, on the whole,
1612
binary or text. I allow some nonascii chars,
1613
like you might see in Spanish or German, and still call it text,
1614
but if there's too many such chars, I call it binary.
1615
It's not an exact science.
1616
*********************************************************************/
1619
looksBinary(const char *buf, int buflen)
1621
int i, bincount = 0;
1622
for(i = 0; i < buflen; ++i) {
1627
return (bincount * 4 - 10 >= buflen);
1631
looks_8859_utf8(const char *buf, int buflen, bool * iso_p, bool * utf8_p)
1633
int utfcount = 0, isocount = 0;
1634
int i, j, bothcount;
1636
for(i = 0; i < buflen; ++i) {
1640
/* This is the start of the nonascii sequence. */
1641
/* No second bit, it has to be iso. */
1647
/* Next byte has to start with 10 to be utf8, else it's iso */
1648
if(((uchar) buf[i + 1] & 0xc0) != 0x80)
1651
for(j = i + 2; c < 0; ++j, c <<= 1)
1652
if(((uchar) buf[j] & 0xc0) != 0x80)
1658
*iso_p = *utf8_p = false;
1660
bothcount = isocount + utfcount;
1664
if(utfcount * 7 >= bothcount)
1666
if(isocount * 7 >= bothcount)
1668
} /* looks_8859_utf8 */
1670
/*********************************************************************
1671
Convert a string from iso 8859 to utf8, or vice versa.
1672
In each case a new string is allocated.
1673
Don't forget to free it when you're done.
1674
*********************************************************************/
1677
iso2utf(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)
1685
*outbuf_p = EMPTYSTRING;
1690
/* count chars, so we can allocate */
1691
for(i = 0; i < inbuflen; ++i) {
1697
outbuf = allocMem(inbuflen + nacount + 1);
1698
for(i = j = 0; i < inbuflen; ++i) {
1704
outbuf[j++] = ((uchar) c >> 6) | 0xc0;
1705
outbuf[j++] = (c & 0x3f) | 0x80;
1707
outbuf[j] = 0; /* just for fun */
1714
utf2iso(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)
1721
*outbuf_p = EMPTYSTRING;
1726
outbuf = allocMem(inbuflen + 1);
1727
for(i = j = 0; i < inbuflen; ++i) {
1730
/* regular chars and nonascii chars that aren't utf8 pass through. */
1731
/* There shouldn't be any of the latter */
1732
if(((uchar) c & 0xc0) != 0xc0) {
1737
/* Convertible into 8 bit */
1738
if(((uchar) c & 0xfc) == 0xc0 && ((uchar) inbuf[i + 1] & 0xc0) == 0x80) {
1739
outbuf[j++] = ((uchar) c << 6) | (inbuf[i + 1] & 0x3f);
1744
/* Higher unicodes, more than 2 bytes, are converted into 0x80 */
1747
for(++i; c < 0; ++i, c <<= 1) {
1748
if(((uchar) outbuf[i] & 0xc0) != 0x80)
1754
outbuf[j] = 0; /* just for fun */
1761
iuReformat(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)
1763
bool is8859, isutf8;
1770
looks_8859_utf8(inbuf, inbuflen, &is8859, &isutf8);
1771
if(cons_utf8 && is8859) {
1772
debugPrint(3, "converting to utf8");
1773
iso2utf(inbuf, inbuflen, outbuf_p, outbuflen_p);
1775
if(!cons_utf8 && isutf8) {
1776
debugPrint(3, "converting to iso8859");
1777
utf2iso(inbuf, inbuflen, outbuf_p, outbuflen_p);