2
* Format text, establish line breaks, manage whitespace.
3
* Copyright (c) Karl Dahlke, 2006
4
* This file is part of the edbrowse project, released under GPL.
10
/*********************************************************************
11
Prepare html for text processing.
12
Change nulls to spaces.
13
Make sure it doesn't already contain my magic code,
14
The one I use to indicate a tag.
15
If it does, well, change them to something else.
16
I can only hope this doesn't screw up some embedded javascript.
17
*********************************************************************/
20
prepareForBrowse(char *h, int h_len)
24
for(i = j = 0; i < h_len; ++i) {
28
if(i && !strchr("\n\b<>'\"&", h[i - 1]))
32
if(h[i] == (char)0xe2 && i < h_len - 1 && h[i + 1] == (char)0x80) {
36
if(h[i] == InternalCodeChar)
37
h[i] = InternalCodeCharAlternate;
40
h[j] = 0; /* now it's a string */
43
for(i = j = 0; h[i]; ++i) {
44
if(h[i] == '\r' && h[i + 1] == '\n')
49
} /* prepareForBrowse */
52
/*********************************************************************
53
Skip past an html comment.
54
Parse an html tag <tag foo=bar>
55
*********************************************************************/
58
skipHtmlComment(const char *h, int *lines)
61
bool comm = h[2] == '-' && h[3] == '-';
62
bool php = memEqualCI(h + 1, "?php", 4);
66
if(php) { /* special type of comment */
67
if(*h == '?' && h[1] == '>') {
75
if(!comm && *h == '>') {
80
if(comm && h[0] == '-' && h[1] == '-') {
84
while(isspaceByte(*h)) {
107
} /* skipHtmlComment */
109
/* an attribute character */
113
return (c > ' ' && c != '=' && c != '<' && c != '>');
116
/*********************************************************************
118
e is a pointer to the beginning of the element (*e must be '<').
119
eof is pointer to the end of the html page.
121
parsed tag name is stored in name, its length is namelen.
122
first attribute is stored in attr.
123
end points to first character past the html tag.
124
lines records the number of newlines consumed by the tag.
125
*********************************************************************/
127
bool htmlAttrVal_nl; /* allow nl in attribute values */
131
const char **name, int *namelen, const char **attr, const char **end,
143
while(isA(*e) || *e == '=')
145
if(!isspaceByte(*e) && *e != '>' && *e != '<' && *e != '/' && *e != ':')
147
/* Note that name includes the leading / */
149
*namelen = e - *name;
150
/* skip past space colon slash */
151
while(isspaceByte(*e) || *e == '/' || *e == ':') {
156
/* should be the start of the first attribute, or < or > */
157
if(!atchr(*e) && *e != '>' && *e != '<')
162
if(*e == '>' || *e == '<')
168
while(isspaceByte(*e)) {
176
while(isspaceByte(*e)) {
182
unsigned char uu = *e;
185
while(*e != uu && *e) {
194
/* lots of tags end with an extra quote */
201
while(!isspaceByte(*e) && *e != '>' && *e != '<' && *e)
204
while(isspaceByte(*e)) {
211
/* could be < or > */
213
*end = e + (*e == '>');
219
/* Don't know why he didn't use the stringAndChar() functions, but he
220
* invented something new here, so on we go. */
222
valChar(char **sp, int *lp, char c)
227
*sp = s = reallocMem(s, l + ALLOC_GR);
232
/*********************************************************************
233
Find an attribute in an html tag.
234
e is the attr pointer previously obtained from parseTag; DON'T PASS ANY OTHER VALUE HERE!!!
235
name is the sought attribute.
236
returns an allocated string containing the attribute, or NULL on failure.
237
*********************************************************************/
240
htmlAttrVal(const char *e, const char *name)
243
char *a = EMPTYSTRING; /* holds the value */
245
int l = 0; /* length */
250
while(isspaceByte(*e))
254
if(*e == '>' || *e == '<')
257
while(*n && !((*e ^ *n) & 0xdf))
262
while(isspaceByte(*e))
267
while(isspaceByte(*e))
270
while(*e && !isspaceByte(*e) && *e != '>' && *e != '<') {
284
if(!f && *e != '\r') {
285
if(*e != '\t' && *e != '\n')
287
else if(!htmlAttrVal_nl)
288
valChar(&a, &l, ' ');
301
goto top; /* no match, next attribute */
303
valChar(&a, &l, 0); /* null terminate */
306
a = andTranslate(b, true);
309
/* strip leading and trailing spaces.
310
* Are we really supposed to do this? */
311
for(b = a; *b == ' '; b++) ;
314
for(b = a + strlen(a) - 1; b >= a && *b == ' '; b--)
320
/*********************************************************************
321
Jump straight to the </script>, and don't look at anything in between.
323
end of the script, the extracted script, and the number of newlines.
324
*********************************************************************/
327
findEndScript(const char *h, const char *tagname,
328
bool is_js, char **end_p, char **new_p, int *lines)
336
sprintf(look, "</%s>", tagname);
339
end = strstrCI(s, look);
342
browseError("no closing %s", look);
343
end = (char *)h + strlen(h);
345
/* Check for document.write("</script>");
346
* This isn't legal javascript, but it happens all the time!
347
* This is a really stupid check.
348
* Scan forward 30 chars, on the same line, looking
349
* for a quote, and ) ; or + */
352
s = end + strlen(look);
353
for(j = 0; j < 30; ++j, ++s) {
359
if(c != '"' && c != '\'')
373
*new_p = pullString1(h, end);
374
/* count the newlines */
383
} /* findEndScript */
386
/*********************************************************************
387
The primary goal of this routine is to turn
388
Hey,{ click here } for more information
390
Hey, {click here} for more information
391
But of course we won't do that if the section is preformatted.
392
Nor can we muck with the whitespace that might be present in an input field <>.
393
State variables remember:
394
Whether we are in a preformatted section
395
Whether we have seen any visible text in the document
396
Whether we have seen any visible text in the current hyperlink,
398
Whether we are stepping through a span of whitespace.
399
A tag and adjacent whitespace might be swapped, depending on state.
400
If a change is made, the procedure is run again,
401
kinda like bubble sort.
402
It has the potential to be terribly inefficient,
403
but that's not likely.
404
Use cnt to count the iterations, just for debugging.
405
*********************************************************************/
408
anchorSwap(char *buf)
410
char c, d, *s, *ss, *w, *a;
411
bool premode, pretag, state_braces, state_text, state_atext;
412
bool strong, change, slash;
416
/* Transliterate a few characters. One of them is 0xa0 to space,
417
* so we need to do this now, before the anchors swap with whitespace.
418
* Also get rid of hyperlinks with absolutely nothing to click on. */
419
for(s = w = buf; c = *s; ++s) {
420
static const char from[] =
421
"\x1b\x95\x99\x9c\x9d\x91\x92\x93\x94\xa0\xad\x96\x97\x85\xa6\xc2";
422
static const char becomes[] = "_*'`'`'`' ----- ";
423
ss = strchr(from, c);
425
c = becomes[ss - from];
426
if(c != (char)InternalCodeChar)
428
if(!isdigitByte(s[1]))
430
for(a = s + 2; isdigitByte(*a); ++a) ;
433
for(++a; *a == ' '; ++a) ;
434
if(memcmp(a, "\2000}", 3))
448
premode = state_text = state_atext = state_braces = false;
449
/* w represents the state of whitespace */
451
/* a represents the state of being in an anchor */
454
for(s = buf; c = *s; ++s) {
461
/* end of white space, should we swap it with prior tag? */
462
if(w && a && !premode &&
463
((state_braces & !state_atext) ||
464
((!state_braces) & !state_text))) {
466
memcpy(a + (s - w), tag, n);
471
/* prior anchor has no significance */
474
if(c == (char)InternalCodeChar) {
475
if(!isdigitByte(s[1]))
477
n = strtol(s + 1, &ss, 10);
478
preFormatCheck(n, &pretag, &slash);
480
/* the following should never happen */
481
if(!strchr("{}<>*", d))
494
/* We have a tag, should we swap it with prior whitespace? */
498
((state_braces & state_atext) ||
499
((!state_braces) & state_text)))) {
500
memmove(w + n, w, s - w);
505
state_braces = false;
510
/* prior whitespace doesn't matter any more */
514
state_braces = state_text = true;
522
state_braces = false;
537
/* The remaining tags are <>, for an input field. */
540
/* end of tag processing */
544
w = 0; /* no more whitespace */
549
/* end of loop over the chars in the buffer */
551
/* end of loop making changes */
553
debugPrint(3, "anchorSwap %d", cnt);
555
/* Framing characters like [] around an anchor are unnecessary here,
556
* because we already frame it in braces.
557
* Get rid of these characters, even in premode.
558
* Also, remove trailing pipes on a line. */
559
ss = 0; /* remember location of first pipe */
560
for(s = w = buf; c = *s; ++s) {
561
char open, close, linkchar;
562
if(!strchr("{[(<", c))
564
if(s[1] != (char)InternalCodeChar)
566
if(!isdigitByte(s[2]))
568
for(a = s + 3; isdigitByte(*a); ++a) ;
591
if(d != (char)InternalCodeChar)
593
while(isdigitByte(a[n]))
597
break; /* should never happen */
598
if(strchr("{}<>", d))
617
if(strchr("\r\n\f", c) && ss)
619
if(!isspaceByte(c) && c != '|')
622
} /* loop over buffer */
624
debugPrint(3, "anchors unframed");
626
/* Now compress the implied linebreaks into one. */
628
for(s = buf; c = *s; ++s) {
629
if(c == (char)InternalCodeChar && isdigitByte(s[1])) {
630
n = strtol(s + 1, &s, 10);
632
preFormatCheck(n, &pretag, &slash);
641
for(w = s; isspaceByte(*w); ++w) {
642
if(*w == '\n' || *w == '\f')
653
for(w = ss; w <= s; ++w)
658
for(w = ss; w <= s; ++w)
659
if(*w == '\r' && w != a)
661
} /* loop over buffer */
662
debugPrint(3, "whitespace combined");
666
/*********************************************************************
667
Format text, and break lines at sentence/phrase boundaries.
668
The prefix bl means breakline.
669
*********************************************************************/
671
/* Output buffer used by the breakline routines.
 * bl_start is the base of the buffer (both callers point it at replaceLine),
 * bl_cursor is the current position in it, and bl_end is the limit that
 * appendOneChar() compares the cursor against.
 * bl_cursor - bl_start is the number of bytes produced so far. */
static char *bl_start, *bl_cursor, *bl_end;
672
/* NOTE(review): presumably set when the cursor reaches bl_end; the code
 * that sets it is not visible here -- confirm.
 * htmlReformat() clears it before it starts. */
static bool bl_overflow;
673
static int colno; /* column number */
674
static const int optimalLine = 80; /* optimal line length */
675
static const int cutLineAfter = 36; /* cut sentence after this column */
676
static const int paraLine = 120; /* paragraph in a line */
677
/* longcut is an index into bl_start; when nonzero, appendSpaceChunk() may
 * overwrite that position with a space to paste a short fragment onto the
 * previous line.
 * pre_cr seeds the newline count at the start of appendSpaceChunk(). */
static int longcut, pre_cr;
678
static int lspace; /* last space value, 3 = paragraph */
679
/* Location of period comma rightparen or any word.
680
 * Question mark is equivalent to period etc.
681
 * Other things being equal, we break at period, rather than comma, etc.
682
 * First the column numbers, then the index into the string. */
683
static int lperiod, lcomma, lright, lany;
684
static int idxperiod, idxcomma, idxright, idxany;
687
debugChunk(const char *chunk, int len)
693
for(i = 0; i < len; ++i) {
717
printf(">%d.%d\n", colno, lspace);
721
appendOneChar(char c)
723
if(bl_cursor == bl_end)
727
} /* appendOneChar */
730
spaceNotInInput(void)
734
for(--t; t >= bl_start; --t) {
736
if(c == '\n' || c == '\r')
738
if(c == '>' && t >= bl_start + 2 &&
739
t[-1] == '0' && t[-2] == (char)InternalCodeChar)
743
while(t > bl_start && isdigitByte(t[-1]))
747
if(t > bl_start && t[-1] == (char)InternalCodeChar)
751
} /* spaceNotInInput */
754
appendSpaceChunk(const char *chunk, int len, bool premode)
756
int nlc = pre_cr; /* newline count */
757
int spc = 0; /* space count */
763
for(i = 0; i < len; ++i) {
765
if(c == '\n' || c == '\r') {
776
if(!premode && spaceNotInInput()) {
777
int l = bl_cursor - bl_start;
784
if(strchr(")\"|}", d))
786
if(strchr(".?!:", e)) {
788
/* Check for Mr. Mrs. and others. */
789
if(e == '.' && bl_cursor - bl_start > 10) {
790
static const char *const prefix[] =
791
{ "mr.", "mrs.", "sis.", "ms.", 0 };
793
for(i = 0; i < 6; ++i) {
794
c = bl_cursor[i - 6];
800
for(i = 0; prefix[i]; ++i)
801
if(strstr(trailing, prefix[i]))
803
/* Check for John C. Calhoon */
804
if(isupperByte(bl_cursor[-2]) && isspaceByte(bl_cursor[-3]))
808
lperiod = colno, idxperiod = l;
811
if(strchr(")\"|", d))
814
lcomma = colno, idxcomma = l;
815
if(strchr(")\"|", d))
816
lright = colno, idxright = l;
817
lany = colno, idxany = l;
818
/* tack a short fragment onto the previous line. */
819
if(longcut && colno <= 15 && (nlc || lperiod == colno)) {
820
bl_start[longcut] = ' ';
822
len = spc = 0, nlc = 1;
823
} /* pasting small fragment onto previous line */
824
} /* allowing line breaks */
834
longcut = lperiod = lcomma = lright = lany = 0;
835
if(lspace >= 2 || nlc > 1)
845
/* if the first char of the text to be reformatted is space,
846
* then we will wind up here, with lspace = 3. */
855
for(i = 0; i < len; ++i) {
857
if(c == '\n' || c == '\r' || c == '\f')
863
for(; i < len; ++i) {
874
} /* appendSpaceChunk */
877
appendPrintableChunk(const char *chunk, int len, bool premode)
880
for(i = 0; i < len; ++i)
881
appendOneChar(chunk[i]);
886
if(colno <= optimalLine)
888
/* Oops, line is getting long. Let's see where we can cut it. */
890
if(lperiod > cutLineAfter)
891
i = lperiod, j = idxperiod;
892
else if(lcomma > cutLineAfter)
893
i = lcomma, j = idxcomma;
894
else if(lright > cutLineAfter)
895
i = lright, j = idxright;
896
else if(lany > cutLineAfter)
897
i = lany, j = idxany;
899
return; /* nothing we can do about it */
909
} /* appendPrintableChunk */
911
/* Break up a line using the above routines.
912
* The buffer for the new text must be supplied.
913
* Return false (fail) if we ran out of room.
914
* This function is called from bufsup.c, implementing the bl command,
915
* and is only in this file because it shares the above routines and variables
916
* with the html reformatting, which really has to be here. */
918
breakLine(const char *line, int len, int *newlen)
920
char c, state, newstate;
924
if(len && line[len - 1] == '\r')
927
/* special continuation code from the previous invocation */
935
lspace = 2; /* should never happen */
938
bl_start = bl_cursor = replaceLine;
939
bl_end = replaceLine + REPLACELINELEN - 8;
942
longcut = lperiod = lcomma = lright = lany = 0;
948
for(i = 0; i < len; ++i) {
951
if(!c || strchr(" \t\n\r\f", c))
953
if(state == newstate)
960
/* state change here */
961
debugChunk(line + last, i - last);
963
appendSpaceChunk(line + last, i - last, false);
965
appendPrintableChunk(line + last, i - last, false);
971
if(state) { /* last token */
972
debugChunk(line + last, len - last);
974
appendSpaceChunk(line + last, len - last, false);
976
appendPrintableChunk(line + last, len - last, false);
979
if(lspace < 2) { /* line didn't have a \r at the end */
980
appendSpaceChunk("\n", 1, false);
982
if(bl_cursor - bl_start > paraLine)
984
debugPrint(7, "chunk<EOL>%d.%d", colno, lspace);
985
*newlen = bl_cursor - bl_start;
996
htmlReformat(const char *buf)
998
const char *h, *nh, *s;
1000
bool premode = false;
1005
longcut = lperiod = lcomma = lright = lany = 0;
1009
bl_start = bl_cursor = replaceLine;
1010
bl_end = replaceLine + REPLACELINELEN - 8;
1011
bl_overflow = false;
1012
new = initString(&l);
1014
for(h = buf; (c = *h); h = nh) {
1015
if(isspaceByte(c)) {
1016
for(s = h + 1; isspaceByte(*s); ++s) ;
1018
appendSpaceChunk(h, nh - h, premode);
1019
if(lspace == 3 || lspace == 2 &&
1020
(bl_cursor - bl_start) >= (bl_end - bl_start) * 2 / 3) {
1021
if(bl_cursor > bl_start)
1022
stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);
1023
bl_cursor = bl_start;
1025
longcut = lperiod = lcomma = lright = lany = 0;
1031
if(c != (char)InternalCodeChar) {
1032
for(s = h + 1; *s; ++s)
1033
if(isspaceByte(*s) || *s == (char)InternalCodeChar)
1036
appendPrintableChunk(h, nh - h, premode);
1042
tagno = strtol(h + 1, (char **)&nh, 10);
1044
if(!c || !strchr("{}<>*", c))
1045
errorPrint("@tag code %d has bad character %c following", tagno, c);
1046
appendPrintableChunk(h, nh - h, premode);
1047
preFormatCheck(tagno, &pretag, &slash);
1051
/* Insert newlines between adjacent hyperlinks. */
1052
if(c != '}' || premode)
1054
for(h = nh; c = *h; ++h)
1055
if(!strchr(" \t,:-|;", c))
1057
if(!c || strchr("\r\n\f", c)) {
1061
if(c != (char)InternalCodeChar)
1063
/* Does this start a new hyperlink? */
1064
for(s = h + 1; isdigitByte(*s); ++s) ;
1067
appendSpaceChunk("\n", 1, false);
1069
} /* loop over text */
1071
/* close off the last line */
1073
appendSpaceChunk("\n", 1, true);
1074
if(bl_cursor > bl_start)
1075
stringAndBytes(&new, &l, bl_start, bl_cursor - bl_start);
1076
/* Get rid of last space. */
1077
if(l >= 2 && new[l - 1] == '\n' && new[l - 2] == ' ')
1078
new[l - 2] = '\n', new[--l] = 0;
1079
/* Don't need empty lines at the end. */
1080
while(l > 1 && new[l - 1] == '\n' && new[l - 2] == '\n')
1083
/* Don't allow an empty buffer */
1085
stringAndChar(&new, &l, '\n');
1088
} /* htmlReformat */
1091
/*********************************************************************
1092
And-convert the string; you know, &lt; etc.
1093
This is the routine that makes it possible for me to read, and write,
1094
my math site. http://www.mathreference.com/accessible.html
1095
In the invisible mode, graphics characters are not rendered at all.
1096
This is used when translating attributes inside tags,
1097
such as HREF, in an anchor.
1098
The original string is not disturbed.
1099
The new string is allocated.
1100
*********************************************************************/
1103
andTranslate(const char *s, bool invisible)
1108
uchar alnum = 0; /* was last char an alphanumeric */
1112
static const char *const andwords[] = {
1206
"laquo\0left arrow",
1274
"#8592\0left arrow",
1277
"#8595\0down arrow",
1278
"#8660\0double arrow",
1285
"#8713\0not a member of",
1293
"#8773\0congruent to",
1297
"#8834\0proper subset of",
1298
"#8835\0proper superset of",
1299
"#8836\0not a subset of",
1301
"#8839\0superset of",
1308
if(s == EMPTYSTRING)
1310
new = initString(&l);
1313
if(c == (uchar) InternalCodeChar && !invisible) {
1314
const char *t = s + 1;
1315
while(isdigitByte(*t))
1317
if(t > s + 1 && *t && strchr("{}<>*", *t)) { /* it's a tag */
1318
bool separate, pretag, slash;
1320
preFormatCheck(n, &pretag, &slash);
1321
separate = (*t != '*');
1324
debugPrint(7, "tag %d%c separate %d", n, *t, separate);
1328
stringAndBytes(&new, &l, s, t - s);
1338
for(j = 0; j < sizeof (andbuf); ++j) {
1340
if(d == '&' || d == ';' || d <= ' ')
1343
if(j == sizeof (andbuf))
1344
goto putc; /* too long, no match */
1345
strncpy(andbuf, s + 1, j);
1350
/* remove leading zeros */
1351
if(andbuf[0] == '#')
1352
while(andbuf[1] == '0')
1353
strcpy(andbuf + 1, andbuf + 2);
1356
debugPrint(6, "meta %s", andbuf);
1357
n = stringInList(andwords, andbuf);
1358
if(n >= 0) { /* match */
1359
const char *r = andwords[n] + strlen(andwords[n]) + 1; /* replacement string */
1361
if(!r[1]) { /* replace with a single character */
1370
/* We're replacing with a word */
1371
if(!invisible && isalnumByte(*r)) {
1372
/* insert spaces either side */
1374
stringAndChar(&new, &l, ' ');
1378
stringAndString(&new, &l, r);
1382
if(andbuf[0] != '#')
1384
n = stringIsNum(andbuf + 1);
1390
/* don't allow nulls */
1393
if(strchr("\r\n\f", c) && !premode)
1395
if(c == (uchar) InternalCodeChar)
1401
if(isalnumByte(c)) {
1403
stringAndChar(&new, &l, ' ');
1407
stringAndChar(&new, &l, c);
1409
} /* loop over input string */
1412
} /* andTranslate */