1
/* General SGML Parser code SGML.c
2
** ========================
4
** This module implements an HTStream object. To parse an
5
** SGML file, create this object which is a parser. The object
6
** is (currently) created by being passed a DTD structure,
7
** and a target HTStructured object at which to throw the parsed stuff.
9
** 6 Feb 93 Binary searches used. Interface modified.
14
/* Remove the following to disable the experimental HTML DTD parsing.
15
Currently only used in this source file. - kw */
17
#ifndef NO_EXTENDED_HTMLDTD
18
#define EXTENDED_HTMLDTD
30
#include <LYCharSets.h>
31
#include <LYCharVals.h> /* S/390 -- gil -- 0635 */
32
#include <LYGlobalDefs.h>
33
#include <LYStrings.h>
36
#ifdef USE_COLOR_STYLE
40
# include <LYPrettySrc.h>
47
char* entity_string; /* this is used for printing entity name.
48
Unconditionally added since redundant assignments don't hurt much*/
50
PRIVATE void fake_put_character ARGS2(
59
/* Flag the next string for conversion in pretty-source mode, then emit it
 * through PUTS().  Written as a single comma expression (rather than two
 * bare statements) so the macro behaves as one statement: it is safe as the
 * body of an unbraced if/else or loop, and both steps stay together. */
#define PUTS_TR(x) (psrc_convert_string = TRUE, PUTS(x))
63
/* my_casecomp() - optimized by the first character, NOT_ASCII ok */
64
#define my_casecomp(a,b) ((TOUPPER(*a) == TOUPPER(*b)) ? \
66
(TOASCII(TOUPPER(*a)) - TOASCII(TOUPPER(*b))))
69
/* will use partially inlined version */
70
#define orig_HTChunkPutUtf8Char HTChunkPutUtf8Char
71
#undef HTChunkPutUtf8Char
73
/* ...used for comments and attributes value like href... */
74
#define HTChunkPutUtf8Char(ch,x) \
76
if ((TOASCII(x) < 128) && (ch->size < ch->allocated)) \
77
ch->data[ch->size++] = (char)x; \
79
orig_HTChunkPutUtf8Char(ch,x); \
83
#define orig_HTChunkPutc HTChunkPutc
86
#define HTChunkPutc(ch,x) \
88
if (ch->size < ch->allocated) \
89
ch->data[ch->size++] = x; \
91
orig_HTChunkPutc(ch,x); \
94
#undef HTChunkTerminate
96
#define HTChunkTerminate(ch) \
97
HTChunkPutc(ch, (char)0)
99
#endif /* ANSI_PREPRO */
101
#define PUTS(str) ((*context->actions->put_string)(context->target, str))
102
#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))
103
#define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)context->target, \
104
(putc_func_t*)(context->actions->put_character), code))
109
/*the following macros are used for pretty source view. */
110
/* True when the attribute descriptor 'attr' is of type HTMLA_CLASS.
 * 'attr' is parenthesized so that any lvalue expression (e.g. *p or
 * attributes[i]) can be passed without precedence surprises. */
#define IS_C(attr) ((attr).type == HTMLA_CLASS)
112
PUBLIC HTCJKlang HTCJK = NOCJK; /* CJK enum value. */
113
PUBLIC BOOL HTPassEightBitRaw = FALSE; /* Pass 161-172,174-255 raw. */
114
PUBLIC BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */
115
PUBLIC BOOL HTPassHighCtrlRaw = FALSE; /* Pass 127-160,173, raw. */
116
PUBLIC BOOL HTPassHighCtrlNum = FALSE; /* Pass numeric references &#128;-&#159; raw. */
118
/* The State (context) of the parser
120
** This is passed with each call to make the parser reentrant
124
#define MAX_ATTRIBUTES 36 /* Max number of attributes per element */
129
** This allows us to return down the stack reselecting styles.
130
** As we return, attribute values will be garbage in general.
132
typedef struct _HTElement HTElement;
134
HTElement * next; /* Previously nested element or 0 */
135
HTTag* tag; /* The tag at this level */
184
/* Internal Context Data Structure
185
** -------------------------------
189
CONST HTStreamClass * isa; /* inherited from HTStream */
192
CONST HTStructuredClass *actions; /* target class */
193
HTStructured *target; /* target object */
197
CONST HTTag *unknown_tag;
199
BOOL no_lynx_specialcodes;
200
int current_attribute_number;
204
HTElement *element_stack;
206
unsigned char kanji_buf;
209
#endif /* CALLERDATA */
210
BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
211
char * value[MAX_ATTRIBUTES]; /* NULL, or strings alloc'd with StrAllocCopy_extra() */
213
BOOL lead_exclamation;
216
BOOL doctype_bracket;
221
HTParentAnchor * node_anchor;
222
LYUCcharset * inUCI; /* pointer to anchor UCInfo */
223
int inUCLYhndl; /* charset we are fed */
224
LYUCcharset * outUCI; /* anchor UCInfo for target */
225
int outUCLYhndl; /* charset for target */
231
int current_tag_charset; /* charset to pass attributes */
236
char * active_include;
242
BOOL cur_attr_is_href;
243
BOOL cur_attr_is_name;
244
BOOL seen_nonwhite_in_junk_tag;
248
#ifndef NO_LYNX_TRACE
249
PRIVATE char *state_name ARGS1(sgml_state, n)
253
case S_attr: result = "S_attr"; break;
254
case S_attr_gap: result = "S_attr_gap"; break;
255
case S_comment: result = "S_comment"; break;
256
case S_cro: result = "S_cro"; break;
257
case S_doctype: result = "S_doctype"; break;
258
case S_dollar: result = "S_dollar"; break;
259
case S_dollar_dq: result = "S_dollar_dq"; break;
260
case S_dollar_paren: result = "S_dollar_paren"; break;
261
case S_dollar_paren_dq: result = "S_dollar_paren_dq"; break;
262
case S_dollar_paren_sq: result = "S_dollar_paren_sq"; break;
263
case S_dollar_sq: result = "S_dollar_sq"; break;
264
case S_dquoted: result = "S_dquoted"; break;
265
case S_end: result = "S_end"; break;
266
case S_entity: result = "S_entity"; break;
267
case S_equals: result = "S_equals"; break;
268
case S_ero: result = "S_ero"; break;
269
case S_esc: result = "S_esc"; break;
270
case S_esc_dq: result = "S_esc_dq"; break;
271
case S_esc_sq: result = "S_esc_sq"; break;
272
case S_exclamation: result = "S_exclamation"; break;
273
case S_in_kanji: result = "S_in_kanji"; break;
274
case S_incro: result = "S_incro"; break;
275
case S_junk_pi: result = "S_junk_pi"; break;
276
case S_junk_tag: result = "S_junk_tag"; break;
277
case S_litteral: result = "S_litteral"; break;
278
case S_marked: result = "S_marked"; break;
279
case S_nonascii_text: result = "S_nonascii_text"; break;
280
case S_nonascii_text_dq: result = "S_nonascii_text_dq"; break;
281
case S_nonascii_text_sq: result = "S_nonascii_text_sq"; break;
282
case S_paren: result = "S_paren"; break;
283
case S_paren_dq: result = "S_paren_dq"; break;
284
case S_paren_sq: result = "S_paren_sq"; break;
285
case S_pcdata: result = "S_pcdata"; break;
286
case S_script: result = "S_script"; break;
287
case S_sgmlatt: result = "S_sgmlatt"; break;
288
case S_sgmlele: result = "S_sgmlele"; break;
289
case S_sgmlent: result = "S_sgmlent"; break;
290
case S_squoted: result = "S_squoted"; break;
291
case S_tag: result = "S_tag"; break;
292
case S_tag_gap: result = "S_tag_gap"; break;
293
case S_tagname_slash: result = "S_tagname_slash"; break;
294
case S_text: result = "S_text"; break;
295
case S_value: result = "S_value"; break;
301
/* storage for Element Stack */
303
static HTElement pool[DEPTH];
304
static int depth = 0;
306
PRIVATE HTElement* pool_alloc NOARGS
310
return (HTElement*) malloc(sizeof(HTElement));
311
return (pool + depth - 1);
314
PRIVATE void pool_free ARGS1(HTElement*, e)
324
PRIVATE void HTMLSRC_apply_markup ARGS3(
329
HT_tagspec* ts = *( ( start ? lexeme_start : lexeme_end ) + lexeme);
332
#ifdef USE_COLOR_STYLE
334
current_tag_style = ts->style;
335
force_current_tag_style = TRUE;
336
forced_classname = ts->class_name;
337
force_classname = TRUE;
340
CTRACE((tfp,ts->start ? "SRCSTART %d\n" : "SRCSTOP %d\n",(int)lexeme));
342
(*context->actions->start_element)(
346
(CONST char **)ts->value,
347
context->current_tag_charset,
348
(char **)&context->include);
350
(*context->actions->end_element)(
353
(char **)&context->include);
359
# define PSRCSTART(x) HTMLSRC_apply_markup(context,HTL_##x,START)
360
# define PSRCSTOP(x) HTMLSRC_apply_markup(context,HTL_##x,STOP)
362
# define PSRCSTART(x) HTMLSRC_apply_markup(context,HTL_/**/x,START)
363
# define PSRCSTOP(x) HTMLSRC_apply_markup(context,HTL_/**/x,STOP)
366
#define attr_is_href context->cur_attr_is_href
367
#define attr_is_name context->cur_attr_is_name
370
PRIVATE void set_chartrans_handling ARGS3(
372
HTParentAnchor *, anchor,
377
** Nothing was set for the parser in earlier stages,
378
** so the HTML parser's UCLYhndl should still be its
381
chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED);
384
** That wasn't set either, so seek the HText default. - FM
386
chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
389
** That wasn't set either, so assume the current display
390
** character set. - FM
392
chndl = current_char_set;
394
** Try to set the HText and HTML stages' chartrans info
395
** with the default lock level (will not be changed if
396
** it was set previously with a higher lock level). - FM
398
HTAnchor_setUCInfoStage(anchor, chndl,
401
HTAnchor_setUCInfoStage(anchor, chndl,
402
UCT_STAGE_STRUCTURED,
405
** Get the chartrans info for output to the HTML parser. - FM
407
context->outUCI = HTAnchor_getUCInfoStage(anchor,
408
UCT_STAGE_STRUCTURED);
409
context->outUCLYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
410
UCT_STAGE_STRUCTURED);
413
** Set the in->out transformation parameters. - FM
415
UCSetTransParams(&context->T,
416
context->inUCLYhndl, context->inUCI,
417
context->outUCLYhndl, context->outUCI);
419
** This is intended for passing the SGML parser's input
420
** charset as an argument in each call to the HTML
421
** parser's start tag function, but it would be better
422
** to call a Lynx_HTML_parser function to set an element
423
** in its HTStructured object, itself, if this were
426
if (HTCJK != NOCJK) {
427
context->current_tag_charset = -1;
428
} else if (context->T.transp) {
429
context->current_tag_charset = context->inUCLYhndl;
430
} else if (context->T.decode_utf8) {
431
context->current_tag_charset = context->inUCLYhndl;
432
} else if (context->T.do_8bitraw ||
433
context->T.use_raw_char_in) {
434
context->current_tag_charset = context->inUCLYhndl;
435
} else if (context->T.output_utf8 ||
436
context->T.trans_from_uni) {
437
context->current_tag_charset = UCGetLYhndl_byMIME("utf-8");
439
context->current_tag_charset = LATIN1;
443
PRIVATE void change_chartrans_handling ARGS1(
446
int new_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
448
if (new_LYhndl != context->inUCLYhndl &&
451
* Something changed, but ignore it if a META wants an unknown charset.
453
LYUCcharset * new_UCI = HTAnchor_getUCInfoStage(context->node_anchor,
456
LYUCcharset * next_UCI = HTAnchor_getUCInfoStage(
457
context->node_anchor, UCT_STAGE_STRUCTURED
459
int next_LYhndl = HTAnchor_getUCLYhndl(
460
context->node_anchor, UCT_STAGE_STRUCTURED
462
context->inUCI = new_UCI;
463
context->inUCLYhndl = new_LYhndl;
464
context->outUCI = next_UCI;
465
context->outUCLYhndl = next_LYhndl;
466
set_chartrans_handling(context,
467
context->node_anchor, next_LYhndl);
472
#ifdef USE_COLOR_STYLE
473
#include <AttrList.h>
474
static int current_is_class = 0;
480
/* PUBLIC CONST char * SGML_default = ""; ?? */
482
PRIVATE void handle_attribute_name ARGS2(
486
HTTag * tag = context->current_tag;
487
attr * attributes = tag->attributes;
488
int high, low, i, diff;
492
attr_is_href = FALSE;
493
attr_is_name = FALSE;
497
** Ignore unknown tag. - KW
499
if (tag == context->unknown_tag) {
502
context->current_attribute_number = 1; /* anything !=INVALID */
508
** Binary search for attribute name.
510
for (low = 0, high = tag->number_of_attributes;
512
diff < 0 ? (low = i+1) : (high = i)) {
513
i = (low + (high-low)/2);
514
diff = my_casecomp(attributes[i].name, s);
515
if (diff == 0) { /* success: found it */
516
context->current_attribute_number = i;
519
attr_is_name = (BOOL) (attributes[i].type == HTMLA_ANAME);
520
attr_is_href = (BOOL) (attributes[i].type == HTMLA_HREF);
524
context->present[i] = YES;
525
Clear_extra(context->value[i]);
526
#ifdef USE_COLOR_STYLE
527
# ifdef USE_PRETTYSRC
528
current_is_class = IS_C(attributes[i]);
530
current_is_class = (!strcasecomp("class", s));
532
CTRACE((tfp, "SGML: found attribute %s, %d\n", s, current_is_class));
540
CTRACE((tfp, "SGML: Unknown attribute %s for tag %s\n",
541
s, context->current_tag->name));
542
context->current_attribute_number = INVALID; /* Invalid */
546
/* Handle attribute value
547
** ----------------------
549
PRIVATE void handle_attribute_value ARGS2(
553
if (context->current_attribute_number != INVALID) {
554
StrAllocCopy_extra(context->value[context->current_attribute_number], s);
555
#ifdef USE_COLOR_STYLE
556
if (current_is_class)
558
strncpy (class_string, s, TEMPSTRINGSIZE);
559
CTRACE((tfp, "SGML: class is '%s'\n", s));
563
CTRACE((tfp, "SGML: attribute value is '%s'\n", s));
567
CTRACE((tfp, "SGML: Attribute value %s ***ignored\n", s));
569
context->current_attribute_number = INVALID; /* can't have two assignments! */
574
** Translate some Unicodes to Lynx special codes and output them.
575
** Special codes - those whose output depends on parsing.
577
** Additional issue, like handling bidirectional text if necessary
578
** may be called from here: zwnj (8204), zwj (8205), lrm (8206), rlm (8207)
579
** - currently they are ignored in SGML.c and LYCharUtils.c
580
** but also in UCdomap.c because they are non printable...
583
PRIVATE BOOL put_special_unicodes ARGS2(
588
if (context->no_lynx_specialcodes) {
590
** We were asked by a "DTD" flag to not generate lynx specials. - kw
595
if (code == CH_NBSP) { /* S/390 -- gil -- 0657 */
597
** Use Lynx special character for nbsp.
602
PUTC(HT_NON_BREAK_SPACE);
603
} else if (code == CH_SHY) {
605
** Use Lynx special character for shy.
610
PUTC(LY_SOFT_HYPHEN);
611
} else if (code == 8194 || code == 8201) {
613
** Use Lynx special character for ensp or thinsp.
615
** Originally, Lynx uses space '32' as word delimiter and omits this
616
** space at end of line if word is wrapped to the next line. There
617
** are several other spaces in the Unicode repertoire and we should
618
** teach Lynx to understand them, not only as regular characters but
619
** in the context of line wrapping. Unfortunately, if we use
620
** HT_EN_SPACE we override the chartrans tables for those spaces
621
** with a single '32' for all (but do line wrapping more fancy).
623
** We may treat emsp as one or two ensp (below).
629
} else if (code == 8195) {
631
** Use Lynx special character for emsp.
636
/* PUTC(HT_EN_SPACE); let's stay with a single space :) */
643
** Return NO if nothing done.
648
** We have handled it.
654
PRIVATE void put_pretty_entity ARGS2(HTStream *, context, int, term)
664
PRIVATE void put_pretty_number ARGS1(HTStream *, context)
667
PUTS( (context->isHex ? "&#x" : "&#") );
672
#endif /* USE_PRETTYSRC */
678
** s contains the entity name zero terminated
680
** If the entity name is unknown, the terminator is treated as
681
** a printable non-special character in all cases, even if it is '<'
683
** Modified SGML_character() so we only come here with terminator
684
** as '\0' and check a FoundEntity flag. -- Foteos Macrides
686
** Modified more (for use with Lynx character translation code):
688
PRIVATE char replace_buf [64]; /* buffer for replacement strings */
689
PRIVATE BOOL FoundEntity = FALSE;
691
PRIVATE void handle_entity ARGS2(
697
CONST char *s = context->string->data;
700
** Handle all entities normally. - FM
703
if ((code = HTMLGetEntityUCValue(s)) != 0) {
705
** We got a Unicode value for the entity name.
706
** Check for special Unicodes. - FM
708
if (put_special_unicodes(context, code)) {
711
put_pretty_entity(context, term);
718
** Seek a translation from the chartrans tables.
720
if ((uck = UCTransUniChar(code, context->outUCLYhndl)) >= 32 &&
721
/* =============== work in ASCII below here =============== S/390 -- gil -- 0672 */
724
uck >= LYlowest_eightbit[context->outUCLYhndl])) {
727
put_pretty_entity(context, term);
730
PUTC(FROMASCII((char)uck));
733
} else if ((uck == -4 ||
734
(context->T.repl_translated_C0 &&
735
uck > 0 && uck < 32)) &&
737
** Not found; look for replacement string.
739
(uck = UCTransUniCharStr(replace_buf, 60, code,
740
context->outUCLYhndl, 0) >= 0)) {
743
put_pretty_entity(context, term);
751
** If we're displaying UTF-8, try that now. - FM
753
#ifndef USE_PRETTYSRC
754
if (context->T.output_utf8 && PUTUTF8(code)) {
759
if (context->T.output_utf8 && (psrc_view ?
760
(UCPutUtf8_charstring((HTStream *)context->target,
761
(putc_func_t*)(fake_put_character), code)): PUTUTF8(code) ) ) {
764
put_pretty_entity(context, term);
772
** If it's safe ASCII, use it. - FM
774
if (code >= 32 && code < 127) {
777
put_pretty_entity(context, term);
781
PUTC(FROMASCII((char)code));
785
/* =============== work in ASCII above here =============== S/390 -- gil -- 0682 */
787
** Ignore zwnj (8204) and zwj (8205), if we get to here.
788
** Note that zwnj may have been handled as <WBR>
789
** by the calling function. - FM
791
if (!strcmp(s, "zwnj") ||
793
CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s));
796
put_pretty_entity(context, term);
803
** Ignore lrm (8206), and rlm (8207), if we get to here. - FM
805
if (!strcmp(s, "lrm") ||
807
CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s));
810
put_pretty_entity(context, term);
819
** If entity string not found, display as text.
825
CTRACE((tfp, "SGML: Unknown entity '%s' %ld %ld\n", s, (long)code, uck)); /* S/390 -- gil -- 0695 */
840
PRIVATE void handle_comment ARGS1(
843
CONST char *s = context->string->data;
845
CTRACE((tfp, "SGML Comment:\n<%s>\n", s));
847
if (context->csi == NULL &&
848
strncmp(s, "!--#", 4) == 0 &&
849
LYCheckForCSI(context->node_anchor, (char **)&context->url) == TRUE) {
850
LYDoCSI(context->url, s, (char **)&context->csi);
852
LYCommentHacks(context->node_anchor, context->string->data);
862
PRIVATE void handle_identifier ARGS1(
865
CONST char *s = context->string->data;
867
CTRACE((tfp, "SGML Identifier:\n<%s>\n", s));
876
PRIVATE void handle_doctype ARGS1(
879
CONST char *s = context->string->data;
881
CTRACE((tfp, "SGML Doctype:\n<%s>\n", s));
886
PRIVATE void SGML_write PARAMS((
894
PRIVATE void handle_marked ARGS1(
897
CONST char *s = context->string->data;
899
CTRACE((tfp, "SGML Marked Section:\n<%s>\n", s));
901
if (!strncmp(context->string->data, "![INCLUDE[", 10)) {
902
context->string->data[context->string->size - 3] = '\0';
903
StrAllocCat(context->include, context->string->data + 10);
904
/* @@@ This needs to take charset into account! @@@
905
the wrong assumptions will be made about the data's
906
charset once it is in include - kw */
908
} else if (!strncmp(context->string->data, "![CDATA[", 8)) {
909
(*context->actions->_write)(context->target,
910
context->string->data + 8,
911
context->string->size - 11);
921
PRIVATE void handle_sgmlent ARGS1(
924
CONST char *s = context->string->data;
926
CTRACE((tfp, "SGML Entity Declaration:\n<%s>\n", s));
935
PRIVATE void handle_sgmlele ARGS1(
938
CONST char *s = context->string->data;
940
CTRACE((tfp, "SGML Element Declaration:\n<%s>\n", s));
949
PRIVATE void handle_sgmlatt ARGS1(
952
CONST char *s = context->string->data;
954
CTRACE((tfp, "SGML Attribute Declaration:\n<%s>\n", s));
960
* Convenience macros - tags (elements) are identified sometimes
961
* by an int or enum value ('TAGNUM'), sometimes
962
* by a pointer to HTTag ('TAGP'). - kw
964
/* 'TAGNUM' (index into context->dtd->tags) of the tag pointed to by 't'.
 * Needs a variable named 'context' in scope.  't' is parenthesized so that
 * compound expressions (e.g. a conditional) subtract as a whole. */
#define TAGNUM_OF_TAGP(t) ((t) - context->dtd->tags)
965
/* 'TAGP' (pointer into context->dtd->tags) for the tag number 'e'.
 * Needs a variable named 'context' in scope.  'e' is parenthesized so that
 * low-precedence argument expressions are added as a whole. */
#define TAGP_OF_TAGNUM(e) (context->dtd->tags + (e))
968
* The following implement special knowledge about OBJECT.
969
* As long as HTML_OBJECT is the only tag for which an alternative
970
* variant exists, they can be simple macros. - kw
972
/* does 'TAGNUM' e have an alternative (variant) parsing mode? */
973
/* Nonzero only for HTML_OBJECT, the single tag number this macro treats as
 * having an alternative variant.  'e' is parenthesized so that operators
 * binding looser than '==' inside the argument are evaluated first. */
#define HAS_ALT_TAGNUM(e) ((e) == HTML_OBJECT)
975
/* return 'TAGNUM' of the alternative mode for 'TAGNUM' e, if any. */
976
/* Maps HTML_OBJECT to HTML_ALT_OBJECT; every other tag number is returned
 * unchanged.  'e' is parenthesized at each use; note that it is still
 * evaluated twice, so do not pass an argument with side effects. */
#define ALT_TAGNUM(e) (((e) == HTML_OBJECT) ? HTML_ALT_OBJECT : (e))
978
/* return 'TAGNUM' of the normal mode for 'TAGNUM' e which may be alt. */
979
/* Maps any tag number at or beyond HTML_ELEMENTS back to HTML_OBJECT; smaller
 * tag numbers are returned unchanged.  'e' is parenthesized at each use; note
 * that it is still evaluated twice, so do not pass an argument with side
 * effects. */
#define NORMAL_TAGNUM(e) (((e) >= HTML_ELEMENTS) ? HTML_OBJECT : (e))
981
/* More convenience stuff. - kw */
982
#define ALT_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(ALT_TAGNUM(e))
983
#define NORMAL_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(NORMAL_TAGNUM(e))
985
#define ALT_TAGP(t) ALT_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
986
#define NORMAL_TAGP(t) NORMAL_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
989
#ifdef EXTENDED_HTMLDTD
991
PRIVATE BOOL element_valid_within ARGS3(
993
HTTag *, stacked_tag,
996
TagClass usecontains, usecontained;
997
if (!stacked_tag || !new_tag)
999
usecontains = (direct ? stacked_tag->contains : stacked_tag->icontains);
1000
usecontained = (direct ? new_tag->contained : new_tag->icontained);
1001
if (new_tag == stacked_tag)
1002
return (BOOL) ((Tgc_same & usecontains) &&
1003
(Tgc_same & usecontained));
1005
return (BOOL) ((new_tag->tagclass & usecontains) &&
1006
(stacked_tag->tagclass & usecontained));
1015
PRIVATE canclose_t can_close ARGS2(
1017
HTTag *, stacked_tag)
1021
if (stacked_tag->flags & Tgf_endO)
1023
else if (new_tag == stacked_tag)
1024
return ((Tgc_same & new_tag->canclose) ? close_error : close_NO);
1026
return ((stacked_tag->tagclass & new_tag->canclose) ?
1027
close_error : close_NO);
1030
PRIVATE void do_close_stacked ARGS1(
1031
HTStream *, context)
1033
HTElement * stacked = context->element_stack;
1036
return; /* stack was empty */
1037
if (context->inSELECT && !strcasecomp(stacked->tag->name, "SELECT")) {
1038
context->inSELECT = FALSE;
1040
e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(stacked->tag));
1041
#ifdef USE_PRETTYSRC
1042
if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */
1044
(*context->actions->end_element)(
1047
(char **)&context->include);
1048
context->element_stack = stacked->next;
1050
context->no_lynx_specialcodes = context->element_stack ?
1051
(context->element_stack->tag->flags & Tgf_nolyspcl) : NO;
1054
PRIVATE int is_on_stack ARGS2(
1055
HTStream *, context,
1058
HTElement * stacked = context->element_stack;
1060
for (; stacked; stacked = stacked->next, i++) {
1061
if (stacked->tag == old_tag ||
1062
stacked->tag == ALT_TAGP(old_tag))
1067
#endif /* EXTENDED_HTMLDTD */
1072
PRIVATE void end_element ARGS2(
1073
HTStream *, context,
1076
#ifdef EXTENDED_HTMLDTD
1078
BOOL extra_action_taken = NO;
1079
canclose_t canclose_check = close_valid;
1080
int stackpos = is_on_stack(context, old_tag);
1083
while (canclose_check != close_NO &&
1084
context->element_stack &&
1085
(stackpos > 1 || (!extra_action_taken && stackpos == 0))) {
1086
if (stackpos == 0 && (old_tag->flags & Tgf_startO) &&
1087
element_valid_within(old_tag, context->element_stack->tag, YES)) {
1088
CTRACE((tfp, "SGML: </%s> ignored\n", old_tag->name));
1091
canclose_check = can_close(old_tag, context->element_stack->tag);
1092
if (canclose_check != close_NO) {
1093
CTRACE((tfp, "SGML: End </%s> \t<- %s end </%s>\n",
1094
context->element_stack->tag->name,
1095
canclose_check == close_valid ? "supplied," : "***forced by",
1097
do_close_stacked(context);
1098
extra_action_taken = YES;
1099
stackpos = is_on_stack(context, old_tag);
1103
if (stackpos == 0 && old_tag->contents != SGML_EMPTY) {
1104
CTRACE((tfp, "SGML: Still open %s, ***no open %s for </%s>\n",
1105
context->element_stack ?
1106
context->element_stack->tag->name : "none",
1112
CTRACE((tfp, "SGML: Nesting <%s>...<%s> \t<- ***invalid end </%s>\n",
1114
context->element_stack->tag->name,
1119
/* Now let the non-extended code deal with the rest. - kw */
1121
#endif /* EXTENDED_HTMLDTD */
1124
** If we are in a SELECT block, ignore anything
1125
** but a SELECT end tag. - FM
1127
if (context->inSELECT) {
1128
if (!strcasecomp(old_tag->name, "SELECT")) {
1130
** Turn off the inSELECT flag and fall through. - FM
1132
context->inSELECT = FALSE;
1135
** Ignore the end tag. - FM
1137
CTRACE((tfp, "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
1143
** Handle the end tag. - FM
1145
CTRACE((tfp, "SGML: End </%s>\n", old_tag->name));
1146
if (old_tag->contents == SGML_EMPTY) {
1147
CTRACE((tfp, "SGML: ***Illegal end tag </%s> found.\n",
1151
#ifdef WIND_DOWN_STACK
1152
while (context->element_stack) /* Loop is error path only */
1154
if (context->element_stack) /* Substitute and remove one stack element */
1155
#endif /* WIND_DOWN_STACK */
1159
HTElement * N = context->element_stack;
1160
HTTag * t = (N->tag != old_tag) ? NORMAL_TAGP(N->tag) : N->tag;
1162
if (old_tag != t) { /* Mismatch: syntax error */
1163
if (context->element_stack->next) { /* This is not the last level */
1164
CTRACE((tfp, "SGML: Found </%s> when expecting </%s>. </%s> ***assumed.\n",
1165
old_tag->name, t->name, t->name));
1166
} else { /* last level */
1167
CTRACE((tfp, "SGML: Found </%s> when expecting </%s>. </%s> ***Ignored.\n",
1168
old_tag->name, t->name, old_tag->name));
1169
return; /* Ignore */
1173
e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(t));
1174
CTRACE2(TRACE_SGML, (tfp, "tagnum(%p) = %d\n", t, e));
1175
#ifdef USE_PRETTYSRC
1176
if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */
1178
status = (*context->actions->end_element)(context->target,
1179
e, (char **)&context->include);
1180
if (status == HT_PARSER_REOPEN_ELT) {
1181
CTRACE((tfp, "SGML: Restart <%s>\n", t->name));
1182
(*context->actions->start_element)(
1187
context->current_tag_charset,
1188
(char **)&context->include);
1189
} else if (status == HT_PARSER_OTHER_CONTENT) {
1190
CTRACE((tfp, "SGML: Continue with other content model for <%s>\n", t->name));
1191
context->element_stack->tag = ALT_TAGP_OF_TAGNUM(e);
1193
context->element_stack = N->next; /* Remove from stack */
1196
context->no_lynx_specialcodes = context->element_stack ?
1197
(context->element_stack->tag->flags & Tgf_nolyspcl) : NO;
1198
#ifdef WIND_DOWN_STACK
1200
return; /* Correct sequence */
1203
#endif /* WIND_DOWN_STACK */
1205
/* Syntax error path only */
1208
CTRACE((tfp, "SGML: Extra end tag </%s> found and ignored.\n",
1215
PRIVATE void start_element ARGS1(
1216
HTStream *, context)
1219
HTTag * new_tag = context->current_tag;
1220
HTMLElement e = TAGNUM_OF_TAGP(new_tag);
1223
#ifdef EXTENDED_HTMLDTD
1226
BOOL direct_container = YES;
1227
BOOL extra_action_taken = NO;
1228
canclose_t canclose_check = close_valid;
1231
while (context->element_stack &&
1232
(canclose_check == close_valid ||
1233
(canclose_check == close_error &&
1234
new_tag == context->element_stack->tag)) &&
1235
!(valid = element_valid_within(new_tag, context->element_stack->tag,
1236
direct_container))) {
1237
canclose_check = can_close(new_tag, context->element_stack->tag);
1238
if (canclose_check != close_NO) {
1239
CTRACE((tfp, "SGML: End </%s> \t<- %s start <%s>\n",
1240
context->element_stack->tag->name,
1241
canclose_check == close_valid ? "supplied," : "***forced by",
1243
do_close_stacked(context);
1244
extra_action_taken = YES;
1245
if (canclose_check == close_error)
1246
direct_container = NO;
1248
CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n",
1249
context->element_stack->tag->name,
1253
if (context->element_stack && !valid &&
1254
(context->element_stack->tag->flags & Tgf_strict) &&
1255
!(valid = element_valid_within(new_tag, context->element_stack->tag,
1256
direct_container))) {
1257
CTRACE((tfp, "SGML: Still open %s \t<- ***ignoring start <%s>\n",
1258
context->element_stack->tag->name,
1263
if (context->element_stack && !extra_action_taken &&
1264
canclose_check == close_NO && !valid && (new_tag->flags & Tgf_mafse)) {
1265
BOOL has_attributes = NO;
1267
for (; i< new_tag->number_of_attributes && !has_attributes; i++)
1268
has_attributes = context->present[i];
1269
if (!has_attributes) {
1270
CTRACE((tfp, "SGML: Still open %s, ***converting invalid <%s> to </%s>\n",
1271
context->element_stack->tag->name,
1274
end_element(context, new_tag);
1279
if (context->element_stack &&
1280
canclose_check == close_error && !(valid =
1281
element_valid_within(
1283
context->element_stack->tag,
1284
direct_container))) {
1285
CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n",
1286
context->element_stack->tag->name,
1290
/* Fall through to the non-extended code - kw */
1292
#endif /* EXTENDED_HTMLDTD */
1295
** If we are not in a SELECT block, check if this is
1296
** a SELECT start tag. Otherwise (i.e., we are in a
1297
** SELECT block) accept only OPTION as valid, terminate
1298
** the SELECT block if it is any other form-related
1299
** element, and otherwise ignore it. - FM
1301
if (!context->inSELECT) {
1303
** We are not in a SELECT block, so check if this starts one. - FM
1306
/* my_casecomp() - optimized by the first character */
1307
if (!my_casecomp(new_tag->name, "SELECT")) {
1309
** Set the inSELECT flag and fall through. - FM
1311
context->inSELECT = TRUE;
1315
** We are in a SELECT block. - FM
1317
if (strcasecomp(new_tag->name, "OPTION")) {
1319
** Ugh, it is not an OPTION. - FM
1322
case HTML_INPUT: case HTML_TEXTAREA: case HTML_SELECT:
1323
case HTML_BUTTON: case HTML_FIELDSET: case HTML_LABEL:
1324
case HTML_LEGEND: case HTML_FORM:
1333
** It is another form-related start tag, so terminate
1334
** the current SELECT block and fall through. - FM
1336
CTRACE((tfp, "SGML: ***Faking SELECT end tag before <%s> start tag.\n",
1338
end_element(context, SGMLFindTag(context->dtd, "SELECT"));
1341
** Ignore the start tag. - FM
1343
CTRACE((tfp, "SGML: ***Ignoring start tag <%s> in SELECT block.\n",
1350
** Handle the start tag. - FM
1352
CTRACE((tfp, "SGML: Start <%s>\n", new_tag->name));
1353
status = (*context->actions->start_element)(
1355
TAGNUM_OF_TAGP(new_tag),
1357
(CONST char**) context->value, /* coerce type for think c */
1358
context->current_tag_charset,
1359
(char **)&context->include);
1360
if (status == HT_PARSER_OTHER_CONTENT)
1361
new_tag = ALT_TAGP(new_tag); /* this is only returned for OBJECT */
1362
if (new_tag->contents != SGML_EMPTY) { /* i.e., tag not empty */
1363
HTElement * N = pool_alloc();
1365
outofmem(__FILE__, "start_element");
1366
N->next = context->element_stack;
1368
context->element_stack = N;
1369
context->no_lynx_specialcodes = (new_tag->flags & Tgf_nolyspcl);
1371
} else if (e == HTML_META ) {
1373
** Check for result of META tag. - KW & FM
1375
change_chartrans_handling(context);
1380
/* Find Tag in DTD tag list
1381
** ------------------------
1384
** dtd points to dtd structure including valid tag list
1385
** string points to name of tag in question
1389
** NULL tag not found
1390
** else address of tag structure in dtd
1392
PUBLIC HTTag * SGMLFindTag ARGS2(
1393
CONST SGML_dtd*, dtd,
1396
int high, low, i, diff;
1397
static HTTag* last[64] = {NULL}; /*optimize using the previous results*/
1398
HTTag** res = last + (UCH(*s) % 64); /*pointer arithmetic*/
1400
if (*res && !strcasecomp((*res)->name, s))
1403
for (low = 0, high=dtd->number_of_tags;
1405
diff < 0 ? (low = i+1) : (high = i)) { /* Binary search */
1406
i = (low + (high-low)/2);
1407
/* my_casecomp() - optimized by the first character, NOT_ASCII ok */
1408
diff = my_casecomp(dtd->tags[i].name, s); /* Case insensitive */
1409
if (diff == 0) { /* success: found it */
1410
*res = &dtd->tags[i];
1414
if (IsNmStart(*s)) {
1416
** Unrecognized, but may be valid. - KW
1418
return &HTTag_unrecognized;
1423
/*________________________________________________________________________
1428
/* Could check that we are back to bottom of stack! @@ */
1429
/* Do check! - FM */
1431
PRIVATE void SGML_free ARGS1(
1432
HTStream *, context)
1439
** Free the buffers. - FM
1441
FREE(context->recover);
1444
FREE(context->include);
1445
FREE(context->active_include);
1448
** Wind down stack if any elements are open. - FM
1450
while (context->element_stack) {
1451
cur = context->element_stack;
1453
context->element_stack = cur->next; /* Remove from stack */
1455
#ifdef USE_PRETTYSRC
1456
if (!psrc_view) /* Don't actually call on target if viewing psrc - kw */
1458
(*context->actions->end_element)(context->target,
1459
NORMAL_TAGNUM(TAGNUM_OF_TAGP(t)),
1460
(char **)&context->include);
1461
FREE(context->include);
1465
** Finish off the target. - FM
1467
(*context->actions->_free)(context->target);
1470
** Free the strings and context structure. - FM
1472
HTChunkFree(context->string);
1473
for (i = 0; i < MAX_ATTRIBUTES; i++)
1474
FREE_extra(context->value[i]);
1477
#ifdef USE_PRETTYSRC
1478
sgml_in_psrc_was_initialized = FALSE;
1482
/*
**  SGML_abort - abandon a parser context.
**
**  context: the HTStream being released.
**  e:       passed through unchanged to the target's _abort action.
**           NOTE(review): its parameter line is not visible in this
**           span - confirm its type against the full file.
**
**  The target is aborted first; afterwards the recover / include /
**  active_include buffers, the remaining element_stack entries, the
**  string chunk and every value[] slot are released.  In the lines
**  visible here the stack loop only detaches entries; no end_element
**  call is made on the target.
*/
PRIVATE void SGML_abort ARGS2(
1483
HTStream *, context,
1490
** Abort the target. - FM
1492
(*context->actions->_abort)(context->target, e);
1495
** Free the buffers. - FM
1497
FREE(context->recover);
1498
FREE(context->include);
1499
FREE(context->active_include);
1504
** Free stack memory if any elements were left open. - KW
1506
/* Each pass detaches the top entry by advancing element_stack to cur->next. */
while (context->element_stack) {
1507
cur = context->element_stack;
1508
context->element_stack = cur->next; /* Remove from stack */
1513
** Free the strings and context structure. - FM
1515
HTChunkFree(context->string);
1516
/* One FREE_extra call per attribute slot, MAX_ATTRIBUTES in total. */
for (i = 0; i < MAX_ATTRIBUTES; i++)
1517
FREE_extra(context->value[i]);
1520
#ifdef USE_PRETTYSRC
1521
/* Clear the pretty-source initialisation flag (USE_PRETTYSRC builds only). */
sgml_in_psrc_was_initialized = FALSE;
1526
/* Read and write user callback handle
1527
** -----------------------------------
1529
** The callbacks from the SGML parser have an SGML context parameter.
1530
** These calls allow the caller to associate his own context with a
1531
** particular SGML context.
1535
/*
**  SGML_callerData - read accessor for the caller's private handle.
**
**  context: the parser context to query.
**
**  Returns the value currently held in context->callerData.
*/
PUBLIC void* SGML_callerData ARGS1(
1536
HTStream *, context)
1538
return context->callerData;
1541
/*
**  SGML_setCallerData - write accessor for the caller's private handle.
**
**  context: the parser context to update.
**  data:    value stored into context->callerData.
**           NOTE(review): its parameter line is not visible in this
**           span - confirm its type against the full file.
*/
PUBLIC void SGML_setCallerData ARGS2(
1542
HTStream *, context,
1545
context->callerData = data;
1547
#endif /* CALLERDATA */
1549
PRIVATE void SGML_character ARGS2(
1550
HTStream *, context,
1553
CONST SGML_dtd *dtd = context->dtd;
1554
HTChunk *string = context->string;
1555
CONST char * EntityName;
1556
HTTag * testtag = NULL;
1557
BOOLEAN chk; /* Helps (?) walk through all the else ifs... */
1558
UCode_t clong, uck = 0; /* Enough bits for UCS4 ... */
1565
char saved_char_in = '\0';
1568
** Now some fun with the preprocessor.
1569
** Use copies for c and unsign_c == clong, so that
1570
** we can revert back to the unchanged c_in. - KW
1572
#define unsign_c clong
1575
clong = UCH(c); /* a.k.a. unsign_c */
1577
if (context->T.decode_utf8) {
1579
** Combine UTF-8 into Unicode.
1580
** Incomplete characters silently ignored.
1581
** From Linux kernel's console.c. - KW
1583
if (TOASCII(UCH(c)) > 127) { /* S/390 -- gil -- 0710 */
1585
** We have an octet from a multibyte character. - FM
1587
if (context->utf_count > 0 && (TOASCII(c) & 0xc0) == 0x80) {
1588
context->utf_char = (context->utf_char << 6) | (TOASCII(c) & 0x3f);
1589
context->utf_count--;
1590
*(context->utf_buf_p) = c;
1591
(context->utf_buf_p)++;
1592
if (context->utf_count == 0) {
1594
** We have all of the bytes, so terminate
1595
** the buffer and set 'clong' to the UCode_t
1598
*(context->utf_buf_p) = '\0';
1599
clong = context->utf_char;
1601
c = ((char)(clong & 0xff));
1606
** Wait for more. - KW
1612
** Start handling a new multibyte character. - FM
1614
context->utf_buf_p = context->utf_buf;
1615
*(context->utf_buf_p) = c;
1616
(context->utf_buf_p)++;
1617
if ((c & 0xe0) == 0xc0) {
1618
context->utf_count = 1;
1619
context->utf_char = (c & 0x1f);
1620
} else if ((c & 0xf0) == 0xe0) {
1621
context->utf_count = 2;
1622
context->utf_char = (c & 0x0f);
1623
} else if ((c & 0xf8) == 0xf0) {
1624
context->utf_count = 3;
1625
context->utf_char = (c & 0x07);
1626
} else if ((c & 0xfc) == 0xf8) {
1627
context->utf_count = 4;
1628
context->utf_char = (c & 0x03);
1629
} else if ((c & 0xfe) == 0xfc) {
1630
context->utf_count = 5;
1631
context->utf_char = (c & 0x01);
1636
context->utf_count = 0;
1637
context->utf_buf_p = context->utf_buf;
1638
*(context->utf_buf_p) = '\0';
1641
** Wait for more. - KW
1647
** Got an ASCII char. - KW
1649
context->utf_count = 0;
1650
context->utf_buf_p = context->utf_buf;
1651
*(context->utf_buf_p) = '\0';
1654
} /* end of context->T.decode_utf8 S/390 -- gil -- 0726 */
1658
** If we have a koi8-r input and do not have
1659
** koi8-r as the output, save the raw input
1660
** in saved_char_in before we potentially
1661
** convert it to Unicode. - FM
1663
if (context->T.strip_raw_char_in)
1665
#endif /* NOTDEFINED */
1668
** If we want the raw input converted
1669
** to Unicode, try that now. - FM
1671
if (context->T.trans_to_uni &&
1672
((TOASCII(unsign_c) >= LYlowest_eightbit[context->inUCLYhndl]) || /* S/390 -- gil -- 0744 */
1673
(unsign_c < ' ' && unsign_c != 0 &&
1674
context->T.trans_C0_to_uni))) {
1676
** Convert the octet to Unicode. - FM
1678
clong = UCTransToUni(c, context->inUCLYhndl);
1682
c = FROMASCII((char)clong);
1686
} else if (unsign_c < ' ' && unsign_c != 0 && /* S/390 -- gil -- 0768 */
1687
context->T.trans_C0_to_uni) {
1689
** This else if may be too ugly to keep. - KW
1691
if (context->T.trans_from_uni &&
1692
(((clong = UCTransToUni(c, context->inUCLYhndl)) >= ' ') ||
1693
(context->T.transp &&
1694
(clong = UCTransToUni(c, context->inUCLYhndl)) > 0))) {
1697
c = FROMASCII((char)clong);
1702
if (context->T.transp) {
1703
uck = UCTransCharStr(replace_buf, 60, c,
1704
context->inUCLYhndl,
1705
context->inUCLYhndl, NO);
1707
if (!context->T.transp || uck < 0) {
1708
uck = UCTransCharStr(replace_buf, 60, c,
1709
context->inUCLYhndl,
1710
context->outUCLYhndl, YES);
1714
} else if (uck < 0) {
1718
if (c && replace_buf[1]) {
1719
if (context->state == S_text) {
1723
StrAllocCat(context->recover, replace_buf + 1);
1726
} /* Next line end of ugly stuff for C0. - KW */
1727
} else { /* end of context->T.trans_to_uni S/390 -- gil -- 0791 */
1732
** At this point we have either unsign_c a.k.a. clong in
1733
** Unicode (and c in latin1 if clong is in the latin1 range),
1734
** or unsign_c and c will have to be passed raw. - KW
1737
** We jump up to here from below if we have
1738
** stuff in the recover, insert, or csi buffers
1739
** to process. We zero saved_char_in, in effect
1740
** as a flag that the octet is not that of the
1741
** actual call to this function. This may be OK
1742
** for now, for the stuff this function adds to
1743
** its recover buffer, but it might not be for
1744
** stuff other functions added to the insert or
1745
** csi buffer, so bear that in mind. - FM
1746
** Stuff from the recover buffer is now handled
1747
** as UTF-8 if we can expect that's what it is,
1748
** and in that case we don't come back up here. - kw
1751
saved_char_in = '\0';
1753
** We jump to here from above when we don't have
1754
** UTF-8 input, haven't converted to Unicode, and
1755
** want clong set to the input octet (unsigned)
1756
** without zeroing its saved_char_in copy (which
1760
*(context->utf_buf) = '\0';
1763
** We jump to here from above if we have converted
1764
** the input, or a multibyte sequence across calls,
1765
** to a Unicode value and loaded it into clong (to
1766
** which unsign_c has been defined), and from below
1767
** when we are recycling a character (e.g., because
1768
** it terminated an entity but is not the standard
1769
** semi-colon). The character will already have
1770
** been put through the Unicode conversions. - FM
1774
** Ignore low ISO 646 7-bit control characters
1775
** if HTCJK is not set. - FM
1778
** Works for both ASCII and EBCDIC. -- gil
1779
*/ /* S/390 -- gil -- 0811 */
1780
if (TOASCII(unsign_c) < 32 &&
1781
c != '\t' && c != '\n' && c != '\r' &&
1786
** Ignore 127 if we don't have HTPassHighCtrlRaw
1787
** or HTCJK set. - FM
1789
#define PASSHICTRL (context->T.transp || \
1790
unsign_c >= LYlowest_eightbit[context->inUCLYhndl])
1791
if (TOASCII(c) == 127 && /* S/390 -- gil -- 0830 */
1792
!(PASSHICTRL || HTCJK != NOCJK))
1796
** Ignore 8-bit control characters 128 - 159 if
1797
** neither HTPassHighCtrlRaw nor HTCJK is set. - FM
1799
if (TOASCII(unsign_c) > 127 && TOASCII(unsign_c) < 160 && /* S/390 -- gil -- 0847 */
1800
!(PASSHICTRL || HTCJK != NOCJK))
1803
/* Almost all CJK characters are double byte but only Japanese
1804
* JIS X0201 Kana is single byte. To keep SGML parsing from failing
1805
* we have to take care of them here. -- TH
1807
if ((HTCJK==JAPANESE) && (context->state==S_in_kanji) &&
1808
!IS_JAPANESE_2BYTE(context->kanji_buf, UCH(c))) {
1809
#ifdef CONV_JISX0201KANA_JISX0208KANA
1810
if (IS_SJIS_X0201KANA(context->kanji_buf)) {
1811
unsigned char sjis_hi, sjis_lo;
1812
JISx0201TO0208_SJIS(context->kanji_buf, &sjis_hi, &sjis_lo);
1818
PUTC(context->kanji_buf);
1819
context->state = S_text;
1823
** Handle character based on context->state.
1825
CTRACE2(TRACE_SGML, (tfp, "SGML before %s|%.*s|%c|\n",
1826
state_name(context->state),
1828
NonNull(string->data),
1830
switch(context->state) {
1834
** Note that if we don't have a CJK input, then this
1835
** is not the second byte of a CJK di-byte, and we're
1836
** trashing the input. That's why 8-bit characters
1837
** followed by, for example, '<' can cause the tag to
1838
** be treated as text, not markup. We could try to deal
1839
** with it by holding each first byte and then checking
1840
** byte pairs, but that doesn't seem worth the overhead
1841
** (see below). - FM
1843
context->state = S_text;
1844
PUTC(context->kanji_buf);
1848
case S_tagname_slash:
1850
* We had something like "<name/" so far, set state to S_text
1851
* but keep context->slashedtag as a flag; except if we get
1852
* '>' directly after the "<name/", and really have a tag for
1853
* that name in context->slashedtag, in which case keep state as
1854
* is and let code below deal with it. - kw
1856
if (!(c == '>' && context->slashedtag && TOASCII(unsign_c) < 127)) {
1857
context->state = S_text;
1858
} /* fall through in any case! */
1861
if (HTCJK != NOCJK && (TOASCII(c) & 0200) != 0) { /* S/390 -- gil -- 0864 */
1863
** Setting up for Kanji multibyte handling (based on
1864
** Takuya ASADA's (asada@three-a.co.jp) CJK Lynx).
1865
** Note that if the input is not in fact CJK, the
1866
** next byte also will be mishandled, as explained
1867
** above. Toggle raw mode off in such cases, or
1868
** select the "7 bit approximations" display
1869
** character set, which is largely equivalent
1870
** to having raw mode off with CJK. - FM
1872
context->state = S_in_kanji;
1873
context->kanji_buf = c;
1875
} else if (HTCJK != NOCJK && TOASCII(c) == '\033') { /* S/390 -- gil -- 0881 */
1877
** Setting up for CJK escape sequence handling (based on
1878
** Takuya ASADA's (asada@three-a.co.jp) CJK Lynx). - FM
1880
context->state = S_esc;
1885
if (c == '&' || c == '<') {
1886
#ifdef USE_PRETTYSRC
1887
if (psrc_view) { /*there is nothing useful in the element_stack*/
1888
testtag = context->current_tag;
1892
testtag = context->element_stack ?
1893
context->element_stack->tag : NULL;
1897
if (c == '&' && TOASCII(unsign_c) < 127 && /* S/390 -- gil -- 0898 */
1899
(testtag->contents == SGML_MIXED ||
1900
testtag->contents == SGML_ELEMENT ||
1901
testtag->contents == SGML_PCDATA ||
1902
#ifdef USE_PRETTYSRC
1903
testtag->contents == SGML_EMPTY ||
1905
testtag->contents == SGML_RCDATA))) {
1907
** Setting up for possible entity, without the leading '&'. - FM
1910
context->state = S_ero;
1911
} else if (c == '<' && TOASCII(unsign_c) < 127) { /* S/390 -- gil -- 0915 */
1913
** Setting up for possible tag. - FM
1916
if (testtag && testtag->contents == SGML_PCDATA) {
1917
context->state = S_pcdata;
1918
} else if (testtag && (testtag->contents == SGML_LITTERAL
1919
|| testtag->contents == SGML_CDATA)) {
1920
context->state = S_litteral;
1921
} else if (testtag && (testtag->contents == SGML_SCRIPT)) {
1922
context->state = S_script;
1924
context->state = S_tag;
1926
context->slashedtag = NULL;
1927
} else if (context->slashedtag &&
1929
(c == '>' && context->state == S_tagname_slash)) &&
1930
TOASCII(unsign_c) < 127) {
1932
** We got either the second slash of a pending "<NAME/blah blah/"
1933
** shortref construct, or the '>' of a mere "<NAME/>". In both
1934
** cases generate a "</NAME>" end tag in the recover buffer for
1935
** reparsing unless NAME is really an empty element. - kw
1937
#ifdef USE_PRETTYSRC
1939
PSRCSTART(abracket);
1944
if (context->slashedtag != context->unknown_tag &&
1945
!ReallyEmptyTag(context->slashedtag)) {
1946
if (context->recover == NULL) {
1947
StrAllocCopy(context->recover, "</");
1948
context->recover_index = 0;
1950
StrAllocCat(context->recover, "</");
1952
StrAllocCat(context->recover, context->slashedtag->name);
1953
StrAllocCat(context->recover, ">");
1955
context->slashedtag = NULL;
1957
} else if (context->element_stack &&
1958
(context->element_stack->tag->flags & Tgf_frecyc)) {
1960
* The element stack says we are within the contents of an
1961
* element that the next stage (HTML.c) may want to feed
1962
* us back again (via the *include string). So try to output
1963
* text in UTF-8 if possible, using the same logic as for
1964
* attribute values (which should be in line with what
1965
* context->current_tag_charset indicates). - kw
1967
if (context->T.decode_utf8 &&
1968
*context->utf_buf) {
1969
PUTS(context->utf_buf);
1970
context->utf_buf_p = context->utf_buf;
1971
*(context->utf_buf_p) = '\0';
1972
} else if (HTCJK == NOCJK &&
1973
(context->T.output_utf8 ||
1974
context->T.trans_from_uni)) {
1975
if (LYIsASCII(clong)) {
1977
} else if (clong == 0xfffd && saved_char_in &&
1978
HTPassEightBitRaw &&
1979
UCH(saved_char_in) >=
1980
LYlowest_eightbit[context->outUCLYhndl]) {
1981
PUTUTF8((0xf000 | UCH(saved_char_in)));
1985
} else if (saved_char_in && context->T.use_raw_char_in) {
1986
PUTC(saved_char_in);
1991
#define PASS8859SPECL context->T.pass_160_173_raw
1993
** Convert 160 (nbsp) to Lynx special character if
1994
** neither HTPassHighCtrlRaw nor HTCJK is set. - FM
1996
} else if (unsign_c == CH_NBSP && /* S/390 -- gil -- 0932 */
1997
!context->no_lynx_specialcodes &&
1998
!(PASS8859SPECL || HTCJK != NOCJK)) {
1999
PUTC(HT_NON_BREAK_SPACE);
2001
** Convert 173 (shy) to Lynx special character if
2002
** neither HTPassHighCtrlRaw nor HTCJK is set. - FM
2004
} else if (unsign_c == CH_SHY && /* S/390 -- gil -- 0949 */
2005
!context->no_lynx_specialcodes &&
2006
!(PASS8859SPECL || HTCJK != NOCJK)) {
2007
PUTC(LY_SOFT_HYPHEN);
2009
** Handle the case in which we think we have a character
2010
** which doesn't need further processing (e.g., a koi8-r
2011
** input for a koi8-r output). - FM
2013
} else if (context->T.use_raw_char_in && saved_char_in) {
2015
** Only if the original character is still in saved_char_in,
2016
** otherwise we may be iterating from a goto top. - KW
2018
PUTC(saved_char_in);
2019
saved_char_in = '\0';
2020
/******************************************************************
2021
* I. LATIN-1 OR UCS2 TO DISPLAY CHARSET
2022
******************************************************************/
2023
} else if ((chk = (BOOL) (context->T.trans_from_uni && TOASCII(unsign_c) >= 160)) && /* S/390 -- gil -- 0968 */
2024
(uck = UCTransUniChar(unsign_c,
2025
context->outUCLYhndl)) >= ' ' &&
2027
CTRACE((tfp, "UCTransUniChar returned 0x%.2lX:'%c'.\n",
2028
uck, FROMASCII((char)uck)));
2030
** We got one octet from the conversions, so use it. - FM
2032
PUTC(FROMASCII((char)uck));
2035
(context->T.repl_translated_C0 &&
2036
uck > 0 && uck < 32))) &&
2038
** Not found; look for replacement string. - KW
2040
(uck = UCTransUniCharStr(replace_buf, 60, clong,
2041
context->outUCLYhndl,
2044
** Got a replacement string.
2045
** No further tests for validity - assume that whoever
2046
** defined replacement strings knew what she was doing. - KW
2050
** If we're displaying UTF-8, try that now. - FM
2052
} else if (context->T.output_utf8 && PUTUTF8(clong)) {
2053
; /* do nothing more */
2055
** If it's any other (> 160) 8-bit character, and
2056
** we have not set HTPassEightBitRaw nor HTCJK, nor
2057
** have the "ISO Latin 1" character set selected,
2058
** back translate for our character set. - FM
2060
#define IncludesLatin1Enc \
2061
(context->outUCLYhndl == LATIN1 || \
2062
(context->outUCI && \
2063
(context->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1))))
2065
#define PASSHI8BIT (HTPassEightBitRaw || \
2066
(context->T.do_8bitraw && !context->T.trans_from_uni))
2068
} else if (unsign_c > 160 && unsign_c < 256 &&
2069
!(PASSHI8BIT || HTCJK != NOCJK) &&
2070
!IncludesLatin1Enc) {
2071
#ifdef USE_PRETTYSRC
2072
int psrc_view_backup = 0;
2076
EntityName = HTMLGetEntityName((int)(unsign_c - 160));
2077
HTChunkPuts(string, EntityName);
2078
HTChunkTerminate(string);
2079
#ifdef USE_PRETTYSRC
2080
/* we need to disable it temporarily */
2082
psrc_view_backup =1; psrc_view =0;
2085
handle_entity(context, '\0');
2086
#ifdef USE_PRETTYSRC
2087
/* presumably undoes the temporary disabling, if it was done - confirm */
2088
if (psrc_view_backup)
2096
** If we get to here and have an ASCII char,
2097
** pass the character. - KW
2099
} else if (TOASCII(unsign_c) < 127 && unsign_c > 0) { /* S/390 -- gil -- 0987 */
2102
** If we get to here, and should have translated,
2103
** translation has failed so far. - KW
2105
** We should have sent UTF-8 output to the parser
2106
** already, but what the heck, try again. - FM
2108
} else if (context->T.output_utf8 && *context->utf_buf) {
2109
PUTS(context->utf_buf);
2110
context->utf_buf_p = context->utf_buf;
2111
*(context->utf_buf_p) = '\0';
2114
** Check for a strippable koi8-r 8-bit character. - FM
2116
} else if (context->T.strip_raw_char_in && saved_char_in &&
2117
(UCH(saved_char_in) >= 0xc0) &&
2118
(UCH(saved_char_in) < 255)) {
2120
** KOI8 special: strip high bit, gives (somewhat) readable
2121
** ASCII or KOI7 - it was constructed that way! - KW
2123
PUTC(((char)(saved_char_in & 0x7f)));
2124
saved_char_in = '\0';
2125
#endif /* NOTDEFINED */
2127
** If we don't actually want the character,
2128
** make it safe and output that now. - FM
2130
} else if (TOASCII(UCH(c)) < /* S/390 -- gil -- 0997 */
2131
LYlowest_eightbit[context->outUCLYhndl] ||
2132
(context->T.trans_from_uni && !HTPassEightBitRaw)) {
2134
** If we get to here, pass the character. - FM
2142
** Found '<' in SGML_PCDATA content; treat this mode nearly like
2143
** S_litteral, but recognize '<!' and '<?' to filter out comments
2144
** and processing instructions. - kw
2147
if (!string->size && TOASCII(unsign_c) < 127) { /* first after '<' */
2148
if (c == '!') { /* <! */
2150
** Terminate and set up for possible comment,
2151
** identifier, declaration, or marked section
2152
** as under S_tag. - kw
2154
context->state = S_exclamation;
2155
context->lead_exclamation = TRUE;
2156
context->doctype_bracket = FALSE;
2157
context->first_bracket = FALSE;
2158
HTChunkPutc(string, c);
2160
} else if (c == '?') { /* <? - ignore as a PI until '>' - kw */
2162
"SGML: Found PI in PCDATA, junking it until '>'\n"));
2163
#ifdef USE_PRETTYSRC
2165
PSRCSTART(abracket);
2168
context->seen_nonwhite_in_junk_tag = TRUE; /* show all */
2171
context->state = S_junk_pi;
2175
goto case_S_litteral;
2178
** Found '<' in SGML_SCRIPT content; treat this mode nearly like
2179
** S_litteral, but recognize '<!' to allow the content to be treated
2180
** as a comment by lynx.
2183
if (!string->size && TOASCII(unsign_c) < 127) { /* first after '<' */
2184
if (c == '!') { /* <! */
2186
** Terminate and set up for possible comment,
2187
** identifier, declaration, or marked section
2188
** as under S_tag. - kw
2190
context->state = S_exclamation;
2191
context->lead_exclamation = TRUE;
2192
context->doctype_bracket = FALSE;
2193
context->first_bracket = FALSE;
2194
HTChunkPutc(string, c);
2198
goto case_S_litteral;
2201
** In litteral mode, waits only for specific end tag (for
2202
** compatibility with old servers, and for Lynx). - FM
2205
case S_litteral: /*PSRC:this case not understood completely by HV, not done*/
2206
HTChunkPutc(string, c);
2207
#ifdef USE_PRETTYSRC
2208
if (psrc_view) { /*there is nothing useful in the element_stack*/
2209
testtag = context->current_tag;
2212
testtag = context->element_stack ?
2213
context->element_stack->tag : NULL;
2215
if (testtag == NULL) {
2217
context->state = S_text;
2222
* Normally when we get the closing ">",
2223
* testtag contains something like "TITLE"
2224
* string contains something like "/title>"
2225
* so we decrement by 2 to compare the final character of each.
2227
testlast = string->size - 2 - context->trailing_spaces - context->leading_spaces;
2229
if (TOUPPER(c) != ((testlast < 0)
2231
: testtag->name[testlast])) {
2235
** If complete match, end litteral.
2238
testlast >= 0 && !testtag->name[testlast]) {
2239
#ifdef USE_PRETTYSRC
2241
PSRCSTART(abracket);
2245
strcpy(string->data,context->current_tag->name);
2246
if (tagname_transform != 1) {
2247
if (tagname_transform == 0)
2248
LYLowerCase(string->data);
2250
LYUpperCase(string->data);
2254
PSRCSTART(abracket);
2258
context->current_tag = NULL;
2261
end_element(context, context->element_stack->tag);
2264
context->current_attribute_number = INVALID;
2265
context->state = S_text;
2266
context->leading_spaces = 0;
2267
context->trailing_spaces = 0;
2272
* Allow whitespace between the "<" or ">" and the keyword, for
2275
if (isspace(UCH(c))) {
2276
if (testlast == -1) {
2277
context->leading_spaces += 1;
2278
CTRACE2(TRACE_SGML, (tfp, "leading spaces: %d\n", context->leading_spaces));
2280
} else if (testlast > 0) {
2281
context->trailing_spaces += 1;
2282
CTRACE2(TRACE_SGML, (tfp, "trailing spaces: %d\n", context->trailing_spaces));
2288
* Mismatch - recover.
2290
context->leading_spaces = 0;
2291
context->trailing_spaces = 0;
2292
if (((testtag->contents != SGML_LITTERAL &&
2293
(testtag->flags & Tgf_strict)) ||
2294
(context->state == S_pcdata &&
2295
(testtag->flags & (Tgf_strict|Tgf_endO)))) &&
2297
(c == '>' || testlast > 0 || IsNmStart(c)))) {
2298
context->state = S_end;
2300
for (i = 0; i < string->size; i++) /* remove '/' */
2301
string->data[i] = string->data[i+1];
2302
if ((string->size == 1) ? IsNmStart(c) : IsNmChar(c))
2307
if (context->state == S_pcdata &&
2308
(testtag->flags & (Tgf_strict|Tgf_endO)) &&
2309
(testlast < 0 && IsNmStart(c))) {
2310
context->state = S_tag;
2314
** If Mismatch: recover string literally.
2317
for (i = 0; i < string->size-1; i++) /* recover, except last c */
2318
PUTC(string->data[i]);
2320
context->state = S_text;
2321
goto top1; /* to recover last c */
2326
** Character reference (numeric entity) or named entity.
2331
** Setting up for possible numeric entity.
2333
context->state = S_cro; /* &# is Char Ref Open */
2336
context->state = S_entity; /* Fall through! */
2339
** Handle possible named entity.
2342
if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1029 */
2343
isalnum(UCH(c)) : isalpha(UCH(c)))) {
2344
/* Should probably use IsNmStart/IsNmChar above (is that right?),
2345
but the world is not ready for that - there's &nbsp: (note
2346
colon!) and stuff around. */
2348
** Accept valid ASCII character. - FM
2350
HTChunkPutc(string, c);
2351
} else if (string->size == 0) {
2353
** It was an ampersand that's just text, so output
2354
** the ampersand and recycle this character. - FM
2356
#ifdef USE_PRETTYSRC
2361
#ifdef USE_PRETTYSRC
2365
context->state = S_text;
2369
** Terminate entity name and try to handle it. - FM
2371
HTChunkTerminate(string);
2372
#ifdef USE_PRETTYSRC
2373
entity_string = string->data;
2375
/* S/390 -- gil -- 1039 */
2376
/* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */
2377
if (!strcmp(string->data, "zwnj") &&
2378
(!context->element_stack ||
2379
(context->element_stack->tag &&
2380
context->element_stack->tag->contents == SGML_MIXED))) {
2382
** Handle zwnj (8204) as <WBR>. - FM
2386
CTRACE((tfp, "SGML_character: Handling 'zwnj' entity as 'WBR' element.\n"));
2389
sprintf(temp, "<WBR>%c", c);
2391
sprintf(temp, "<WBR>");
2393
if (context->recover == NULL) {
2394
StrAllocCopy(context->recover, temp);
2395
context->recover_index = 0;
2397
StrAllocCat(context->recover, temp);
2400
context->state = S_text;
2403
handle_entity(context, '\0');
2406
context->state = S_text;
2408
** Don't eat the terminator if we didn't find the
2409
** entity name and therefore sent the raw string
2410
** via handle_entity(), or if the terminator is
2411
** not the "standard" semi-colon for HTML. - FM
2413
#ifdef USE_PRETTYSRC
2414
if (psrc_view && FoundEntity && c == ';') {
2420
if (!FoundEntity || c != ';')
2426
** Check for a numeric entity.
2429
if (TOASCII(unsign_c) < 127 && TOLOWER(UCH(c)) == 'x') { /* S/390 -- gil -- 1060 */
2430
context->isHex = TRUE;
2431
context->state = S_incro;
2432
} else if (TOASCII(unsign_c) < 127 && isdigit(UCH(c))) {
2434
** Accept only valid ASCII digits. - FM
2436
HTChunkPutc(string, c); /* accumulate a character NUMBER */
2437
context->isHex = FALSE;
2438
context->state = S_incro;
2439
} else if (string->size == 0) {
2441
** No 'x' or digit following the "&#" so recover
2442
** them and recycle the character. - FM
2444
#ifdef USE_PRETTYSRC
2450
#ifdef USE_PRETTYSRC
2454
context->state = S_text;
2460
** Handle a numeric entity.
2463
/* S/390 -- gil -- 1075 */ /* CTRACE((tfp, "%s: %d: numeric %d %d\n",
2464
__FILE__, __LINE__, unsign_c, c)); */
2465
if ((TOASCII(unsign_c) < 127) &&
2466
(context->isHex ? isxdigit(UCH(c)) :
2469
** Accept only valid hex or ASCII digits. - FM
2471
HTChunkPutc(string, c); /* accumulate a character NUMBER */
2472
} else if (string->size == 0) {
2474
** No hex digit following the "&#x" so recover
2475
** them and recycle the character. - FM
2477
#ifdef USE_PRETTYSRC
2482
#ifdef USE_PRETTYSRC
2486
context->isHex = FALSE;
2487
context->state = S_text;
2491
** Terminate the numeric entity and try to handle it. - FM
2495
HTChunkTerminate(string);
2496
#ifdef USE_PRETTYSRC
2497
entity_string = string->data;
2499
if ((context->isHex ? sscanf(string->data, "%lx", &code) :
2500
sscanf(string->data, "%ld", &code)) == 1) {
2501
/* =============== work in ASCII below here =============== S/390 -- gil -- 1092 */
2503
(code > 127 && code < 156)) {
2505
** Assume these are Microsoft code points,
2506
** inflicted on us by FrontPage. - FM
2508
** MS FrontPage uses syntax like &#153; in 128-159 range
2509
** and doesn't follow Unicode standards for this area.
2510
** Windows-1252 codepoints are assumed here.
2515
** WHITE SMILING FACE
2521
** EURO currency sign
2527
** SINGLE LOW-9 QUOTATION MARK (sbquo)
2533
** DOUBLE LOW-9 QUOTATION MARK (bdquo)
2539
** HORIZONTAL ELLIPSIS (hellip)
2551
** DOUBLE DAGGER (Dagger)
2557
** PER MILLE SIGN (permil)
2563
** SINGLE LEFT-POINTING ANGLE QUOTATION MARK
2570
** LEFT SINGLE QUOTATION MARK (lsquo)
2576
** RIGHT SINGLE QUOTATION MARK (rsquo)
2582
** LEFT DOUBLE QUOTATION MARK (ldquo)
2588
** RIGHT DOUBLE QUOTATION MARK (rdquo)
2612
** SMALL TILDE (tilde)
2618
** TRADE MARK SIGN (trade)
2624
** SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
2631
** Do not attempt a conversion
2632
** to valid Unicode values.
2638
** Check for special values. - FM
2640
if ((code == 8204) &&
2641
(!context->element_stack ||
2642
(context->element_stack->tag &&
2643
context->element_stack->tag->contents == SGML_MIXED))) {
2645
** Handle zwnj (8204) as <WBR>. - FM
2649
CTRACE((tfp, "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n"));
2652
** Include the terminator if it is not
2653
** the standard semi-colon. - FM
2656
sprintf(temp, "<WBR>%c", c);
2658
sprintf(temp, "<WBR>");
2661
** Add the replacement string to the
2662
** recover buffer for processing. - FM
2664
if (context->recover == NULL) {
2665
StrAllocCopy(context->recover, temp);
2666
context->recover_index = 0;
2668
StrAllocCat(context->recover, temp);
2671
context->isHex = FALSE;
2672
context->state = S_text;
2674
} else if (put_special_unicodes(context, code)) {
2676
** We handled the value as a special character,
2677
** so recycle the terminator or break. - FM
2679
#ifdef USE_PRETTYSRC
2682
PUTS( (context->isHex ? "&#x" : "&#") );
2683
PUTS(entity_string);
2690
context->isHex = FALSE;
2691
context->state = S_text;
2697
** Seek a translation from the chartrans tables.
2699
if ((uck = UCTransUniChar(code,
2700
context->outUCLYhndl)) >= 32 &&
2703
uck >= LYlowest_eightbit[context->outUCLYhndl])) {
2704
#ifdef USE_PRETTYSRC
2707
PUTC(FROMASCII((char)uck));
2708
#ifdef USE_PRETTYSRC
2710
put_pretty_number(context);
2713
} else if ((uck == -4 ||
2714
(context->T.repl_translated_C0 &&
2715
uck > 0 && uck < 32)) &&
2717
** Not found; look for replacement string.
2719
(uck = UCTransUniCharStr(replace_buf, 60, code,
2720
context->outUCLYhndl,
2722
#ifdef USE_PRETTYSRC
2724
put_pretty_number(context);
2729
** If we're displaying UTF-8, try that now. - FM
2731
} else if (context->T.output_utf8 && PUTUTF8(code)) {
2732
; /* do nothing more */
2734
** Ignore 8205 (zwj),
2735
** 8206 (lrm), and 8207 (rlm), if we get to here. - FM
2737
} else if (code == 8205 ||
2742
LYstrncpy(replace_buf,
2744
(string->size < 64 ? string->size : 63));
2746
"SGML_character: Ignoring '%s%s'.\n",
2747
(context->isHex ? "&#x" : "&#"),
2750
#ifdef USE_PRETTYSRC
2753
PUTS( (context->isHex ? "&#x" : "&#") );
2754
PUTS(entity_string);
2761
context->isHex = FALSE;
2762
context->state = S_text;
2767
** Show the numeric entity if we get to here
2769
** (1) Is greater than 255 (but use ASCII characters
2770
** for spaces or dashes).
2771
** (2) Is less than 32, and not valid or we don't
2773
** (3) Is 127 and we don't have HTPassHighCtrlRaw or
2775
** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum
2779
} else if ((code > 255) ||
2780
(code < ' ' && /* S/390 -- gil -- 1140 */
2781
code != '\t' && code != '\n' && code != '\r' &&
2783
(TOASCII(code) == 127 &&
2784
!(HTPassHighCtrlRaw || HTCJK != NOCJK)) ||
2785
(TOASCII(code) > 127 && code < 160 &&
2786
!HTPassHighCtrlNum)) {
2788
** Unhandled or illegal value. Recover the
2789
** "&#" or "&#x" and digit(s), and recycle
2790
** the terminator. - FM
2792
#ifdef USE_PRETTYSRC
2797
if (context->isHex) {
2799
context->isHex = FALSE;
2804
for (i = 0; i < string->size; i++) /* recover */
2805
PUTC(string->data[i]);
2806
#ifdef USE_PRETTYSRC
2812
context->isHex = FALSE;
2813
context->state = S_text;
2815
} else if (TOASCII(code) < 161 || /* S/390 -- gil -- 1162 */
2816
HTPassEightBitNum ||
2817
IncludesLatin1Enc) {
2819
** No conversion needed. - FM
2821
#ifdef USE_PRETTYSRC
2823
put_pretty_number(context);
2826
PUTC(FROMASCII((char)code));
2829
** Handle as named entity. - FM
2832
EntityName = HTMLGetEntityName(code);
2833
if (EntityName && EntityName[0] != '\0') {
2835
HTChunkPuts(string, EntityName);
2836
HTChunkTerminate(string);
2837
handle_entity(context, '\0');
2839
** Add a semi-colon if something went wrong
2840
** and handle_entity() sent the string. - FM
2847
** Our conversion failed, so recover the "&#"
2848
** and digit(s), and recycle the terminator. - FM
2850
#ifdef USE_PRETTYSRC
2854
if (context->isHex) {
2856
context->isHex = FALSE;
2861
for (i = 0; i < string->size; i++) /* recover */
2862
PUTC(string->data[i]);
2863
#ifdef USE_PRETTYSRC
2868
context->isHex = FALSE;
2869
context->state = S_text;
2874
** If we get to here, we succeeded. Hoorah!!! - FM
2877
context->isHex = FALSE;
2878
context->state = S_text;
2880
** Don't eat the terminator if it's not
2881
** the "standard" semi-colon for HTML. - FM
2888
** Not an entity, and don't know why not, so add
2889
** the terminator to the string, output the "&#"
2890
** or "&#x", and process the string via the recover
2894
HTChunkPutc(string, c);
2895
HTChunkTerminate(string);
2896
#ifdef USE_PRETTYSRC
2900
if (context->isHex) {
2902
context->isHex = FALSE;
2906
#ifdef USE_PRETTYSRC
2910
if (context->recover == NULL) {
2911
StrAllocCopy(context->recover, string->data);
2912
context->recover_index = 0;
2914
StrAllocCat(context->recover, string->data);
2917
context->isHex = FALSE;
2918
context->state = S_text;
2927
case S_tag: /* new tag */
2928
if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1179 */
2929
IsNmChar(c) : IsNmStart(c))) {
2931
** Add valid ASCII character. - FM
2933
HTChunkPutc(string, c);
2934
} else if (c == '!' && !string->size) { /* <! */
2936
** Terminate and set up for possible comment,
2937
** identifier, declaration, or marked section. - FM
2939
context->state = S_exclamation;
2940
context->lead_exclamation = TRUE;
2941
context->doctype_bracket = FALSE;
2942
context->first_bracket = FALSE;
2943
HTChunkPutc(string, c);
2945
} else if (!string->size &&
2946
(TOASCII(unsign_c) <= 160 && /* S/390 -- gil -- 1196 */
2947
(c != '/' && c != '?' && c != '_' && c != ':'))) {
2949
** '<' must be followed by an ASCII letter to be a valid
2950
** start tag. Here it isn't, nor do we have a '/' for an
2951
** end tag, nor one of some other characters with a
2952
** special meaning for SGML or which are likely to be legal
2953
** Name Start characters in XML or some other extension.
2954
** So recover the '<' and following character as data. - FM & KW
2956
context->state = S_text;
2957
#ifdef USE_PRETTYSRC
2962
#ifdef USE_PRETTYSRC
2967
} else { /* End of tag name */
2969
** Try to handle tag. - FM
2973
if (string->size == 0) {
2974
context->state = S_end;
2977
CTRACE((tfp,"SGML: `<%.*s/' found!\n", string->size, string->data));
2979
HTChunkTerminate(string) ;
2981
t = SGMLFindTag(dtd, string->data);
2982
if (t == context->unknown_tag &&
2984
string->size == 4 && 0 == strcasecomp(string->data, "URL")) ||
2985
(string->size > 4 && 0 == strncasecomp(string->data, "URL:", 4)))) {
2987
** Treat <URL: as text rather than a junk tag,
2988
** so we display it and the URL (Lynxism 8-). - FM
2990
#ifdef USE_PRETTYSRC
2995
PUTS(string->data); /* recover */
2997
#ifdef USE_PRETTYSRC
3001
CTRACE((tfp, "SGML: Treating <%s%c as text\n",
3004
context->state = S_text;
3007
if (c == '/' && t) {
3009
* Element name was ended by '/'. Remember the tag that
3010
* ended thusly, we'll interpret this as either an indication
3011
* of an empty element (if '>' follows directly) or do
3012
* some SGMLshortref-ish treatment. - kw
3014
context->slashedtag = t;
3017
if (c == '?' && string->size <= 1) {
3018
CTRACE((tfp, "SGML: Found PI, junking it until '>'\n"));
3019
#ifdef USE_PRETTYSRC
3021
PSRCSTART(abracket);
3024
context->seen_nonwhite_in_junk_tag = TRUE; /*show all*/
3027
context->state = S_junk_pi;
3030
CTRACE((tfp, "SGML: *** Invalid element %s\n",
3033
#ifdef USE_PRETTYSRC
3035
PSRCSTART(abracket);
3039
if (tagname_transform != 1) {
3040
if (tagname_transform == 0)
3041
LYLowerCase(string->data);
3043
LYUpperCase(string->data);
3048
PSRCSTART(abracket);
3056
context->state = (c == '>') ? S_text : S_junk_tag;
3058
} else if (t == context->unknown_tag) {
3059
CTRACE((tfp, "SGML: *** Unknown element %s\n",
3062
** Fall through and treat like valid
3063
** tag for attribute parsing. - KW
3067
context->current_tag = t;
3069
#ifdef USE_PRETTYSRC
3071
PSRCSTART(abracket);
3074
if (t != context->unknown_tag)
3078
if (tagname_transform != 1) {
3079
if (tagname_transform == 0)
3080
LYLowerCase(string->data);
3082
LYUpperCase(string->data);
3085
if (t != context->unknown_tag)
3090
if (!psrc_view) /*don't waste time */
3094
** Clear out attributes.
3096
memset( (void*)context->present, 0 , sizeof(BOOL)*
3097
context->current_tag->number_of_attributes);
3101
context->current_attribute_number = INVALID;
3102
#ifdef USE_PRETTYSRC
3104
if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) {
3106
PSRCSTART(abracket);
3109
context->state = (c == '>') ? S_text : S_tagname_slash;
3111
context->state = S_tag;
3116
context->state = S_tag_gap;
3120
if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) {
3121
if (context->current_tag->name)
3122
start_element(context);
3123
context->state = (c == '>') ? S_text :
3124
(c == '<') ? S_tag : S_tagname_slash;
3126
context->state = S_tag_gap;
3132
if (context->lead_exclamation && c == '-') {
3134
** Set up for possible comment. - FM
3136
context->lead_exclamation = FALSE;
3137
context->first_dash = TRUE;
3138
HTChunkPutc(string, c);
3141
if (context->lead_exclamation && c == '[') {
3143
** Set up for possible marked section. - FM
3145
context->lead_exclamation = FALSE;
3146
context->first_bracket = TRUE;
3147
context->second_bracket = FALSE;
3148
HTChunkPutc(string, c);
3149
context->state = S_marked;
3152
if (context->first_dash && c == '-') {
3154
** Set up to handle comment. - FM
3156
context->lead_exclamation = FALSE;
3157
context->first_dash = FALSE;
3158
context->end_comment = FALSE;
3159
HTChunkPutc(string, c);
3160
context->state = S_comment;
3163
context->lead_exclamation = FALSE;
3164
context->first_dash = FALSE;
3167
** Try to handle identifier. - FM
3169
HTChunkTerminate(string);
3170
#ifdef USE_PRETTYSRC
3172
PSRCSTART(sgmlspecial);
3176
PSRCSTOP(sgmlspecial);
3179
handle_identifier(context);
3181
context->state = S_text;
3185
if (string->size == 8 &&
3186
!strncasecomp(string->data, "!DOCTYPE", 8)) {
3188
** Set up for DOCTYPE declaration. - FM
3190
HTChunkPutc(string, c);
3191
context->doctype_bracket = FALSE;
3192
context->state = S_doctype;
3195
if (string->size == 7 &&
3196
!strncasecomp(string->data, "!ENTITY", 7)) {
3198
** Set up for ENTITY declaration. - FM
3200
HTChunkPutc(string, c);
3201
context->first_dash = FALSE;
3202
context->end_comment = TRUE;
3203
context->state = S_sgmlent;
3206
if (string->size == 8 &&
3207
!strncasecomp(string->data, "!ELEMENT", 8)) {
3209
** Set up for ELEMENT declaration. - FM
3211
HTChunkPutc(string, c);
3212
context->first_dash = FALSE;
3213
context->end_comment = TRUE;
3214
context->state = S_sgmlele;
3217
if (string->size == 8 &&
3218
!strncasecomp(string->data, "!ATTLIST", 8)) {
3220
** Set up for ATTLIST declaration. - FM
3222
HTChunkPutc(string, c);
3223
context->first_dash = FALSE;
3224
context->end_comment = TRUE;
3225
context->state = S_sgmlatt;
3229
HTChunkPutc(string, c);
3232
case S_comment: /* Expecting comment. - FM */
3233
if (historical_comments) {
3235
** Any '>' terminates. - FM
3238
HTChunkTerminate(string);
3239
#ifdef USE_PRETTYSRC
3243
PUTS_TR(string->data);
3248
handle_comment(context);
3250
context->end_comment = FALSE;
3251
context->first_dash = FALSE;
3252
context->state = S_text;
3255
goto S_comment_put_c;
3257
if (!context->first_dash && c == '-') {
3258
HTChunkPutc(string, c);
3259
context->first_dash = TRUE;
3262
if (context->first_dash && c == '-') {
3263
HTChunkPutc(string, c);
3264
context->first_dash = FALSE;
3265
if (!context->end_comment)
3266
context->end_comment = TRUE;
3267
else if (!minimal_comments)
3269
** Validly treat '--' pairs as successive comments
3270
** (for minimal, any "--WHITE>" terminates). - FM
3272
context->end_comment = FALSE;
3275
if (context->end_comment && c == '>') {
3277
** Terminate and handle the comment. - FM
3279
HTChunkTerminate(string);
3280
#ifdef USE_PRETTYSRC
3284
PUTS_TR(string->data);
3289
handle_comment(context);
3291
context->end_comment = FALSE;
3292
context->first_dash = FALSE;
3293
context->state = S_text;
3296
context->first_dash = FALSE;
3297
if (context->end_comment && !isspace(UCH(c)))
3298
context->end_comment = FALSE;
3301
if (context->T.decode_utf8 &&
3302
*context->utf_buf) {
3303
HTChunkPuts(string, context->utf_buf);
3304
context->utf_buf_p = context->utf_buf;
3305
*(context->utf_buf_p) = '\0';
3306
} else if (HTCJK == NOCJK &&
3307
(context->T.output_utf8 ||
3308
context->T.trans_from_uni)) {
3309
if (clong == 0xfffd && saved_char_in &&
3310
HTPassEightBitRaw &&
3311
UCH(saved_char_in) >=
3312
LYlowest_eightbit[context->outUCLYhndl]) {
3313
HTChunkPutUtf8Char(string,
3314
(0xf000 | UCH(saved_char_in)));
3316
HTChunkPutUtf8Char(string, clong);
3318
} else if (saved_char_in && context->T.use_raw_char_in) {
3319
HTChunkPutc(string, saved_char_in);
3321
HTChunkPutc(string, c);
3325
case S_doctype: /* Expecting DOCTYPE. - FM */
3326
if (context->doctype_bracket) {
3327
HTChunkPutc(string, c);
3329
context->doctype_bracket = FALSE;
3332
if (c == '[' && WHITE(string->data[string->size - 1])) {
3333
HTChunkPutc(string, c);
3334
context->doctype_bracket = TRUE;
3338
HTChunkTerminate(string);
3339
#ifdef USE_PRETTYSRC
3341
PSRCSTART(sgmlspecial);
3345
PSRCSTOP(sgmlspecial);
3348
handle_doctype(context);
3350
context->state = S_text;
3353
HTChunkPutc(string, c);
3356
case S_marked: /* Expecting marked section. - FM */
3357
if (context->first_bracket && c == '[') {
3358
HTChunkPutc(string, c);
3359
context->first_bracket = FALSE;
3360
context->second_bracket = TRUE;
3363
if (context->second_bracket && c == ']' &&
3364
string->data[string->size - 1] == ']') {
3365
HTChunkPutc(string, c);
3366
context->second_bracket = FALSE;
3369
if (!context->second_bracket && c == '>') {
3370
HTChunkTerminate(string);
3371
#ifdef USE_PRETTYSRC
3373
PSRCSTART(sgmlspecial);
3377
PSRCSTOP(sgmlspecial);
3380
handle_marked(context);
3382
context->state = S_text;
3385
HTChunkPutc(string, c);
3388
case S_sgmlent: /* Expecting ENTITY. - FM */
3389
if (!context->first_dash && c == '-') {
3390
HTChunkPutc(string, c);
3391
context->first_dash = TRUE;
3394
if (context->first_dash && c == '-') {
3395
HTChunkPutc(string, c);
3396
context->first_dash = FALSE;
3397
if (!context->end_comment)
3398
context->end_comment = TRUE;
3400
context->end_comment = FALSE;
3403
if (context->end_comment && c == '>') {
3404
HTChunkTerminate(string);
3405
#ifdef USE_PRETTYSRC
3407
PSRCSTART(sgmlspecial);
3411
PSRCSTOP(sgmlspecial);
3414
handle_sgmlent(context);
3416
context->end_comment = FALSE;
3417
context->first_dash = FALSE;
3418
context->state = S_text;
3421
context->first_dash = FALSE;
3422
HTChunkPutc(string, c);
3425
case S_sgmlele: /* Expecting ELEMENT. - FM */
3426
if (!context->first_dash && c == '-') {
3427
HTChunkPutc(string, c);
3428
context->first_dash = TRUE;
3431
if (context->first_dash && c == '-') {
3432
HTChunkPutc(string, c);
3433
context->first_dash = FALSE;
3434
if (!context->end_comment)
3435
context->end_comment = TRUE;
3437
context->end_comment = FALSE;
3440
if (context->end_comment && c == '>') {
3441
HTChunkTerminate(string);
3442
#ifdef USE_PRETTYSRC
3444
PSRCSTART(sgmlspecial);
3448
PSRCSTOP(sgmlspecial);
3451
handle_sgmlele(context);
3453
context->end_comment = FALSE;
3454
context->first_dash = FALSE;
3455
context->state = S_text;
3458
context->first_dash = FALSE;
3459
HTChunkPutc(string, c);
3462
case S_sgmlatt: /* Expecting ATTLIST. - FM */
3463
if (!context->first_dash && c == '-') {
3464
HTChunkPutc(string, c);
3465
context->first_dash = TRUE;
3468
if (context->first_dash && c == '-') {
3469
HTChunkPutc(string, c);
3470
context->first_dash = FALSE;
3471
if (!context->end_comment)
3472
context->end_comment = TRUE;
3474
context->end_comment = FALSE;
3477
if (context->end_comment && c == '>') {
3478
HTChunkTerminate(string);
3479
#ifdef USE_PRETTYSRC
3481
PSRCSTART(sgmlspecial);
3485
PSRCSTOP(sgmlspecial);
3488
handle_sgmlatt(context);
3490
context->end_comment = FALSE;
3491
context->first_dash = FALSE;
3492
context->state = S_text;
3495
context->first_dash = FALSE;
3496
HTChunkPutc(string, c);
3499
case S_tag_gap: /* Expecting attribute or '>' */
3501
break; /* Gap between attributes */
3502
if (c == '>') { /* End of tag */
3503
#ifdef USE_PRETTYSRC
3506
if (context->current_tag->name)
3507
start_element(context);
3508
#ifdef USE_PRETTYSRC
3510
PSRCSTART(abracket);
3515
context->state = S_text;
3518
HTChunkPutc(string, c);
3519
context->state = S_attr; /* Get attribute */
3522
/* accumulating value */
3524
if (WHITE(c) || (c == '>') || (c == '=')) { /* End of word */
3525
HTChunkTerminate(string);
3526
handle_attribute_name(context, string->data);
3527
#ifdef USE_PRETTYSRC
3531
if (c == '>') { /* End of tag */
3532
if (context->current_tag->name)
3533
start_element(context);
3534
context->state = S_text;
3537
#ifdef USE_PRETTYSRC
3540
if (context->current_attribute_number == INVALID)
3544
if (attrname_transform != 1) {
3545
if (attrname_transform == 0)
3546
LYLowerCase(string->data);
3548
LYUpperCase(string->data);
3551
if (c == '=' ) PUTC('=');
3552
if (c == '=' || c == '>') {
3553
if (context->current_attribute_number == INVALID)
3559
PSRCSTART(abracket);
3562
context->state = S_text;
3568
context->state = (c == '=' ? S_equals: S_attr_gap);
3570
HTChunkPutc(string, c);
3574
case S_attr_gap: /* Expecting attribute or '=' or '>' */
3576
break; /* Gap after attribute */
3577
if (c == '>') { /* End of tag */
3578
#ifdef USE_PRETTYSRC
3580
if (context->current_attribute_number == INVALID) {
3585
PSRCSTART(abracket);
3590
if (context->current_tag->name)
3591
start_element(context);
3592
context->state = S_text;
3594
} else if (c == '=') {
3595
#ifdef USE_PRETTYSRC
3598
if (context->current_attribute_number == INVALID) {
3605
context->state = S_equals;
3608
HTChunkPutc(string, c);
3609
context->state = S_attr; /* Get next attribute */
3612
case S_equals: /* After attr = */
3614
break; /* Before attribute value */
3615
if (c == '>') { /* End of tag */
3616
CTRACE((tfp, "SGML: found = but no value\n"));
3617
#ifdef USE_PRETTYSRC
3619
PSRCSTART(abracket);
3624
if (context->current_tag->name)
3625
start_element(context);
3626
context->state = S_text;
3629
} else if (c == '\'') {
3630
#ifdef USE_PRETTYSRC
3636
context->state = S_squoted;
3639
} else if (c == '"') {
3640
#ifdef USE_PRETTYSRC
3646
context->state = S_dquoted;
3649
#ifdef USE_PRETTYSRC
3653
context->state = S_value;
3654
/* no break! fall through to S_value and process current `c` */
3657
if (WHITE(c) || (c == '>')) { /* End of word */
3658
HTChunkTerminate(string) ;
3659
#ifdef USE_PRETTYSRC
3661
/*PSRCSTART(attrval);*/
3663
HTStartAnchor(context->target, string->data, NULL);
3664
(*context->actions->end_element)(
3667
(char **)&context->include);
3668
} else if (attr_is_href) {
3670
HTStartAnchor(context->target,NULL,string->data);
3672
PUTS_TR(string->data);
3674
(*context->actions->end_element)(
3677
(char **)&context->include);
3683
#ifdef CJK_EX /* Quick hack. - JH7AYN */
3684
{ char jis_buf[512];
3685
if (string->data[0] == '$') {
3686
if (string->data[1] == 'B' || string->data[1] == '@') {
3687
jis_buf[0] = '\033';
3688
strcpy(jis_buf + 1, string->data);
3689
TO_EUC((CONST unsigned char *)jis_buf, (unsigned char *)string->data);
3694
handle_attribute_value(context, string->data);
3696
if (c == '>') { /* End of tag */
3697
#ifdef USE_PRETTYSRC
3699
PSRCSTART(abracket);
3704
if (context->current_tag->name)
3705
start_element(context);
3706
context->state = S_text;
3709
else context->state = S_tag_gap;
3710
} else if (context->T.decode_utf8 &&
3711
*context->utf_buf) {
3712
HTChunkPuts(string, context->utf_buf);
3713
context->utf_buf_p = context->utf_buf;
3714
*(context->utf_buf_p) = '\0';
3715
} else if (HTCJK == NOCJK &&
3716
(context->T.output_utf8 ||
3717
context->T.trans_from_uni)) {
3718
if (clong == 0xfffd && saved_char_in &&
3719
HTPassEightBitRaw &&
3720
UCH(saved_char_in) >=
3721
LYlowest_eightbit[context->outUCLYhndl]) {
3722
HTChunkPutUtf8Char(string,
3723
(0xf000 | UCH(saved_char_in)));
3725
HTChunkPutUtf8Char(string, clong);
3727
} else if (saved_char_in && context->T.use_raw_char_in) {
3728
HTChunkPutc(string, saved_char_in);
3730
HTChunkPutc(string, c);
3734
case S_squoted: /* Quoted attribute value */
3735
if (c == '\'') { /* End of attribute value */
3736
HTChunkTerminate(string) ;
3737
#ifdef USE_PRETTYSRC
3739
/*PSRCSTART(attrval);*/
3741
HTStartAnchor(context->target,string->data, NULL);
3742
(*context->actions->end_element)(
3745
(char **)&context->include);
3746
} else if (attr_is_href) {
3748
HTStartAnchor(context->target,NULL,string->data);
3750
PUTS_TR(string->data);
3752
(*context->actions->end_element)(
3755
(char **)&context->include);
3762
handle_attribute_value(context, string->data);
3764
context->state = S_tag_gap;
3765
} else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1213 */
3767
** Setting up for possible single quotes in CJK escape
3768
** sequences. - Takuya ASADA (asada@three-a.co.jp)
3770
context->state = S_esc_sq;
3771
HTChunkPutc(string, c);
3772
} else if (context->T.decode_utf8 &&
3773
*context->utf_buf) {
3774
HTChunkPuts(string, context->utf_buf);
3775
context->utf_buf_p = context->utf_buf;
3776
*(context->utf_buf_p) = '\0';
3777
} else if (HTCJK == NOCJK &&
3778
(context->T.output_utf8 ||
3779
context->T.trans_from_uni)) {
3780
if (clong == 0xfffd && saved_char_in &&
3781
HTPassEightBitRaw &&
3782
UCH(saved_char_in) >=
3783
LYlowest_eightbit[context->outUCLYhndl]) {
3784
HTChunkPutUtf8Char(string,
3785
(0xf000 | UCH(saved_char_in)));
3787
HTChunkPutUtf8Char(string, clong);
3789
} else if (saved_char_in && context->T.use_raw_char_in) {
3790
HTChunkPutc(string, saved_char_in);
3792
HTChunkPutc(string, c);
3796
case S_dquoted: /* Quoted attribute value */
3797
if (c == '"' || /* Valid end of attribute value */
3798
(soft_dquotes && /* If emulating old Netscape bug, treat '>' */
3799
c == '>')) { /* as a co-terminator of dquoted and tag */
3800
HTChunkTerminate(string) ;
3801
#ifdef USE_PRETTYSRC
3803
/*PSRCSTART(attrval);*/
3805
HTStartAnchor(context->target,string->data, NULL);
3806
(*context->actions->end_element)(
3809
(char **)&context->include);
3810
} else if (attr_is_href) {
3812
HTStartAnchor(context->target,NULL,string->data);
3814
PUTS_TR(string->data);
3816
(*context->actions->end_element)(
3819
(char **)&context->include);
3827
handle_attribute_value(context, string->data);
3829
context->state = S_tag_gap;
3830
if (c == '>') /* We emulated the Netscape bug, so we go */
3831
goto top1; /* back and treat it as the tag terminator */
3832
} else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1230 */
3834
** Setting up for possible double quotes in CJK escape
3835
** sequences. - Takuya ASADA (asada@three-a.co.jp)
3837
context->state = S_esc_dq;
3838
HTChunkPutc(string, c);
3839
} else if (context->T.decode_utf8 &&
3840
*context->utf_buf) {
3841
HTChunkPuts(string, context->utf_buf);
3842
context->utf_buf_p = context->utf_buf;
3843
*(context->utf_buf_p) = '\0';
3844
} else if (HTCJK == NOCJK &&
3845
(context->T.output_utf8 ||
3846
context->T.trans_from_uni)) {
3847
if (clong == 0xfffd && saved_char_in &&
3848
HTPassEightBitRaw &&
3849
UCH(saved_char_in) >=
3850
LYlowest_eightbit[context->outUCLYhndl]) {
3851
HTChunkPutUtf8Char(string,
3852
(0xf000 | UCH(saved_char_in)));
3854
HTChunkPutUtf8Char(string, clong);
3856
} else if (saved_char_in && context->T.use_raw_char_in) {
3857
HTChunkPutc(string, saved_char_in);
3859
HTChunkPutc(string, c);
3863
case S_end: /* </ */
3864
if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1247 */
3865
IsNmChar(c) : IsNmStart(c))) {
3866
HTChunkPutc(string, c);
3867
} else { /* End of end tag name */
3869
#ifdef USE_PRETTYSRC
3870
BOOL psrc_tagname_processed = FALSE;
3873
HTChunkTerminate(string);
3874
if (!*string->data) { /* Empty end tag */
3875
if (context->element_stack)
3876
t = context->element_stack->tag;
3878
t = SGMLFindTag(dtd, string->data);
3880
if (!t || t == context->unknown_tag) {
3881
CTRACE((tfp, "Unknown end tag </%s>\n", string->data));
3882
#ifdef USE_PRETTYSRC
3884
PSRCSTART(abracket);
3888
if (tagname_transform != 1) {
3889
if (tagname_transform == 0)
3890
LYLowerCase(string->data);
3892
LYUpperCase(string->data);
3899
PSRCSTART(abracket);
3903
psrc_tagname_processed=TRUE;
3905
} else if (psrc_view) {
3908
BOOL tag_OK = (BOOL) (c == '>' || WHITE(c));
3910
HTMLElement e = TAGNUM_OF_TAGP(t);
3911
int branch = 2; /* it can be 0,1,2*/
3913
context->current_tag = t;
3914
if (HAS_ALT_TAGNUM(TAGNUM_OF_TAGP(t)) &&
3915
context->element_stack &&
3916
ALT_TAGP(t) == context->element_stack->tag)
3917
context->element_stack->tag = NORMAL_TAGP(context->element_stack->tag);
3920
#ifdef EXTENDED_HTMLDTD
3925
case HTML_DD: case HTML_DT: case HTML_LI: case HTML_LH :
3926
case HTML_TD: case HTML_TH: case HTML_TR: case HTML_THEAD:
3927
case HTML_TFOOT : case HTML_TBODY : case HTML_COLGROUP:
3931
case HTML_A: case HTML_B: case HTML_BLINK: case HTML_CITE:
3932
case HTML_EM: case HTML_FONT: case HTML_FORM: case HTML_I:
3933
case HTML_P: case HTML_STRONG: case HTML_TT: case HTML_U:
3942
#ifdef EXTENDED_HTMLDTD
3944
** Just handle ALL end tags normally :-) - kw
3947
end_element( context, context->current_tag);
3949
#endif /* EXTENDED_HTMLDTD */
3955
(!strcasecomp(string->data, "DD") ||
3956
!strcasecomp(string->data, "DT") ||
3957
!strcasecomp(string->data, "LI") ||
3958
!strcasecomp(string->data, "LH") ||
3959
!strcasecomp(string->data, "TD") ||
3960
!strcasecomp(string->data, "TH") ||
3961
!strcasecomp(string->data, "TR") ||
3962
!strcasecomp(string->data, "THEAD") ||
3963
!strcasecomp(string->data, "TFOOT") ||
3964
!strcasecomp(string->data, "TBODY") ||
3965
!strcasecomp(string->data, "COLGROUP"))
3969
** Don't treat these end tags as invalid,
3970
** nor act on them. - FM
3972
CTRACE((tfp, "SGML: `</%s%c' found! Ignoring it.\n",
3975
context->current_attribute_number = INVALID;
3977
context->state = S_junk_tag;
3979
context->current_tag = NULL;
3980
context->state = S_text;
3983
} else if (tag_OK &&
3987
(!strcasecomp(string->data, "A") ||
3988
!strcasecomp(string->data, "B") ||
3989
!strcasecomp(string->data, "BLINK") ||
3990
!strcasecomp(string->data, "CITE") ||
3991
!strcasecomp(string->data, "EM") ||
3992
!strcasecomp(string->data, "FONT") ||
3993
!strcasecomp(string->data, "FORM") ||
3994
!strcasecomp(string->data, "I") ||
3995
!strcasecomp(string->data, "P") ||
3996
!strcasecomp(string->data, "STRONG") ||
3997
!strcasecomp(string->data, "TT") ||
3998
!strcasecomp(string->data, "U"))
4002
** Handle end tags for container elements declared
4003
** as SGML_EMPTY to prevent "expected tag substitution"
4004
** but still processed via HTML_end_element() in HTML.c
4005
** with checks there to avoid throwing the HTML.c stack
4006
** out of whack (Ugh, what a hack! 8-). - FM
4008
if (context->inSELECT) {
4010
** We are in a SELECT block. - FM
4012
if (strcasecomp(string->data, "FORM")) {
4014
** It is not a FORM end tag, so ignore it. - FM
4016
CTRACE((tfp, "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
4020
** End the SELECT block and then
4021
** handle the FORM end tag. - FM
4023
CTRACE((tfp, "SGML: ***Faking SELECT end tag before </%s> end tag.\n",
4025
end_element(context,
4026
SGMLFindTag(context->dtd, "SELECT"));
4027
CTRACE((tfp, "SGML: End </%s>\n", string->data));
4029
#ifdef USE_PRETTYSRC
4030
if (!psrc_view) /* Don't actually call if viewing psrc - kw */
4032
(*context->actions->end_element)
4034
TAGNUM_OF_TAGP(context->current_tag),
4035
(char **)&context->include);
4037
} else if (!strcasecomp(string->data, "P")) {
4039
** Treat a P end tag like a P start tag (Ugh,
4040
** what a hack! 8-). - FM
4042
CTRACE((tfp, "SGML: `</%s%c' found! Treating as '<%s%c'.\n",
4043
string->data, c, string->data, c));
4047
i < context->current_tag->number_of_attributes;
4049
context->present[i] = NO;
4052
if (context->current_tag->name)
4053
start_element(context);
4055
CTRACE((tfp, "SGML: End </%s>\n", string->data));
4057
#ifdef USE_PRETTYSRC
4058
if (!psrc_view) /* Don't actually call if viewing psrc - kw */
4060
(*context->actions->end_element)
4062
TAGNUM_OF_TAGP(context->current_tag),
4063
(char **)&context->include);
4066
context->current_attribute_number = INVALID;
4068
context->state = S_junk_tag;
4070
context->current_tag = NULL;
4071
context->state = S_text;
4076
** Handle all other end tags normally. - FM
4078
end_element( context, context->current_tag);
4082
#ifdef USE_PRETTYSRC
4083
if (psrc_view && !psrc_tagname_processed) {
4084
PSRCSTART(abracket);
4088
if (tagname_transform != 1) {
4089
if (tagname_transform == 0)
4090
LYLowerCase(string->data);
4092
LYUpperCase(string->data);
4100
PSRCSTART(abracket);
4108
context->current_attribute_number = INVALID;
4111
CTRACE((tfp,"SGML: `</%s%c' found!\n", string->data, c));
4112
context->state = S_junk_tag;
4114
context->current_tag = NULL;
4115
context->state = S_text;
4121
case S_esc: /* Expecting '$'or '(' following CJK ESC. */
4123
context->state = S_dollar;
4124
} else if (c == '(') {
4125
context->state = S_paren;
4127
context->state = S_text;
4132
case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4133
if (c == '@' || c == 'B' || c == 'A') {
4134
context->state = S_nonascii_text;
4135
} else if (c == '(') {
4136
context->state = S_dollar_paren;
4141
case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". */
4143
context->state = S_nonascii_text;
4145
context->state = S_text;
4150
case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4151
if (c == 'B' || c == 'J' || c == 'T') {
4152
context->state = S_text;
4153
} else if (c == 'I') {
4154
context->state = S_nonascii_text;
4156
context->state = S_text;
4161
case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */
4162
if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1264 */
4163
context->state = S_esc;
4167
context->state = S_text;
4170
case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */
4172
context->state = S_dollar_sq;
4173
} else if (c == '(') {
4174
context->state = S_paren_sq;
4176
context->state = S_squoted;
4178
HTChunkPutc(string, c);
4181
case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4182
if (c == '@' || c == 'B' || c == 'A') {
4183
context->state = S_nonascii_text_sq;
4184
} else if (c == '(') {
4185
context->state = S_dollar_paren_sq;
4187
HTChunkPutc(string, c);
4190
case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". */
4192
context->state = S_nonascii_text_sq;
4194
context->state = S_squoted;
4196
HTChunkPutc(string, c);
4199
case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4200
if (c == 'B' || c == 'J' || c == 'T') {
4201
context->state = S_squoted;
4202
} else if (c == 'I') {
4203
context->state = S_nonascii_text_sq;
4205
context->state = S_squoted;
4207
HTChunkPutc(string, c);
4210
case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */
4211
if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1281 */
4212
context->state = S_esc_sq;
4214
HTChunkPutc(string, c);
4217
case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */
4219
context->state = S_dollar_dq;
4220
} else if (c == '(') {
4221
context->state = S_paren_dq;
4223
context->state = S_dquoted;
4225
HTChunkPutc(string, c);
4228
case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4229
if (c == '@' || c == 'B' || c == 'A') {
4230
context->state = S_nonascii_text_dq;
4231
} else if (c == '(') {
4232
context->state = S_dollar_paren_dq;
4234
HTChunkPutc(string, c);
4237
case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */
4239
context->state = S_nonascii_text_dq;
4241
context->state = S_dquoted;
4243
HTChunkPutc(string, c);
4246
case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4247
if (c == 'B' || c == 'J' || c == 'T') {
4248
context->state = S_dquoted;
4249
} else if (c == 'I') {
4250
context->state = S_nonascii_text_dq;
4252
context->state = S_dquoted;
4254
HTChunkPutc(string, c);
4257
case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */
4258
if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1298 */
4259
context->state = S_esc_dq;
4261
HTChunkPutc(string, c);
4267
#ifdef USE_PRETTYSRC
4269
if (context->state == S_junk_tag) {
4272
PSRCSTART(abracket);
4275
context->seen_nonwhite_in_junk_tag = FALSE;
4278
context->current_tag = NULL;
4279
context->state = S_text;
4281
#ifdef USE_PRETTYSRC
4282
else if (psrc_view) {
4283
/*pack spaces until first non-space is seen*/
4284
if (!context->seen_nonwhite_in_junk_tag) {
4286
context->seen_nonwhite_in_junk_tag = TRUE;
4294
} /* switch on context->state */
4295
CTRACE2(TRACE_SGML, (tfp, "SGML after %s|%.*s|%c|\n",
4296
state_name(context->state),
4298
NonNull(string->data),
4303
** Check whether an external function has added
4304
** anything to the include buffer. If so, move the
4305
** new stuff to the beginning of active_include. - kw
4307
if (context->include != NULL) {
4308
if (context->include[0] == '\0') {
4309
FREE(context->include);
4311
if (context->active_include &&
4312
context->active_include[context->include_index] != '\0')
4313
StrAllocCat(context->include,
4314
context->active_include + context->include_index);
4315
FREE(context->active_include);
4316
context->active_include = context->include;
4317
context->include_index = 0;
4318
context->include = NULL;
4323
** Check whether we've added anything to the recover buffer. - FM
4325
if (context->recover != NULL) {
4326
if (context->recover[context->recover_index] == '\0') {
4327
FREE(context->recover);
4328
context->recover_index = 0;
4330
c = context->recover[context->recover_index];
4331
context->recover_index++;
4337
** Check whether an external function had added
4338
** anything to the include buffer; it should now be
4339
** in active_include. - FM / kw
4341
if (context->active_include != NULL) {
4342
if (context->active_include[context->include_index] == '\0') {
4343
FREE(context->active_include);
4344
context->include_index = 0;
4346
if (context->current_tag_charset == UTF8_handle ||
4347
context->T.trans_from_uni) {
4349
* If it looks like we would have fed UTF-8 to the
4350
* next processing stage, assume that whatever we were
4351
* fed back is in UTF-8 form, too. This won't always be
4352
* true for all uses of the include buffer, but it's a
4355
char *puni = context->active_include + context->include_index;
4357
clong = UCGetUniFromUtf8String(&puni);
4358
if (clong < 256 && clong >= 0) {
4359
c = ((char)(clong & 0xff));
4361
saved_char_in = '\0';
4362
context->include_index = puni - context->active_include + 1;
4366
* Otherwise assume no UTF-8 - do charset-naive processing
4367
* and hope for the best. - kw
4369
c = context->active_include[context->include_index];
4370
context->include_index++;
4377
** Check whether an external function has added
4378
** anything to the csi buffer. - FM
4380
if (context->csi != NULL) {
4381
if (context->csi[context->csi_index] == '\0') {
4383
context->csi_index = 0;
4385
c = context->csi[context->csi_index];
4386
context->csi_index++;
4390
} /* SGML_character */
4393
/* Feed a NUL-terminated string to the parser
** ------------------------------------------
** Walks the string from its first character up to, but not including,
** the terminating NUL and hands each character to SGML_character()
** in order.
** NOTE(review): the parameter line for `str`, the declaration of `p`
** and the braces are not visible in this extract -- confirm against
** the complete file.
*/
PRIVATE void SGML_string ARGS2(
4394
HTStream *, context,
4398
/* Stop at the first zero byte; it is not passed on. */
for (p = str; *p; p++)
4399
SGML_character(context, *p);
4403
/* Feed a counted buffer to the parser
** -----------------------------------
** Hands every character in the range [str, str+l) to SGML_character()
** in order. The loop is bounded by the end pointer only, so zero bytes
** inside the range are passed on like any other character.
** NOTE(review): the parameter lines for `str` and `l`, the declaration
** of `p` and the braces are not visible in this extract -- confirm
** against the complete file.
*/
PRIVATE void SGML_write ARGS3(
4404
HTStream *, context,
4409
/* One past the last character to be processed. */
CONST char *e = str+l;
4410
for (p = str; p < e; p++)
4411
SGML_character(context, *p);
4414
/*_______________________________________________________________________
4417
/* Structured Object Class
4418
** -----------------------
4420
PUBLIC CONST HTStreamClass SGMLParser =
4430
/* Create SGML Engine
4431
** ------------------
4434
** dtd represents the DTD, along with
4435
** actions is the sink for the data as a set of routines.
4439
/* Summary of what is visible of SGML_new in this extract:
** the HTStream is obtained with malloc(), its class pointer is set to
** SGMLParser, a string chunk is created, the state machine starts in
** S_text, the markup-scanning flags are cleared, the input charset
** handle is taken from the anchor, and the recover / include / csi
** buffers start out empty.
** NOTE(review): several lines (braces, local declarations, the test in
** front of outofmem(), trailing call arguments, the final return) are
** missing from this extract -- confirm details against the complete file.
*/
PUBLIC HTStream* SGML_new ARGS3(
4440
CONST SGML_dtd *, dtd,
4441
HTParentAnchor *, anchor,
4442
HTStructured *, target)
4445
/* Allocate the parser object itself. */
HTStream* context = (HTStream *) malloc(sizeof(*context));
4447
/* NOTE(review): presumably only reached when malloc() returned NULL;
** the guarding test is not visible here. */
outofmem(__FILE__, "SGML_begin");
4449
context->isa = &SGMLParser;
4450
context->string = HTChunkCreate(128); /* Grow by this much */
4451
context->leading_spaces = 0;
4452
context->trailing_spaces = 0;
4454
context->target = target;
4455
/* The target's isa pointer is reused as the table of structured-output
** callbacks (put_character, put_string, end_element, ...). */
context->actions = (CONST HTStructuredClass*)(((HTStream*)target)->isa);
4457
context->unknown_tag = &HTTag_unrecognized;
4458
context->current_tag = context->slashedtag = NULL;
4459
/* The state machine starts out in plain text. */
context->state = S_text;
4460
context->kanji_buf = '\0';
4461
context->element_stack = 0; /* empty */
4462
context->inSELECT = FALSE;
4463
context->no_lynx_specialcodes = NO; /* special codes normally generated */
4465
context->callerData = (void*) callerData;
4466
#endif /* CALLERDATA */
4467
/* No attribute values are held yet. */
for (i = 0; i < MAX_ATTRIBUTES; i++)
4468
context->value[i] = 0;
4470
/* Clear the flags used while scanning "<!" constructs (comments,
** DOCTYPE, marked sections) and numeric character references. */
context->lead_exclamation = FALSE;
4471
context->first_dash = FALSE;
4472
context->end_comment = FALSE;
4473
context->doctype_bracket = FALSE;
4474
context->first_bracket = FALSE;
4475
context->second_bracket = FALSE;
4476
context->isHex = FALSE;
4478
context->node_anchor = anchor; /* Could be NULL? */
4479
/* Start with an empty utf_buf and utf_buf_p pointing at its beginning. */
context->utf_count = 0;
4480
context->utf_char = 0;
4481
context->utf_buf[0] = context->utf_buf[6] = '\0';
4482
context->utf_buf_p = context->utf_buf;
4483
UCTransParams_clear(&context->T);
4484
context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
4486
/* A negative handle was returned: copy charset info between stages and
** query again. NOTE(review): the stage arguments are not visible here. */
if (context->inUCLYhndl < 0) {
4487
HTAnchor_copyUCInfoStage(anchor,
4491
context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
4494
#ifdef CAN_SWITCH_DISPLAY_CHARSET /* Allow a switch to a more suitable display charset */
4495
/* NOTE(review): anchor is dereferenced here although the comment on
** node_anchor above asks whether it could be NULL -- confirm. */
else if (anchor->UCStages
4496
&& anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl >= 0
4497
&& anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl != current_char_set ) {
4498
int o = anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl;
4500
anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl = -1; /* Force reset */
4501
HTAnchor_resetUCInfoStage(anchor, o, UCT_STAGE_PARSER,
4502
/* Preserve change this: */
4503
anchor->UCStages->s[UCT_STAGE_PARSER].lock);
4507
context->inUCI = HTAnchor_getUCInfoStage(anchor,
4509
set_chartrans_handling(context, anchor, -1);
4511
/* Nothing is pending in the recover, include or csi buffers. */
context->recover = NULL;
4512
context->recover_index = 0;
4513
context->include = NULL;
4514
context->active_include = NULL;
4515
context->include_index = 0;
4516
context->url = NULL;
4517
context->csi = NULL;
4518
context->csi_index = 0;
4520
#ifdef USE_PRETTYSRC
4523
/* Feed a fixed HTML prologue through the parser itself.
** NOTE(review): the condition selecting this path is not visible in
** this extract (likely the psrc_view mode) -- confirm. */
mark_htext_as_source = TRUE;
4524
SGML_string(context,
4525
"<HTML><HEAD><TITLE>source</TITLE></HEAD><BODY><PRE>");
4527
psrc_convert_string = FALSE;
4528
sgml_in_psrc_was_initialized = TRUE;
4529
context->seen_nonwhite_in_junk_tag = FALSE;
4536
/* Asian character conversion functions
4537
** ====================================
4539
** Added 24-Mar-96 by FM, based on:
4541
////////////////////////////////////////////////////////////////////////
4542
Copyright (c) 1993 Electrotechnical Laboratory (ETL)
4544
Permission to use, copy, modify, and distribute this material
4545
for any purpose and without fee is hereby granted, provided
4546
that the above copyright notice and this permission notice
4547
appear in all copies, and that the name of ETL not be
4548
used in advertising or publicity pertaining to this
4549
material without the specific, prior written permission
4550
of an authorized representative of ETL.
4551
ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
4552
OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS",
4553
WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
4554
/////////////////////////////////////////////////////////////////////////
4555
Content-Type: program/C; charset=US-ASCII
4557
Author: Yutaka Sato <ysato@etl.go.jp>
4560
930923 extracted from codeconv.c of cosmos
4561
///////////////////////////////////////////////////////////////////////
4564
/*
 * Run-time switch for Shift-JIS handling.  SJIS_TO_EUC only treats a byte
 * pair as Shift-JIS (and converts it) when this flag is nonzero and IS_SJIS
 * accepts the pair.  Enabled by default.
 */
PUBLIC int TREAT_SJIS = 1;
4566
PUBLIC void JISx0201TO0208_EUC ARGS4(
4567
register unsigned char, IHI,
4568
register unsigned char, ILO,
4569
register unsigned char *, OHI,
4570
register unsigned char *, OLO)
4572
static char *table[] = {
4573
"\241\243", /* A1,A3 */
4574
"\241\326", /* A1,D6 */
4575
"\241\327", /* A1,D7 */
4576
"\241\242", /* A1,A2 */
4577
"\241\246", /* A1,A6 */
4578
"\245\362", /* A5,F2 */
4579
"\245\241", /* A5,A1 */
4580
"\245\243", /* A5,A3 */
4581
"\245\245", /* A5,A5 */
4582
"\245\247", /* A5,A7 */
4583
"\245\251", /* A5,A9 */
4584
"\245\343", /* A5,E3 */
4585
"\245\345", /* A5,E5 */
4586
"\245\347", /* A5,E7 */
4587
"\245\303", /* A5,C3 */
4588
"\241\274", /* A1,BC */
4589
"\245\242", /* A5,A2 */
4590
"\245\244", /* A5,A4 */
4591
"\245\246", /* A5,A6 */
4592
"\245\250", /* A5,A8 */
4593
"\245\252", /* A5,AA */
4594
"\245\253", /* A5,AB */
4595
"\245\255", /* A5,AD */
4596
"\245\257", /* A5,AF */
4597
"\245\261", /* A5,B1 */
4598
"\245\263", /* A5,B3 */
4599
"\245\265", /* A5,B5 */
4600
"\245\267", /* A5,B7 */
4601
"\245\271", /* A5,B9 */
4602
"\245\273", /* A5,BB */
4603
"\245\275", /* A5,BD */
4604
"\245\277", /* A5,BF */
4605
"\245\301", /* A5,C1 */
4606
"\245\304", /* A5,C4 */
4607
"\245\306", /* A5,C6 */
4608
"\245\310", /* A5,C8 */
4609
"\245\312", /* A5,CA */
4610
"\245\313", /* A5,CB */
4611
"\245\314", /* A5,CC */
4612
"\245\315", /* A5,CD */
4613
"\245\316", /* A5,CE */
4614
"\245\317", /* A5,CF */
4615
"\245\322", /* A5,D2 */
4616
"\245\325", /* A5,D5 */
4617
"\245\330", /* A5,D8 */
4618
"\245\333", /* A5,DB */
4619
"\245\336", /* A5,DE */
4620
"\245\337", /* A5,DF */
4621
"\245\340", /* A5,E0 */
4622
"\245\341", /* A5,E1 */
4623
"\245\342", /* A5,E2 */
4624
"\245\344", /* A5,E4 */
4625
"\245\346", /* A5,E6 */
4626
"\245\350", /* A5,E8 */
4627
"\245\351", /* A5,E9 */
4628
"\245\352", /* A5,EA */
4629
"\245\353", /* A5,EB */
4630
"\245\354", /* A5,EC */
4631
"\245\355", /* A5,ED */
4632
"\245\357", /* A5,EF */
4633
"\245\363", /* A5,F3 */
4634
"\241\253", /* A1,AB */
4635
"\241\254" /* A1,AC */
4638
if ((IHI == 0x8E) && (ILO >= 0xA1) && (ILO <= 0xDF)) {
4639
*OHI = table[ILO - 0xA1][0];
4640
*OLO = table[ILO - 0xA1][1];
4647
PRIVATE int IS_SJIS_STR ARGS1(CONST unsigned char *, str)
4649
CONST unsigned char *s;
4654
while ((ch = *s++) != '\0') {
4656
if (IS_SJIS(ch, *s, is_sjis))
4662
PUBLIC unsigned char * SJIS_TO_JIS1 ARGS3(
4663
register unsigned char, HI,
4664
register unsigned char, LO,
4665
register unsigned char *, JCODE)
4667
HI -= UCH((HI <= 0x9F) ? 0x71 : 0xB1);
4668
HI = UCH((HI << 1) + 1);
4682
PUBLIC unsigned char * JIS_TO_SJIS1 ARGS3(
4683
register unsigned char, HI,
4684
register unsigned char, LO,
4685
register unsigned char *, SJCODE)
4694
HI = UCH(((HI - 0x21) >> 1) + 0x81);
4702
PUBLIC unsigned char * EUC_TO_SJIS1 ARGS3(
4705
register unsigned char *, SJCODE)
4708
JISx0201TO0208_EUC(HI, LO, &HI, &LO);
4709
JIS_TO_SJIS1(UCH(HI & 0x7F), UCH(LO & 0x7F), SJCODE);
4713
PUBLIC void JISx0201TO0208_SJIS ARGS3(
4714
register unsigned char, I,
4715
register unsigned char *, OHI,
4716
register unsigned char *, OLO)
4718
unsigned char SJCODE[2];
4720
JISx0201TO0208_EUC(0x8E, I, OHI, OLO);
4721
JIS_TO_SJIS1(UCH(*OHI & 0x7F), UCH(*OLO & 0x7F), SJCODE);
4726
PUBLIC unsigned char * SJIS_TO_EUC1 ARGS3(
4729
unsigned char *, data)
4731
SJIS_TO_JIS1(HI, LO, data);
4737
PUBLIC unsigned char * SJIS_TO_EUC ARGS2(
4738
unsigned char *, src,
4739
unsigned char *, dst)
4741
register unsigned char hi, lo, *sp, *dp;
4742
register int in_sjis = 0;
4744
in_sjis = IS_SJIS_STR(src);
4745
for (sp = src, dp = dst; (hi = sp[0]) != '\0';) {
4747
if (TREAT_SJIS && IS_SJIS(hi, lo, in_sjis)) {
4748
SJIS_TO_JIS1(hi, lo, dp);
4760
PUBLIC unsigned char * EUC_TO_SJIS ARGS2(
4761
unsigned char *, src,
4762
unsigned char *, dst)
4764
register unsigned char *sp, *dp;
4766
for (sp = src, dp = dst; *sp;) {
4768
if (sp[1] && (sp[1] & 0x80)) {
4769
JIS_TO_SJIS1(UCH(sp[0] & 0x7F), UCH(sp[1] & 0x7F), dp);
4783
/*
 * Copy the NUL-terminated string b into the buffer at a, and yield a pointer
 * (of a's own pointer type) to the terminating NUL that was just written, so
 * that successive appends can be chained: dp = Strcpy(dp, str).
 *
 * Every use of a parameter is parenthesized so that expression arguments
 * such as "buf + n" expand correctly.  The destination argument is still
 * evaluated more than once, so it must be free of side effects.  No bounds
 * checking is done; the caller must supply a large enough buffer.
 */
#define Strcpy(a,b) \
	(strcpy((char *)(a), (CONST char *)(b)), \
	 &(a)[strlen((CONST char *)(a))])
4785
PUBLIC unsigned char *EUC_TO_JIS ARGS4(
4786
unsigned char *, src,
4787
unsigned char *, dst,
4791
register unsigned char kana_mode = 0;
4792
register unsigned char cch;
4793
register unsigned char *sp = src;
4794
register unsigned char *dp = dst;
4797
while ((cch = *sp++) != '\0') {
4799
if (!IS_EUC(cch, *sp)) {
4800
if (cch == 0xA0 && is_JIS) /* ignore NBSP */
4807
kana_mode = UCH(~kana_mode);
4808
dp = Strcpy(dp, toK);
4811
*dp++ = UCH(cch & ~0x80);
4812
*dp++ = UCH(*sp++ & ~0x80);
4816
kana_mode = UCH(~kana_mode);
4817
dp = Strcpy(dp, toA);
4823
dp = Strcpy(dp, toA);
4830
/*
 * Nonzero when both bytes lie strictly between 0x20 and 0x7F, i.e. each one
 * is in the range 0x21..0x7E.  Callers in this file use it to check a
 * candidate two-byte pair while in a JIS state.  Each argument is evaluated
 * up to twice, so pass expressions without side effects.
 */
#define IS_JIS7(c1,c2) \
	(((c1) > 0x20) && ((c1) < 0x7F) && \
	 ((c2) > 0x20) && ((c2) < 0x7F))
4831
/* Shift Out control code, written as Ctrl-N: 'N' - 0x40 is 0x0E in ASCII. */
#define SO ('N'-0x40)
4832
/* Shift In control code, written as Ctrl-O: 'O' - 0x40 is 0x0F in ASCII. */
#define SI ('O'-0x40)
4834
/*
 * Run-time switch, off by default.  When nonzero, TO_EUC will -- on seeing
 * the two-byte designator character while not already in a JIS state --
 * hand the following text to repairJIStoEUC to try to recover JIS input.
 * NOTE(review): presumably meant for JIS text whose ESC bytes were lost;
 * confirm against the full TO_EUC body.
 */
PUBLIC int repair_JIS = 0;
4836
PRIVATE CONST unsigned char *repairJIStoEUC ARGS2(
4837
CONST unsigned char *, src,
4838
unsigned char **, dstp)
4840
CONST unsigned char *s;
4841
unsigned char *d, ch1, ch2;
4845
while ((ch1 = s[0]) && (ch2 = s[1])) {
4848
if (ch2 == 'B' || ch2 == 'J') {
4852
if (!IS_JIS7(ch1, ch2))
4855
*d++ = UCH(0x80 | ch1);
4856
*d++ = UCH(0x80 | ch2);
4861
PUBLIC unsigned char *TO_EUC ARGS2(
4862
CONST unsigned char *, jis,
4863
unsigned char *, euc)
4865
register CONST unsigned char *s;
4866
register unsigned char c, jis_stat;
4868
register int to1B, to2B;
4869
register int in_sjis = 0;
4881
in_sjis = IS_SJIS_STR(jis);
4884
while ((c = *s++) != '\0') {
4886
continue; /* ignore it */
4887
if (c == 0xA0 && is_JIS)
4888
continue; /* ignore Non-breaking space */
4890
if (c == to2B && jis_stat == 0 && repair_JIS) {
4891
if (*s == 'B' || *s == '@') {
4892
CONST unsigned char *ts;
4893
if ((ts = repairJIStoEUC(s + 1, &d)) != NULL) {
4901
if ((s[1] == 'B') || (s[1] == '@')) {
4908
} else if (*s == to1B) {
4910
if ((s[1] == 'B') || (s[1] == 'J') || (s[1] == 'H')) {
4914
} else if (*s == ',') { /* MULE */
4921
if (IS_SJIS(c, *s, in_sjis)) {
4922
SJIS_TO_EUC1(c, *s, d);
4926
} else if (jis_stat) {
4927
if (c <= 0x20 || 0x7F <= c) {
4932
if (IS_JIS7(c, *s)) {
4933
*d++ = jis_stat | c;
4934
*d++ = jis_stat | *s++;
4939
if (n8bits == 0 && (c == SI || c == SO)) {
4949
/*
 * Nonzero when ch is at or below 0x20, or is exactly 0x7F.  For a 7-bit
 * value (callers mask with 0x7F first) that means ch falls outside the 94
 * positions 0x21..0x7E.  Values above 0x7F are not flagged.  The argument
 * may be evaluated twice.
 */
#define non94(ch) (((ch) < 0x21) || ((ch) == 0x7F))
4951
PRIVATE int is_EUC_JP ARGS1(unsigned char *, euc)
4956
for (cp = euc; (ch1 = *cp) != '\0'; cp++) {
4959
if ((ch2 & 0x80) == 0) {
4960
/* sv1log("NOT_EUC1[%x][%x]\n",ch1,ch2); */
4963
if (non94(ch1 & 0x7F) || non94(ch2 & 0x7F)) {
4964
/* sv1log("NOT_EUC2[%x][%x]\n",ch1,ch2); */
4973
PUBLIC void TO_SJIS ARGS2(
4974
CONST unsigned char *, any,
4975
unsigned char *, sjis)
4979
euc = malloc(strlen((CONST char *) any) + 1);
4982
outofmem(__FILE__, "TO_SJIS");
4986
EUC_TO_SJIS(euc, sjis);
4988
strcpy((char *) sjis, (CONST char *) any);
4992
PUBLIC void TO_JIS ARGS2(
4993
CONST unsigned char *, any,
4994
unsigned char *, jis)
5002
euc = malloc(strlen((CONST char *) any) + 1);
5005
outofmem(__FILE__, "TO_JIS");
5009
EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII);