2
* HTMLparser.c : an HTML 4.0 non-verifying parser
4
* See Copyright for the status of this software.
11
#ifdef LIBXML_HTML_ENABLED
20
#ifdef HAVE_SYS_STAT_H
33
#include <libxml/xmlmemory.h>
34
#include <libxml/tree.h>
35
#include <libxml/parser.h>
36
#include <libxml/parserInternals.h>
37
#include <libxml/xmlerror.h>
38
#include <libxml/HTMLparser.h>
39
#include <libxml/HTMLtree.h>
40
#include <libxml/entities.h>
41
#include <libxml/encoding.h>
42
#include <libxml/valid.h>
43
#include <libxml/xmlIO.h>
44
#include <libxml/globals.h>
45
#include <libxml/uri.h>
47
#define HTML_MAX_NAMELEN 1000
48
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49
#define HTML_PARSER_BUFFER_SIZE 100
52
/* #define DEBUG_PUSH */
54
static int htmlOmittedDefaultValue = 1;
56
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57
xmlChar end, xmlChar end2, xmlChar end3);
58
static void htmlParseComment(htmlParserCtxtPtr ctxt);
60
/************************************************************************
62
* Some factorized error routines *
64
************************************************************************/
68
* @ctxt: an HTML parser context
69
* @extra: extra information
71
* Handle an out-of-memory error
74
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
76
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77
(ctxt->instate == XML_PARSER_EOF))
80
ctxt->errNo = XML_ERR_NO_MEMORY;
81
ctxt->instate = XML_PARSER_EOF;
85
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86
XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
88
"Memory allocation failed : %s\n", extra);
90
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91
XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92
NULL, NULL, 0, 0, "Memory allocation failed\n");
97
* @ctxt: an HTML parser context
98
* @error: the error number
99
* @msg: the error message
100
* @str1: string information
101
* @str2: string information
103
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
106
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107
const char *msg, const xmlChar *str1, const xmlChar *str2)
109
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110
(ctxt->instate == XML_PARSER_EOF))
114
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115
XML_ERR_ERROR, NULL, 0,
116
(const char *) str1, (const char *) str2,
120
ctxt->wellFormed = 0;
125
* @ctxt: an HTML parser context
126
* @error: the error number
127
* @msg: the error message
130
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
133
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134
const char *msg, int val)
136
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137
(ctxt->instate == XML_PARSER_EOF))
141
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142
XML_ERR_ERROR, NULL, 0, NULL, NULL,
143
NULL, val, 0, msg, val);
145
ctxt->wellFormed = 0;
148
/************************************************************************
150
* Parser stacks related functions and macros *
152
************************************************************************/
156
* @ctxt: an HTML parser context
157
* @value: the element name
159
* Pushes a new element name on top of the name stack
161
* Returns 0 in case of error, the index in the stack otherwise
164
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
166
if (ctxt->nameNr >= ctxt->nameMax) {
168
ctxt->nameTab = (const xmlChar * *)
169
xmlRealloc((xmlChar * *)ctxt->nameTab,
171
sizeof(ctxt->nameTab[0]));
172
if (ctxt->nameTab == NULL) {
173
htmlErrMemory(ctxt, NULL);
177
ctxt->nameTab[ctxt->nameNr] = value;
179
return (ctxt->nameNr++);
183
* @ctxt: an HTML parser context
185
* Pops the top element name from the name stack
187
* Returns the name just removed
189
static const xmlChar *
190
htmlnamePop(htmlParserCtxtPtr ctxt)
194
if (ctxt->nameNr <= 0)
197
if (ctxt->nameNr < 0)
199
if (ctxt->nameNr > 0)
200
ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
203
ret = ctxt->nameTab[ctxt->nameNr];
204
ctxt->nameTab[ctxt->nameNr] = NULL;
209
* Macros for accessing the content. Those should be used only by the parser,
212
* Dirty macros, i.e. one needs to make assumptions on the context to use them
214
* CUR_PTR return the current pointer to the xmlChar to be parsed.
215
* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216
* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217
* in UNICODE mode. This should be used internally by the parser
218
* only to compare to ASCII values otherwise it would break when
219
* running with UTF-8 encoding.
220
* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
221
* to compare on ASCII based substring.
222
* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223
* it should be used only to compare on ASCII based substring.
224
* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
225
* strings without newlines within the parser.
227
* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
229
* CURRENT Returns the current char value, with the full decoding of
230
* UTF-8 if we are using this mode. It returns an int.
231
* NEXT Skip to the next character, this does the proper decoding
232
* in UTF-8 mode. It also pop-up unfinished entities on the fly.
233
* NEXTL(l) Skip the current unicode character of l xmlChars long.
234
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
237
#define UPPER (toupper(*ctxt->input->cur))
239
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
241
#define NXT(val) ctxt->input->cur[(val)]
243
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
245
#define CUR_PTR ctxt->input->cur
247
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
248
(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
249
xmlParserInputShrink(ctxt->input)
251
#define GROW if ((ctxt->progressive == 0) && \
252
(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
253
xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
255
#define CURRENT ((int) (*ctxt->input->cur))
257
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
259
/* Imported from XML */
261
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
262
#define CUR ((int) (*ctxt->input->cur))
263
#define NEXT xmlNextChar(ctxt)
265
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
266
#define NXT(val) ctxt->input->cur[(val)]
267
#define CUR_PTR ctxt->input->cur
270
#define NEXTL(l) do { \
271
if (*(ctxt->input->cur) == '\n') { \
272
ctxt->input->line++; ctxt->input->col = 1; \
273
} else ctxt->input->col++; \
274
ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
279
if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
280
if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
283
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
284
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
286
#define COPY_BUF(l,b,i,v) \
287
if (l == 1) b[i++] = (xmlChar) v; \
288
else i += xmlCopyChar(l,&b[i],v)
292
* @ctxt: the HTML parser context
293
* @len: pointer to the length of the char read
295
* The current char value, if using UTF-8 this may actually span multiple
296
* bytes in the input buffer. Implement the end of line normalization:
297
* 2.11 End-of-Line Handling
298
* If the encoding is unspecified, in the case we find an ISO-Latin-1
299
* char, then the encoding converter is plugged in automatically.
301
* Returns the current char value and its length
305
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306
if (ctxt->instate == XML_PARSER_EOF)
309
if (ctxt->token != 0) {
313
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
315
* We are supposed to handle UTF8, check it's valid
316
* From rfc2044: encoding of the Unicode values on UTF-8:
318
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
319
* 0000 0000-0000 007F 0xxxxxxx
320
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
323
* Check for the 0x110000 limit too
325
const unsigned char *cur = ctxt->input->cur;
332
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333
if ((cur[1] & 0xc0) != 0x80)
335
if ((c & 0xe0) == 0xe0) {
338
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339
if ((cur[2] & 0xc0) != 0x80)
341
if ((c & 0xf0) == 0xf0) {
343
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344
if (((c & 0xf8) != 0xf0) ||
345
((cur[3] & 0xc0) != 0x80))
349
val = (cur[0] & 0x7) << 18;
350
val |= (cur[1] & 0x3f) << 12;
351
val |= (cur[2] & 0x3f) << 6;
352
val |= cur[3] & 0x3f;
356
val = (cur[0] & 0xf) << 12;
357
val |= (cur[1] & 0x3f) << 6;
358
val |= cur[2] & 0x3f;
363
val = (cur[0] & 0x1f) << 6;
364
val |= cur[1] & 0x3f;
367
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368
"Char 0x%X out of allowed range\n", val);
374
return((int) *ctxt->input->cur);
378
* Assume it's a fixed length encoding (1) with
379
* a compatible encoding for the ASCII set, since
380
* XML constructs only use < 128 chars
383
if ((int) *ctxt->input->cur < 0x80)
384
return((int) *ctxt->input->cur);
387
* Humm this is bad, do an automatic flow conversion
389
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390
ctxt->charset = XML_CHAR_ENCODING_UTF8;
391
return(xmlCurrentChar(ctxt, len));
395
* If we detect a UTF-8 error, that probably means that the
396
* input encoding didn't get properly advertised in the
397
* declaration header. Report the error and switch the encoding
398
* to ISO-Latin-1 (if you don't like this policy, just declare the
404
if (ctxt->input->end - ctxt->input->cur >= 4) {
405
snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406
ctxt->input->cur[0], ctxt->input->cur[1],
407
ctxt->input->cur[2], ctxt->input->cur[3]);
409
snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
411
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412
"Input is not proper UTF-8, indicate encoding !\n",
413
BAD_CAST buffer, NULL);
416
ctxt->charset = XML_CHAR_ENCODING_8859_1;
418
return((int) *ctxt->input->cur);
422
* htmlSkipBlankChars:
423
* @ctxt: the HTML parser context
425
* Skip all blank characters found at that point in the input stream.
427
* Returns the number of space chars skipped
431
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
434
while (IS_BLANK_CH(*(ctxt->input->cur))) {
435
if ((*ctxt->input->cur == 0) &&
436
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
439
if (*(ctxt->input->cur) == '\n') {
440
ctxt->input->line++; ctxt->input->col = 1;
441
} else ctxt->input->col++;
444
if (*ctxt->input->cur == 0)
445
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
454
/************************************************************************
456
* The list of HTML elements and their properties *
458
************************************************************************/
461
* Start Tag: 1 means the start tag can be omitted
462
* End Tag: 1 means the end tag can be omitted
463
* 2 means it's forbidden (empty elements)
464
* 3 means the tag is stylistic and should be closed easily
465
* Depr: this element is deprecated
466
* DTD: 1 means that this element is valid only in the Loose DTD
467
* 2 means that this element is valid only in the Frameset DTD
469
* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
470
, subElements , impliedsubelt , Attributes, userdata
473
/* Definitions and a couple of vars for HTML Elements */
475
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
476
#define NB_FONTSTYLE 8
477
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
479
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
480
#define NB_SPECIAL 16
481
/*
 * Inline-level element names, built from the FONTSTYLE, PHRASE, SPECIAL and
 * FORMCTRL groups.  The groups must be comma-separated: adjacent string
 * literals are concatenated by the compiler, so without the commas the last
 * name of one group fuses with the first name of the next (for example
 * "small" "em" becomes the bogus element name "smallem").
 * NOTE(review): PCDATA is defined outside this view and is presumably an
 * empty placeholder, hence no comma after it -- confirm.
 */
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
482
#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
483
/*
 * Block-level element names: the HEADING and LIST groups followed by the
 * remaining block elements.  The comma after LIST is required; without it
 * the compiler concatenates the last LIST entry with the next literal and
 * "menu" "pre" becomes the bogus element name "menupre".
 */
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
484
#define NB_BLOCK NB_HEADING + NB_LIST + 14
485
#define FORMCTRL "input", "select", "textarea", "label", "button"
486
#define NB_FORMCTRL 5
489
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
491
#define LIST "ul", "ol", "dir", "menu"
494
#define NB_MODIFIER 0
495
#define FLOW BLOCK,INLINE
496
#define NB_FLOW NB_BLOCK + NB_INLINE
500
static const char* const html_flow[] = { FLOW, NULL } ;
501
static const char* const html_inline[] = { INLINE, NULL } ;
503
/* placeholders: elts with content but no subelements */
504
static const char* const html_pcdata[] = { NULL } ;
505
#define html_cdata html_pcdata
508
/* ... and for HTML Attributes */
510
#define COREATTRS "id", "class", "style", "title"
511
#define NB_COREATTRS 4
512
#define I18N "lang", "dir"
514
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
516
#define ATTRS COREATTRS,I18N,EVENTS
517
/*
 * Number of entries in ATTRS (COREATTRS + I18N + EVENTS).  Parenthesized so
 * the sum stays intact when the macro is used inside a larger expression.
 */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
518
#define CELLHALIGN "align", "char", "charoff"
519
#define NB_CELLHALIGN 3
520
#define CELLVALIGN "valign"
521
#define NB_CELLVALIGN 1
523
static const char* const html_attrs[] = { ATTRS, NULL } ;
524
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
525
static const char* const core_attrs[] = { COREATTRS, NULL } ;
526
static const char* const i18n_attrs[] = { I18N, NULL } ;
529
/* Other declarations that should go inline ... */
530
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
531
"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532
"tabindex", "onfocus", "onblur", NULL } ;
533
static const char* const target_attr[] = { "target", NULL } ;
534
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535
static const char* const alt_attr[] = { "alt", NULL } ;
536
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537
static const char* const href_attrs[] = { "href", NULL } ;
538
static const char* const clear_attrs[] = { "clear", NULL } ;
539
static const char* const inline_p[] = { INLINE, "p", NULL } ;
541
static const char* const flow_param[] = { FLOW, "param", NULL } ;
542
static const char* const applet_attrs[] = { COREATTRS , "codebase",
543
"archive", "alt", "name", "height", "width", "align",
544
"hspace", "vspace", NULL } ;
545
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
546
"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
547
static const char* const basefont_attrs[] =
548
{ "id", "size", "color", "face", NULL } ;
549
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552
static const char* const body_depr[] = { "background", "bgcolor", "text",
553
"link", "vlink", "alink", NULL } ;
554
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
555
"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
558
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559
static const char* const col_elt[] = { "col", NULL } ;
560
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563
static const char* const compact_attr[] = { "compact", NULL } ;
564
static const char* const label_attr[] = { "label", NULL } ;
565
static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575
static const char* const version_attr[] = { "version", NULL } ;
576
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
579
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
580
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584
static const char* const align_attr[] = { "align", NULL } ;
585
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587
static const char* const name_attr[] = { "name", NULL } ;
588
static const char* const action_attr[] = { "action", NULL } ;
589
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591
static const char* const content_attr[] = { "content", NULL } ;
592
static const char* const type_attr[] = { "type", NULL } ;
593
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594
static const char* const object_contents[] = { FLOW, "param", NULL } ;
595
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598
static const char* const option_elt[] = { "option", NULL } ;
599
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602
static const char* const width_attr[] = { "width", NULL } ;
603
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605
static const char* const language_attr[] = { "language", NULL } ;
606
static const char* const select_content[] = { "optgroup", "option", NULL } ;
607
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609
static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612
static const char* const tr_elt[] = { "tr", NULL } ;
613
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617
static const char* const tr_contents[] = { "th", "td", NULL } ;
618
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619
static const char* const li_elt[] = { "li", NULL } ;
620
static const char* const ul_depr[] = { "type", "compact", NULL} ;
621
static const char* const dir_attr[] = { "dir", NULL} ;
623
#define DECL (const char**)
625
static const htmlElemDesc
626
html40ElementTable[] = {
627
{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
628
DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
630
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
633
{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
634
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
636
{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
637
DECL inline_p , NULL , DECL html_attrs, NULL, NULL
639
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
640
DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
642
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643
EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
645
{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
646
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
648
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
649
EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
651
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
652
EMPTY , NULL , NULL, DECL basefont_attrs, NULL
654
{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655
DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
657
{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
658
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
660
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
661
DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
663
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
664
DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
666
{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
667
EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
669
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
670
DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
672
{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
673
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
675
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676
DECL html_flow , NULL , NULL, DECL html_attrs, NULL
678
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
679
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
681
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
684
{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
685
EMPTY , NULL , DECL col_attrs , NULL, NULL
687
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
688
DECL col_elt , "col" , DECL col_attrs , NULL, NULL
690
{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
691
DECL html_flow , NULL , DECL html_attrs, NULL, NULL
693
{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
694
DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
696
{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
697
DECL html_inline , NULL , DECL html_attrs, NULL, NULL
699
{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
700
DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
702
{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703
DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
705
{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
706
DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
708
{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
709
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
711
{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
712
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
714
{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
715
EMPTY, NULL, DECL embed_attrs, NULL, NULL
717
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
718
DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
720
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
721
DECL html_inline, NULL, NULL, DECL font_attrs, NULL
723
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
724
DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
726
{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727
EMPTY, NULL, NULL, DECL frame_attrs, NULL
729
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730
DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
732
{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
733
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
735
{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
736
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
738
{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
739
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
741
{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
742
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
744
{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
745
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
747
{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
748
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
750
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
751
DECL head_contents, NULL, DECL head_attrs, NULL, NULL
753
{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754
EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
756
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
757
DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
759
{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
760
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
762
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763
DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
765
{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
766
EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
768
{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
769
EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
771
{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
772
DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
774
{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775
EMPTY, NULL, NULL, DECL prompt_attrs, NULL
777
{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
780
{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
781
DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
783
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784
DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
786
{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
787
DECL html_flow, NULL, DECL html_attrs, NULL, NULL
789
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790
EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
792
{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
793
DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
795
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
796
DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
798
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799
EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
801
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802
DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
804
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805
DECL html_flow, "div", DECL html_attrs, NULL, NULL
807
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808
DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
810
{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
811
DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
813
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
814
DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
816
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817
DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
819
{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
820
DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822
{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
823
EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
825
{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826
DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
828
{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829
DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
831
{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832
DECL html_inline, NULL, NULL, DECL html_attrs, NULL
834
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
838
DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
840
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
841
DECL select_content, NULL, DECL select_attrs, NULL, NULL
843
{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
844
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
846
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
849
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
850
DECL html_inline, NULL, NULL, DECL html_attrs, NULL
852
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855
{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
856
DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
858
{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
859
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
861
{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
862
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
864
{ "table", 0, 0, 0, 0, 0, 0, 0, "",
865
DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
867
{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
868
DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
870
{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
871
DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
873
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874
DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
876
{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
877
DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
879
{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
880
DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
882
{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
883
DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
885
{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
886
DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
888
{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
889
DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
891
{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
894
{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
895
DECL html_inline, NULL, NULL, DECL html_attrs, NULL
897
{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
898
DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
900
{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901
DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906
* start tags that imply the end of current element
908
static const char * const htmlStartClose[] = {
909
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
910
"dl", "ul", "ol", "menu", "dir", "address", "pre",
911
"listing", "xmp", "head", NULL,
914
"body", "head", "style", "link", "title", "p", NULL,
915
"frameset", "head", "style", "link", "title", "p", NULL,
916
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
917
"pre", "listing", "xmp", "head", "li", NULL,
918
"hr", "p", "head", NULL,
919
"h1", "p", "head", NULL,
920
"h2", "p", "head", NULL,
921
"h3", "p", "head", NULL,
922
"h4", "p", "head", NULL,
923
"h5", "p", "head", NULL,
924
"h6", "p", "head", NULL,
925
"dir", "p", "head", NULL,
926
"address", "p", "head", "ul", NULL,
927
"pre", "p", "head", "ul", NULL,
928
"listing", "p", "head", NULL,
929
"xmp", "p", "head", NULL,
930
"blockquote", "p", "head", NULL,
931
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
933
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
935
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
937
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
938
"listing", "xmp", NULL,
939
"ol", "p", "head", "ul", NULL,
940
"menu", "p", "head", "ul", NULL,
941
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
942
"div", "p", "head", NULL,
943
"noscript", "p", "head", NULL,
944
"center", "font", "b", "i", "p", "head", NULL,
946
"caption", "p", NULL,
947
"colgroup", "caption", "colgroup", "col", "p", NULL,
948
"col", "caption", "col", "p", NULL,
949
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
950
"listing", "xmp", "a", NULL,
951
"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
952
"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
953
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
954
"thead", "caption", "col", "colgroup", NULL,
955
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
957
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
958
"tfoot", "tbody", "p", NULL,
959
"optgroup", "option", NULL,
960
"option", "option", NULL,
961
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
962
"pre", "listing", "xmp", "a", NULL,
967
* The list of HTML elements which are supposed not to have
968
* CDATA content and where a p element will be implied
970
* TODO: extend that list by reading the HTML SGML DTD on
973
static const char *const htmlNoContentElements[] = {
980
* The list of HTML attributes which are of content %Script;
981
* NOTE: when adding ones, check htmlIsScriptAttribute() since
982
* it assumes the name starts with 'on'
984
static const char *const htmlScriptAttributes[] = {
1006
* This table is used by the htmlparser to know what to do with
1007
* broken html pages. By assigning different priorities to different
1008
* elements the parser can decide how to handle extra endtags.
1009
* Endtags are only allowed to close elements with lower or equal
1018
static const elementPriority htmlEndPriority[] = {
1030
{NULL, 100} /* Default priority */
1033
static const char** htmlStartCloseIndex[100];
1034
static int htmlStartCloseIndexinitialized = 0;
1036
/************************************************************************
1038
* functions to handle HTML specific data *
1040
************************************************************************/
1043
* htmlInitAutoClose:
1045
* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046
* This is not reentrant. Call xmlInitParser() once before processing in
1047
* case of use in multithreaded programs.
1050
htmlInitAutoClose(void) {
1053
if (htmlStartCloseIndexinitialized) return;
1055
for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1057
while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1058
htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1059
while (htmlStartClose[i] != NULL) i++;
1062
htmlStartCloseIndexinitialized = 1;
1067
* @tag: The tag name in lowercase
1069
* Lookup the HTML tag in the ElementTable
1071
* Returns the related htmlElemDescPtr or NULL if not found.
1073
const htmlElemDesc *
1074
htmlTagLookup(const xmlChar *tag) {
1077
for (i = 0; i < (sizeof(html40ElementTable) /
1078
sizeof(html40ElementTable[0]));i++) {
1079
if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080
return((htmlElemDescPtr) &html40ElementTable[i]);
1086
* htmlGetEndPriority:
1087
* @name: The name of the element to look up the priority for.
1089
* Return value: The "endtag" priority.
1092
htmlGetEndPriority (const xmlChar *name) {
1095
while ((htmlEndPriority[i].name != NULL) &&
1096
(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1099
return(htmlEndPriority[i].priority);
1104
* htmlCheckAutoClose:
1105
* @newtag: The new tag name
1106
* @oldtag: The old tag name
1108
* Checks whether the new tag is one of the registered valid tags for
1110
* Initializes the htmlStartCloseIndex on first use (via htmlInitAutoClose()).
1112
* Returns 0 if no, 1 if yes.
1115
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1118
const char **closed = NULL;
1120
if (htmlStartCloseIndexinitialized == 0)
1121
htmlInitAutoClose();
1123
/* inefficient, but not a big deal */
1124
for (indx = 0; indx < 100; indx++) {
1125
closed = htmlStartCloseIndex[indx];
1128
if (xmlStrEqual(BAD_CAST * closed, newtag))
1132
i = closed - htmlStartClose;
1134
while (htmlStartClose[i] != NULL) {
1135
if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1144
* htmlAutoCloseOnClose:
1145
* @ctxt: an HTML parser context
1146
* @newtag: The new tag name
1147
* @force: force the tag closure
1149
* The HTML DTD allows an ending tag to implicitly close other tags.
1152
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1154
const htmlElemDesc *info;
1157
priority = htmlGetEndPriority(newtag);
1159
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1161
if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1164
* A misplaced endtag can only close elements with lower
1165
* or equal priority, so if we find an element with higher
1166
* priority before we find an element with
1167
* matching name, we just ignore this endtag
1169
if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1175
while (!xmlStrEqual(newtag, ctxt->name)) {
1176
info = htmlTagLookup(ctxt->name);
1177
if ((info != NULL) && (info->endTag == 3)) {
1178
htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179
"Opening and ending tag mismatch: %s and %s\n",
1180
newtag, ctxt->name);
1182
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183
ctxt->sax->endElement(ctxt->userData, ctxt->name);
1189
* htmlAutoCloseOnEnd:
1190
* @ctxt: an HTML parser context
1192
* Close all remaining tags at the end of the stream
1195
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1199
if (ctxt->nameNr == 0)
1201
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1202
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203
ctxt->sax->endElement(ctxt->userData, ctxt->name);
1210
* @ctxt: an HTML parser context
1211
* @newtag: The new tag name or NULL
1213
* The HTML DTD allows a tag to implicitly close other tags.
1214
* The list is kept in htmlStartClose array. This function is
1215
* called when a new tag has been detected and generates the
1216
* appropriate closes if possible/needed.
1217
* If newtag is NULL this means we are at the end of the resource
1218
* and we should check
1221
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1223
while ((newtag != NULL) && (ctxt->name != NULL) &&
1224
(htmlCheckAutoClose(newtag, ctxt->name))) {
1225
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226
ctxt->sax->endElement(ctxt->userData, ctxt->name);
1229
if (newtag == NULL) {
1230
htmlAutoCloseOnEnd(ctxt);
1233
while ((newtag == NULL) && (ctxt->name != NULL) &&
1234
((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1235
(xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1236
(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1237
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238
ctxt->sax->endElement(ctxt->userData, ctxt->name);
1245
* @doc: the HTML document
1246
* @name: The tag name
1247
* @elem: the HTML element
1249
* The HTML DTD allows a tag to implicitly close other tags.
1250
* The list is kept in htmlStartClose array. This function checks
1251
* if the element or one of its children would autoclose the
1254
* Returns 1 if autoclose, 0 otherwise
1257
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1260
if (elem == NULL) return(1);
1261
if (xmlStrEqual(name, elem->name)) return(0);
1262
if (htmlCheckAutoClose(elem->name, name)) return(1);
1263
child = elem->children;
1264
while (child != NULL) {
1265
if (htmlAutoCloseTag(doc, name, child)) return(1);
1266
child = child->next;
1273
* @doc: the HTML document
1274
* @elem: the HTML element
1276
* The HTML DTD allows a tag to implicitly close other tags.
1277
* The list is kept in htmlStartClose array. This function checks
1278
* if a tag is autoclosed by one of its children
1280
* Returns 1 if autoclosed, 0 otherwise
1283
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1286
if (elem == NULL) return(1);
1287
child = elem->children;
1288
while (child != NULL) {
1289
if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290
child = child->next;
1297
* @ctxt: an HTML parser context
1298
* @newtag: The new tag name
1300
* The HTML DTD allows a tag to exist only implicitly; this function is
1301
* called when a new tag has been detected and generates the
1302
* appropriate implicit tags if missing
1305
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306
if (!htmlOmittedDefaultValue)
1308
if (xmlStrEqual(newtag, BAD_CAST"html"))
1310
if (ctxt->nameNr <= 0) {
1311
htmlnamePush(ctxt, BAD_CAST"html");
1312
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313
ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1315
if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1317
if ((ctxt->nameNr <= 1) &&
1318
((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319
(xmlStrEqual(newtag, BAD_CAST"style")) ||
1320
(xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321
(xmlStrEqual(newtag, BAD_CAST"link")) ||
1322
(xmlStrEqual(newtag, BAD_CAST"title")) ||
1323
(xmlStrEqual(newtag, BAD_CAST"base")))) {
1325
* dropped OBJECT ... if you put it first BODY will be
1328
htmlnamePush(ctxt, BAD_CAST"head");
1329
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330
ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331
} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332
(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333
(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1335
for (i = 0;i < ctxt->nameNr;i++) {
1336
if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1339
if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1344
htmlnamePush(ctxt, BAD_CAST"body");
1345
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346
ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1351
* htmlCheckParagraph
1352
* @ctxt: an HTML parser context
1354
* Check whether a p element needs to be implied before inserting
1355
* characters in the current element.
1357
* Returns 1 if a paragraph has been inserted, 0 if not and -1
1362
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1370
htmlAutoClose(ctxt, BAD_CAST"p");
1371
htmlCheckImplied(ctxt, BAD_CAST"p");
1372
htmlnamePush(ctxt, BAD_CAST"p");
1373
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374
ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1377
if (!htmlOmittedDefaultValue)
1379
for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380
if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1381
htmlAutoClose(ctxt, BAD_CAST"p");
1382
htmlCheckImplied(ctxt, BAD_CAST"p");
1383
htmlnamePush(ctxt, BAD_CAST"p");
1384
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385
ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1393
* htmlIsScriptAttribute:
1394
* @name: an attribute name
1396
* Check if an attribute is of content type Script
1398
* Returns 1 if the attribute is a script attribute, 0 otherwise
1401
htmlIsScriptAttribute(const xmlChar *name) {
1407
* all script attributes start with 'on'
1409
if ((name[0] != 'o') || (name[1] != 'n'))
1412
i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1414
if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1420
/************************************************************************
1422
* The list of HTML predefined entities *
1424
************************************************************************/
1427
static const htmlEntityDesc html40EntitiesTable[] = {
1429
* the 4 absolute ones, plus apostrophe.
1431
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432
{ 38, "amp", "ampersand, U+0026 ISOnum" },
1433
{ 39, "apos", "single quote" },
1434
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1435
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1438
* A bunch still in the 128-255 range
1439
* Replacing them depend really on the charset used.
1441
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1442
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1444
{ 163, "pound","pound sign, U+00A3 ISOnum" },
1445
{ 164, "curren","currency sign, U+00A4 ISOnum" },
1446
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1447
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448
{ 167, "sect", "section sign, U+00A7 ISOnum" },
1449
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1451
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1452
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453
{ 172, "not", "not sign, U+00AC ISOnum" },
1454
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1456
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1458
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462
{ 181, "micro","micro sign, U+00B5 ISOnum" },
1463
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1467
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1468
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1490
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1497
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1517
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1521
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1522
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528
{ 247, "divide","division sign, U+00F7 ISOnum" },
1529
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1534
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1538
{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1540
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1545
* Anything below should really be kept as entities references
1547
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1549
{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1550
{ 732, "tilde","small tilde, U+02DC ISOdia" },
1552
{ 913, "Alpha","greek capital letter alpha, U+0391" },
1553
{ 914, "Beta", "greek capital letter beta, U+0392" },
1554
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1557
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1558
{ 919, "Eta", "greek capital letter eta, U+0397" },
1559
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560
{ 921, "Iota", "greek capital letter iota, U+0399" },
1561
{ 922, "Kappa","greek capital letter kappa, U+039A" },
1562
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1563
{ 924, "Mu", "greek capital letter mu, U+039C" },
1564
{ 925, "Nu", "greek capital letter nu, U+039D" },
1565
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1566
{ 927, "Omicron","greek capital letter omicron, U+039F" },
1567
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1568
{ 929, "Rho", "greek capital letter rho, U+03A1" },
1569
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570
{ 932, "Tau", "greek capital letter tau, U+03A4" },
1571
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1573
{ 935, "Chi", "greek capital letter chi, U+03A7" },
1574
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1575
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1577
{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1579
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1581
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1583
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1584
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1585
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1586
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1589
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1590
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1591
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1592
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1593
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1594
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1597
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1599
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1600
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1601
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1602
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1606
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1607
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1608
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1609
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1610
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1611
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1612
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1613
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1614
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1615
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1616
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1617
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1618
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1619
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1620
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1621
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1622
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1624
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1625
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1627
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1629
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1630
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1632
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1635
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1636
{ 8260, "frasl","fraction slash, U+2044 NEW" },
1638
{ 8364, "euro", "euro sign, U+20AC NEW" },
1640
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1643
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1644
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1646
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1647
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1648
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1649
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1650
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1652
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1653
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1654
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1655
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1657
{ 8704, "forall","for all, U+2200 ISOtech" },
1658
{ 8706, "part", "partial differential, U+2202 ISOtech" },
1659
{ 8707, "exist","there exists, U+2203 ISOtech" },
1660
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1661
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1662
{ 8712, "isin", "element of, U+2208 ISOtech" },
1663
{ 8713, "notin","not an element of, U+2209 ISOtech" },
1664
{ 8715, "ni", "contains as member, U+220B ISOtech" },
1665
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1666
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1667
{ 8722, "minus","minus sign, U+2212 ISOtech" },
1668
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1669
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1670
{ 8733, "prop", "proportional to, U+221D ISOtech" },
1671
{ 8734, "infin","infinity, U+221E ISOtech" },
1672
{ 8736, "ang", "angle, U+2220 ISOamso" },
1673
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1674
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1675
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1676
{ 8746, "cup", "union = cup, U+222A ISOtech" },
1677
{ 8747, "int", "integral, U+222B ISOtech" },
1678
{ 8756, "there4","therefore, U+2234 ISOtech" },
1679
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1680
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1681
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1683
{ 8801, "equiv","identical to, U+2261 ISOtech" },
1684
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1685
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1686
{ 8834, "sub", "subset of, U+2282 ISOtech" },
1687
{ 8835, "sup", "superset of, U+2283 ISOtech" },
1688
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1689
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1690
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1691
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1693
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1695
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1697
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1699
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1700
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1701
{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1703
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1704
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1705
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1706
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1710
/************************************************************************
1712
* Commodity functions to handle entities *
1714
************************************************************************/
1717
* Macro used to grow the current buffer.
1719
#define growBuffer(buffer) { \
1721
buffer##_size *= 2; \
1722
tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
1723
if (tmp == NULL) { \
1724
htmlErrMemory(ctxt, "growing buffer\n"); \
1733
* @name: the entity name
1735
* Lookup the given entity in EntitiesTable
1737
* TODO: the linear scan is really ugly, a hash table is really needed.
1739
* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1741
const htmlEntityDesc *
1742
htmlEntityLookup(const xmlChar *name) {
1745
for (i = 0;i < (sizeof(html40EntitiesTable)/
1746
sizeof(html40EntitiesTable[0]));i++) {
1747
if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1748
return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1755
* htmlEntityValueLookup:
1756
* @value: the entity's unicode value
1758
* Lookup the given entity in EntitiesTable
1760
* TODO: the linear scan is really ugly, a hash table is really needed.
1762
* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1764
const htmlEntityDesc *
1765
htmlEntityValueLookup(unsigned int value) {
1768
for (i = 0;i < (sizeof(html40EntitiesTable)/
1769
sizeof(html40EntitiesTable[0]));i++) {
1770
if (html40EntitiesTable[i].value >= value) {
1771
if (html40EntitiesTable[i].value > value)
1773
return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1781
* @out: a pointer to an array of bytes to store the result
1782
* @outlen: the length of @out
1783
* @in: a pointer to an array of UTF-8 chars
1784
* @inlen: the length of @in
1786
* Take a block of UTF-8 chars in and try to convert it to an ASCII
1787
* plus HTML entities block of chars out.
1789
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790
* The value of @inlen after return is the number of octets consumed
1791
* as the return value is positive, else unpredictable.
1792
* The value of @outlen after return is the number of octets produced.
1795
UTF8ToHtml(unsigned char* out, int *outlen,
1796
const unsigned char* in, int *inlen) {
1797
const unsigned char* processed = in;
1798
const unsigned char* outend;
1799
const unsigned char* outstart = out;
1800
const unsigned char* instart = in;
1801
const unsigned char* inend;
1805
if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1808
* initialization nothing to do
1814
inend = in + (*inlen);
1815
outend = out + (*outlen);
1816
while (in < inend) {
1818
if (d < 0x80) { c= d; trailing= 0; }
1819
else if (d < 0xC0) {
1820
/* trailing byte in leading position */
1821
*outlen = out - outstart;
1822
*inlen = processed - instart;
1824
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1825
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1826
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1828
/* no chance for this in Ascii */
1829
*outlen = out - outstart;
1830
*inlen = processed - instart;
1834
if (inend - in < trailing) {
1838
for ( ; trailing; trailing--) {
1839
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1845
/* assertion: c is a single UCS-4 value */
1847
if (out + 1 >= outend)
1852
const htmlEntityDesc * ent;
1857
* Try to lookup a predefined HTML entity for it
1860
ent = htmlEntityValueLookup(c);
1862
snprintf(nbuf, sizeof(nbuf), "#%u", c);
1868
if (out + 2 + len >= outend)
1871
memcpy(out, cp, len);
1877
*outlen = out - outstart;
1878
*inlen = processed - instart;
1883
* htmlEncodeEntities:
1884
* @out: a pointer to an array of bytes to store the result
1885
* @outlen: the length of @out
1886
* @in: a pointer to an array of UTF-8 chars
1887
* @inlen: the length of @in
1888
* @quoteChar: the quote character to escape (' or ") or zero.
1890
* Take a block of UTF-8 chars in and try to convert it to an ASCII
1891
* plus HTML entities block of chars out.
1893
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894
* The value of @inlen after return is the number of octets consumed
1895
* as the return value is positive, else unpredictable.
1896
* The value of @outlen after return is the number of octets produced.
1899
htmlEncodeEntities(unsigned char* out, int *outlen,
1900
const unsigned char* in, int *inlen, int quoteChar) {
1901
const unsigned char* processed = in;
1902
const unsigned char* outend;
1903
const unsigned char* outstart = out;
1904
const unsigned char* instart = in;
1905
const unsigned char* inend;
1909
if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1911
outend = out + (*outlen);
1912
inend = in + (*inlen);
1913
while (in < inend) {
1915
if (d < 0x80) { c= d; trailing= 0; }
1916
else if (d < 0xC0) {
1917
/* trailing byte in leading position */
1918
*outlen = out - outstart;
1919
*inlen = processed - instart;
1921
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1922
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1923
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1925
/* no chance for this in Ascii */
1926
*outlen = out - outstart;
1927
*inlen = processed - instart;
1931
if (inend - in < trailing)
1934
while (trailing--) {
1935
if (((d= *in++) & 0xC0) != 0x80) {
1936
*outlen = out - outstart;
1937
*inlen = processed - instart;
1944
/* assertion: c is a single UCS-4 value */
1945
if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946
(c != '&') && (c != '<') && (c != '>')) {
1951
const htmlEntityDesc * ent;
1957
* Try to lookup a predefined HTML entity for it
1959
ent = htmlEntityValueLookup(c);
1961
snprintf(nbuf, sizeof(nbuf), "#%u", c);
1967
if (out + 2 + len > outend)
1970
memcpy(out, cp, len);
1976
*outlen = out - outstart;
1977
*inlen = processed - instart;
1981
/************************************************************************
1983
* Commodity functions to handle streams *
1985
************************************************************************/
1988
* htmlNewInputStream:
1989
* @ctxt: an HTML parser context
1991
* Create a new input stream structure
1992
* Returns the new input stream or NULL
1994
static htmlParserInputPtr
1995
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996
htmlParserInputPtr input;
1998
input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999
if (input == NULL) {
2000
htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2003
memset(input, 0, sizeof(htmlParserInput));
2004
input->filename = NULL;
2005
input->directory = NULL;
2013
input->version = NULL;
2014
input->consumed = 0;
2020
/************************************************************************
2022
* Commodity functions, cleanup needed ? *
2024
************************************************************************/
2026
* all tags allowing pc data from the html 4.01 loose dtd
2027
* NOTE: it might be more appropriate to integrate this information
2028
* into the html40ElementTable array but I don't want to risk any
2029
* binary incompatibility
2031
static const char *allowPCData[] = {
2032
"a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
2033
"blockquote", "body", "button", "caption", "center", "cite", "code",
2034
"dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
2035
"h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
2036
"li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
2037
"small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
2042
* @ctxt: an HTML parser context
2044
* @len: the size of @str
2046
* Is this a sequence of blank chars that one can ignore ?
2048
* Returns 1 if ignorable 0 otherwise.
2051
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2054
xmlNodePtr lastChild;
2057
for (j = 0;j < len;j++)
2058
if (!(IS_BLANK_CH(str[j]))) return(0);
2060
if (CUR == 0) return(1);
2061
if (CUR != '<') return(0);
2062
if (ctxt->name == NULL)
2064
if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2066
if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2069
/* Only strip CDATA children of the body tag for strict HTML DTDs */
2070
if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071
dtd = xmlGetIntSubset(ctxt->myDoc);
2072
if (dtd != NULL && dtd->ExternalID != NULL) {
2073
if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2074
!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2079
if (ctxt->node == NULL) return(0);
2080
lastChild = xmlGetLastChild(ctxt->node);
2081
while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082
lastChild = lastChild->prev;
2083
if (lastChild == NULL) {
2084
if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085
(ctxt->node->content != NULL)) return(0);
2086
/* keep ws in constructs like ...<b> </b>...
2087
for all tags "b" allowing PCDATA */
2088
for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089
if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2093
} else if (xmlNodeIsText(lastChild)) {
2096
/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097
for all tags "p" allowing PCDATA */
2098
for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099
if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2109
* @URI: URI for the dtd, or NULL
2110
* @ExternalID: the external ID of the DTD, or NULL
2112
* Creates a new HTML document without a DTD node if @URI and @ExternalID
2115
* Returns a new document, do not initialize the DTD if not provided
2118
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2122
* Allocate a new document and fill the fields.
2124
cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2126
htmlErrMemory(NULL, "HTML document creation failed\n");
2129
memset(cur, 0, sizeof(xmlDoc));
2131
cur->type = XML_HTML_DOCUMENT_NODE;
2132
cur->version = NULL;
2133
cur->intSubset = NULL;
2136
cur->children = NULL;
2137
cur->extSubset = NULL;
2139
cur->encoding = NULL;
2140
cur->standalone = 1;
2141
cur->compression = 0;
2144
cur->_private = NULL;
2145
cur->charset = XML_CHAR_ENCODING_UTF8;
2146
if ((ExternalID != NULL) ||
2148
xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2154
* @URI: URI for the dtd, or NULL
2155
* @ExternalID: the external ID of the DTD, or NULL
2157
* Creates a new HTML document
2159
* Returns a new document
2162
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2163
if ((URI == NULL) && (ExternalID == NULL))
2164
return(htmlNewDocNoDtD(
2165
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2166
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2168
return(htmlNewDocNoDtD(URI, ExternalID));
2172
/************************************************************************
2174
* The parser itself *
2175
* Relates to http://www.w3.org/TR/html40 *
2177
************************************************************************/
2179
/************************************************************************
2181
* The parser itself *
2183
************************************************************************/
2185
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2188
* htmlParseHTMLName:
2189
* @ctxt: an HTML parser context
2191
* parse an HTML tag or attribute name, note that we convert it to lowercase
2192
* since HTML names are not case-sensitive.
2194
* Returns the Tag Name parsed or NULL
2197
static const xmlChar *
2198
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2200
xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2202
if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2203
(CUR != ':')) return(NULL);
2205
while ((i < HTML_PARSER_BUFFER_SIZE) &&
2206
((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2207
(CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2208
if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2215
return(xmlDictLookup(ctxt->dict, loc, i));
2220
* htmlParseHTMLName_nonInvasive:
2221
* @ctxt: an HTML parser context
2223
* parse an HTML tag or attribute name, note that we convert it to lowercase
2224
* since HTML names are not case-sensitive, this doesn't consume the data
2225
* from the stream, it's a look-ahead
2227
* Returns the Tag Name parsed or NULL
2230
static const xmlChar *
2231
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2233
xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2235
if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2236
(NXT(1) != ':')) return(NULL);
2238
while ((i < HTML_PARSER_BUFFER_SIZE) &&
2239
((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2240
(NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2241
if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2242
else loc[i] = NXT(1+i);
2246
return(xmlDictLookup(ctxt->dict, loc, i));
2252
* @ctxt: an HTML parser context
2254
* parse an HTML name, this routine is case sensitive.
2256
* Returns the Name parsed or NULL
2259
static const xmlChar *
2260
htmlParseName(htmlParserCtxtPtr ctxt) {
2268
* Accelerator for simple ASCII names
2270
in = ctxt->input->cur;
2271
if (((*in >= 0x61) && (*in <= 0x7A)) ||
2272
((*in >= 0x41) && (*in <= 0x5A)) ||
2273
(*in == '_') || (*in == ':')) {
2275
while (((*in >= 0x61) && (*in <= 0x7A)) ||
2276
((*in >= 0x41) && (*in <= 0x5A)) ||
2277
((*in >= 0x30) && (*in <= 0x39)) ||
2278
(*in == '_') || (*in == '-') ||
2279
(*in == ':') || (*in == '.'))
2281
if ((*in > 0) && (*in < 0x80)) {
2282
count = in - ctxt->input->cur;
2283
ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2284
ctxt->input->cur = in;
2285
ctxt->nbChars += count;
2286
ctxt->input->col += count;
2290
return(htmlParseNameComplex(ctxt));
2293
static const xmlChar *
2294
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2300
* Handler for more complex cases
2304
if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2305
(!IS_LETTER(c) && (c != '_') &&
2310
while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2311
((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2312
(c == '.') || (c == '-') ||
2313
(c == '_') || (c == ':') ||
2314
(IS_COMBINING(c)) ||
2315
(IS_EXTENDER(c)))) {
2316
if (count++ > 100) {
2324
return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2329
* htmlParseHTMLAttribute:
2330
* @ctxt: an HTML parser context
2331
* @stop: a char stop value
2333
* parse an HTML attribute value till the stop (quote), if
2334
* stop is 0 then it stops at the first space
2336
* Returns the attribute parsed or NULL
2340
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2341
xmlChar *buffer = NULL;
2342
int buffer_size = 0;
2343
xmlChar *out = NULL;
2344
const xmlChar *name = NULL;
2345
const xmlChar *cur = NULL;
2346
const htmlEntityDesc * ent;
2349
* allocate a translation buffer.
2351
buffer_size = HTML_PARSER_BUFFER_SIZE;
2352
buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2353
if (buffer == NULL) {
2354
htmlErrMemory(ctxt, "buffer allocation failed\n");
2360
* Ok loop until we reach one of the ending chars
2362
while ((CUR != 0) && (CUR != stop)) {
2363
if ((stop == 0) && (CUR == '>')) break;
2364
if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2366
if (NXT(1) == '#') {
2370
c = htmlParseCharRef(ctxt);
2372
{ *out++ = c; bits= -6; }
2374
{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2375
else if (c < 0x10000)
2376
{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2378
{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2380
for ( ; bits >= 0; bits-= 6) {
2381
*out++ = ((c >> bits) & 0x3F) | 0x80;
2384
if (out - buffer > buffer_size - 100) {
2385
int indx = out - buffer;
2388
out = &buffer[indx];
2391
ent = htmlParseEntityRef(ctxt, &name);
2394
if (out - buffer > buffer_size - 100) {
2395
int indx = out - buffer;
2398
out = &buffer[indx];
2400
} else if (ent == NULL) {
2404
if (out - buffer > buffer_size - 100) {
2405
int indx = out - buffer;
2408
out = &buffer[indx];
2416
if (out - buffer > buffer_size - 100) {
2417
int indx = out - buffer;
2420
out = &buffer[indx];
2424
{ *out++ = c; bits= -6; }
2426
{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2427
else if (c < 0x10000)
2428
{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2430
{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2432
for ( ; bits >= 0; bits-= 6) {
2433
*out++ = ((c >> bits) & 0x3F) | 0x80;
2441
if (out - buffer > buffer_size - 100) {
2442
int indx = out - buffer;
2445
out = &buffer[indx];
2449
{ *out++ = c; bits= -6; }
2451
{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2452
else if (c < 0x10000)
2453
{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2455
{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2457
for ( ; bits >= 0; bits-= 6) {
2458
*out++ = ((c >> bits) & 0x3F) | 0x80;
2468
* htmlParseEntityRef:
2469
* @ctxt: an HTML parser context
2470
* @str: location to store the entity name
2472
* parse an HTML ENTITY references
2474
* [68] EntityRef ::= '&' Name ';'
2476
* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2477
* if non-NULL *str will have to be freed by the caller.
2479
const htmlEntityDesc *
2480
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2481
const xmlChar *name;
2482
const htmlEntityDesc * ent = NULL;
2484
if (str != NULL) *str = NULL;
2485
if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2489
name = htmlParseName(ctxt);
2491
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2492
"htmlParseEntityRef: no name\n", NULL, NULL);
2500
* Lookup the entity in the table.
2502
ent = htmlEntityLookup(name);
2503
if (ent != NULL) /* OK that's ugly !!! */
2506
htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2507
"htmlParseEntityRef: expecting ';'\n",
2518
* htmlParseAttValue:
2519
* @ctxt: an HTML parser context
2521
* parse a value for an attribute
2522
* Note: the parser won't do substitution of entities here, this
2523
* will be handled later in xmlStringGetNodeList, unless it was
2524
* asked for ctxt->replaceEntities != 0
2526
* Returns the AttValue parsed or NULL.
2530
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2531
xmlChar *ret = NULL;
2535
ret = htmlParseHTMLAttribute(ctxt, '"');
2537
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2538
"AttValue: \" expected\n", NULL, NULL);
2541
} else if (CUR == '\'') {
2543
ret = htmlParseHTMLAttribute(ctxt, '\'');
2545
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2546
"AttValue: ' expected\n", NULL, NULL);
2551
* That's an HTMLism, the attribute value may not be quoted
2553
ret = htmlParseHTMLAttribute(ctxt, 0);
2555
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2556
"AttValue: no value found\n", NULL, NULL);
2563
* htmlParseSystemLiteral:
2564
* @ctxt: an HTML parser context
2566
* parse an HTML Literal
2568
* [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2570
* Returns the SystemLiteral parsed or NULL
2574
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2576
xmlChar *ret = NULL;
2581
while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2583
if (!IS_CHAR_CH(CUR)) {
2584
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2585
"Unfinished SystemLiteral\n", NULL, NULL);
2587
ret = xmlStrndup(q, CUR_PTR - q);
2590
} else if (CUR == '\'') {
2593
while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2595
if (!IS_CHAR_CH(CUR)) {
2596
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2597
"Unfinished SystemLiteral\n", NULL, NULL);
2599
ret = xmlStrndup(q, CUR_PTR - q);
2603
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2604
" or ' expected\n", NULL, NULL);
2611
* htmlParsePubidLiteral:
2612
* @ctxt: an HTML parser context
2614
* parse an HTML public literal
2616
* [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2618
* Returns the PubidLiteral parsed or NULL.
2622
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2624
xmlChar *ret = NULL;
2626
* Name ::= (Letter | '_') (NameChar)*
2631
while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2633
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2634
"Unfinished PubidLiteral\n", NULL, NULL);
2636
ret = xmlStrndup(q, CUR_PTR - q);
2639
} else if (CUR == '\'') {
2642
while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2645
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2646
"Unfinished PubidLiteral\n", NULL, NULL);
2648
ret = xmlStrndup(q, CUR_PTR - q);
2652
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2653
"PubidLiteral \" or ' expected\n", NULL, NULL);
2661
* @ctxt: an HTML parser context
2663
* parse the content of an HTML SCRIPT or STYLE element
2664
* http://www.w3.org/TR/html4/sgml/dtd.html#Script
2665
* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2666
* http://www.w3.org/TR/html4/types.html#type-script
2667
* http://www.w3.org/TR/html4/types.html#h-6.15
2668
* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2670
* Script data ( %Script; in the DTD) can be the content of the SCRIPT
2671
* element and the value of intrinsic event attributes. User agents must
2672
* not evaluate script data as HTML markup but instead must pass it on as
2673
* data to a script engine.
2675
* - The content is passed like CDATA
2676
* - the attributes for style and scripting "onXXX" are also described
2677
* as CDATA but SGML allows entities references in attributes so their
2678
* processing is identical as other attributes
2681
htmlParseScript(htmlParserCtxtPtr ctxt) {
2682
xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2688
while (IS_CHAR_CH(cur)) {
2689
if ((cur == '<') && (NXT(1) == '/')) {
2691
* One should break here, the specification is clear:
2692
* Authors should therefore escape "</" within the content.
2693
* Escape mechanisms are specific to each scripting or
2694
* style sheet language.
2696
* In recovery mode, only break if end tag match the
2697
* current tag, effectively ignoring all tags inside the
2698
* script/style block and treating the entire block as
2701
if (ctxt->recovery) {
2702
if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2703
xmlStrlen(ctxt->name)) == 0)
2707
htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2708
"Element %s embeds close tag\n",
2712
if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2713
((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2719
COPY_BUF(l,buf,nbchar,cur);
2720
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2721
if (ctxt->sax->cdataBlock!= NULL) {
2723
* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2725
ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2726
} else if (ctxt->sax->characters != NULL) {
2727
ctxt->sax->characters(ctxt->userData, buf, nbchar);
2736
if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2737
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2738
"Invalid char in CDATA 0x%X\n", cur);
2742
if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2743
if (ctxt->sax->cdataBlock!= NULL) {
2745
* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2747
ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2748
} else if (ctxt->sax->characters != NULL) {
2749
ctxt->sax->characters(ctxt->userData, buf, nbchar);
2756
* htmlParseCharData:
2757
* @ctxt: an HTML parser context
2759
* parse a CharData section.
2760
* if we are within a CDATA section ']]>' marks an end of section.
2762
* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2766
htmlParseCharData(htmlParserCtxtPtr ctxt) {
2767
xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2773
while (((cur != '<') || (ctxt->token == '<')) &&
2774
((cur != '&') || (ctxt->token == '&')) &&
2776
if (!(IS_CHAR(cur))) {
2777
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2778
"Invalid char in CDATA 0x%X\n", cur);
2780
COPY_BUF(l,buf,nbchar,cur);
2782
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2784
* Ok the segment is to be consumed as chars.
2786
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2787
if (areBlanks(ctxt, buf, nbchar)) {
2788
if (ctxt->sax->ignorableWhitespace != NULL)
2789
ctxt->sax->ignorableWhitespace(ctxt->userData,
2792
htmlCheckParagraph(ctxt);
2793
if (ctxt->sax->characters != NULL)
2794
ctxt->sax->characters(ctxt->userData, buf, nbchar);
2811
* Ok the segment is to be consumed as chars.
2813
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2814
if (areBlanks(ctxt, buf, nbchar)) {
2815
if (ctxt->sax->ignorableWhitespace != NULL)
2816
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2818
htmlCheckParagraph(ctxt);
2819
if (ctxt->sax->characters != NULL)
2820
ctxt->sax->characters(ctxt->userData, buf, nbchar);
2828
ctxt->instate = XML_PARSER_EOF;
2833
* htmlParseExternalID:
2834
* @ctxt: an HTML parser context
2835
* @publicID: a xmlChar** receiving PubidLiteral
2837
* Parse an External ID or a Public ID
2839
* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2840
* | 'PUBLIC' S PubidLiteral S SystemLiteral
2842
* [83] PublicID ::= 'PUBLIC' S PubidLiteral
2844
* Returns the SystemLiteral, and in the second
2845
* case publicID receives PubidLiteral; if strict is off
2846
* it is possible to return NULL and have publicID set.
2850
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2851
xmlChar *URI = NULL;
2853
if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2854
(UPP(2) == 'S') && (UPP(3) == 'T') &&
2855
(UPP(4) == 'E') && (UPP(5) == 'M')) {
2857
if (!IS_BLANK_CH(CUR)) {
2858
htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2859
"Space required after 'SYSTEM'\n", NULL, NULL);
2862
URI = htmlParseSystemLiteral(ctxt);
2864
htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2865
"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2867
} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2868
(UPP(2) == 'B') && (UPP(3) == 'L') &&
2869
(UPP(4) == 'I') && (UPP(5) == 'C')) {
2871
if (!IS_BLANK_CH(CUR)) {
2872
htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2873
"Space required after 'PUBLIC'\n", NULL, NULL);
2876
*publicID = htmlParsePubidLiteral(ctxt);
2877
if (*publicID == NULL) {
2878
htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2879
"htmlParseExternalID: PUBLIC, no Public Identifier\n",
2883
if ((CUR == '"') || (CUR == '\'')) {
2884
URI = htmlParseSystemLiteral(ctxt);
2892
* @ctxt: an HTML parser context
2894
* parse an XML Processing Instruction.
2896
* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2899
htmlParsePI(htmlParserCtxtPtr ctxt) {
2900
xmlChar *buf = NULL;
2902
int size = HTML_PARSER_BUFFER_SIZE;
2904
const xmlChar *target;
2905
xmlParserInputState state;
2908
if ((RAW == '<') && (NXT(1) == '?')) {
2909
state = ctxt->instate;
2910
ctxt->instate = XML_PARSER_PI;
2912
* this is a Processing Instruction.
2918
* Parse the target name and check for special support like
2921
target = htmlParseName(ctxt);
2922
if (target != NULL) {
2929
if ((ctxt->sax) && (!ctxt->disableSAX) &&
2930
(ctxt->sax->processingInstruction != NULL))
2931
ctxt->sax->processingInstruction(ctxt->userData,
2933
ctxt->instate = state;
2936
buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2938
htmlErrMemory(ctxt, NULL);
2939
ctxt->instate = state;
2943
if (!IS_BLANK(cur)) {
2944
htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2945
"ParsePI: PI %s space expected\n", target, NULL);
2949
while (IS_CHAR(cur) && (cur != '>')) {
2950
if (len + 5 >= size) {
2954
tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2956
htmlErrMemory(ctxt, NULL);
2958
ctxt->instate = state;
2968
COPY_BUF(l,buf,len,cur);
2979
htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2980
"ParsePI: PI %s never end ...\n", target, NULL);
2987
if ((ctxt->sax) && (!ctxt->disableSAX) &&
2988
(ctxt->sax->processingInstruction != NULL))
2989
ctxt->sax->processingInstruction(ctxt->userData,
2994
htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2995
"PI is not started correctly", NULL, NULL);
2997
ctxt->instate = state;
3003
* @ctxt: an HTML parser context
3005
* Parse an XML (SGML) comment <!-- .... -->
3007
* [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3010
htmlParseComment(htmlParserCtxtPtr ctxt) {
3011
xmlChar *buf = NULL;
3013
int size = HTML_PARSER_BUFFER_SIZE;
3017
xmlParserInputState state;
3020
* Check that there is a comment right here.
3022
if ((RAW != '<') || (NXT(1) != '!') ||
3023
(NXT(2) != '-') || (NXT(3) != '-')) return;
3025
state = ctxt->instate;
3026
ctxt->instate = XML_PARSER_COMMENT;
3029
buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3031
htmlErrMemory(ctxt, "buffer allocation failed\n");
3032
ctxt->instate = state;
3041
while (IS_CHAR(cur) &&
3043
(r != '-') || (q != '-'))) {
3044
if (len + 5 >= size) {
3048
tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3051
htmlErrMemory(ctxt, "growing buffer failed\n");
3052
ctxt->instate = state;
3057
COPY_BUF(ql,buf,len,q);
3071
if (!IS_CHAR(cur)) {
3072
htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3073
"Comment not terminated \n<!--%.50s\n", buf, NULL);
3077
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3078
(!ctxt->disableSAX))
3079
ctxt->sax->comment(ctxt->userData, buf);
3082
ctxt->instate = state;
3087
* @ctxt: an HTML parser context
3089
* parse Reference declarations
3091
* [66] CharRef ::= '&#' [0-9]+ ';' |
3092
* '&#x' [0-9a-fA-F]+ ';'
3094
* Returns the value parsed (as an int)
3097
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3100
if ((ctxt == NULL) || (ctxt->input == NULL)) {
3101
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3102
"htmlParseCharRef: context error\n",
3106
if ((CUR == '&') && (NXT(1) == '#') &&
3107
((NXT(2) == 'x') || NXT(2) == 'X')) {
3109
while (CUR != ';') {
3110
if ((CUR >= '0') && (CUR <= '9'))
3111
val = val * 16 + (CUR - '0');
3112
else if ((CUR >= 'a') && (CUR <= 'f'))
3113
val = val * 16 + (CUR - 'a') + 10;
3114
else if ((CUR >= 'A') && (CUR <= 'F'))
3115
val = val * 16 + (CUR - 'A') + 10;
3117
htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3118
"htmlParseCharRef: missing semicolumn\n",
3126
} else if ((CUR == '&') && (NXT(1) == '#')) {
3128
while (CUR != ';') {
3129
if ((CUR >= '0') && (CUR <= '9'))
3130
val = val * 10 + (CUR - '0');
3132
htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3133
"htmlParseCharRef: missing semicolumn\n",
3142
htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3143
"htmlParseCharRef: invalid value\n", NULL, NULL);
3146
* Check the value IS_CHAR ...
3151
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3152
"htmlParseCharRef: invalid xmlChar value %d\n",
3160
* htmlParseDocTypeDecl:
3161
* @ctxt: an HTML parser context
3163
* parse a DOCTYPE declaration
3165
* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3166
* ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3170
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3171
const xmlChar *name;
3172
xmlChar *ExternalID = NULL;
3173
xmlChar *URI = NULL;
3176
* We know that '<!DOCTYPE' has been detected.
3183
* Parse the DOCTYPE name.
3185
name = htmlParseName(ctxt);
3187
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3188
"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3192
* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3198
* Check for SystemID and ExternalID
3200
URI = htmlParseExternalID(ctxt, &ExternalID);
3204
* We should be at the end of the DOCTYPE declaration.
3207
htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3208
"DOCTYPE improperly terminated\n", NULL, NULL);
3209
/* We shouldn't try to resynchronize ... */
3214
* Create or update the document according to the DOCTYPE
3216
if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3217
(!ctxt->disableSAX))
3218
ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3221
* Cleanup, since we don't use all those identifiers
3223
if (URI != NULL) xmlFree(URI);
3224
if (ExternalID != NULL) xmlFree(ExternalID);
3228
* htmlParseAttribute:
3229
* @ctxt: an HTML parser context
3230
* @value: a xmlChar ** used to store the value of the attribute
3232
* parse an attribute
3234
* [41] Attribute ::= Name Eq AttValue
3236
* [25] Eq ::= S? '=' S?
3240
* [NS 11] Attribute ::= QName Eq AttValue
3242
* Also the case QName == xmlns:??? is handled independently as a namespace
3245
* Returns the attribute name, and the value in *value.
3248
static const xmlChar *
3249
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3250
const xmlChar *name;
3251
xmlChar *val = NULL;
3254
name = htmlParseHTMLName(ctxt);
3256
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3257
"error parsing attribute name\n", NULL, NULL);
3268
val = htmlParseAttValue(ctxt);
3269
} else if (htmlIsBooleanAttr(name)) {
3271
* assume a minimized attribute
3273
val = xmlStrdup(name);
3281
* htmlCheckEncoding:
3282
* @ctxt: an HTML parser context
3283
* @attvalue: the attribute value
3285
* Checks an http-equiv attribute from a Meta tag to detect
3287
* If a new encoding is detected the parser is switched to decode
3291
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3292
const xmlChar *encoding;
3294
if ((ctxt == NULL) || (attvalue == NULL))
3297
/* do not change encoding */
3298
if (ctxt->input->encoding != NULL)
3301
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3302
if (encoding != NULL) {
3305
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3306
if (encoding != NULL)
3309
if (encoding != NULL) {
3310
xmlCharEncoding enc;
3311
xmlCharEncodingHandlerPtr handler;
3313
while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3315
if (ctxt->input->encoding != NULL)
3316
xmlFree((xmlChar *) ctxt->input->encoding);
3317
ctxt->input->encoding = xmlStrdup(encoding);
3319
enc = xmlParseCharEncoding((const char *) encoding);
3321
* registered set of known encodings
3323
if (enc != XML_CHAR_ENCODING_ERROR) {
3324
if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3325
(enc == XML_CHAR_ENCODING_UTF16BE) ||
3326
(enc == XML_CHAR_ENCODING_UCS4LE) ||
3327
(enc == XML_CHAR_ENCODING_UCS4BE)) &&
3328
(ctxt->input->buf != NULL) &&
3329
(ctxt->input->buf->encoder == NULL)) {
3330
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3331
"htmlCheckEncoding: wrong encoding meta\n",
3334
xmlSwitchEncoding(ctxt, enc);
3336
ctxt->charset = XML_CHAR_ENCODING_UTF8;
3339
* fallback for unknown encodings
3341
handler = xmlFindCharEncodingHandler((const char *) encoding);
3342
if (handler != NULL) {
3343
xmlSwitchToEncoding(ctxt, handler);
3344
ctxt->charset = XML_CHAR_ENCODING_UTF8;
3346
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3350
if ((ctxt->input->buf != NULL) &&
3351
(ctxt->input->buf->encoder != NULL) &&
3352
(ctxt->input->buf->raw != NULL) &&
3353
(ctxt->input->buf->buffer != NULL)) {
3358
* convert as much as possible to the parser reading buffer.
3360
processed = ctxt->input->cur - ctxt->input->base;
3361
xmlBufferShrink(ctxt->input->buf->buffer, processed);
3362
nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3363
ctxt->input->buf->buffer,
3364
ctxt->input->buf->raw);
3366
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3367
"htmlCheckEncoding: encoder error\n",
3371
ctxt->input->cur = ctxt->input->buf->buffer->content;
3378
* @ctxt: an HTML parser context
3379
* @atts: the attributes values
3381
* Checks the attributes from a Meta tag
3384
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3386
const xmlChar *att, *value;
3388
const xmlChar *content = NULL;
3390
if ((ctxt == NULL) || (atts == NULL))
3395
while (att != NULL) {
3397
if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3398
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3400
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3404
if ((http) && (content != NULL))
3405
htmlCheckEncoding(ctxt, content);
3410
* htmlParseStartTag:
3411
* @ctxt: an HTML parser context
3413
* parse a start of tag either for rule element or
3414
* EmptyElement. In both cases we don't parse the tag closing chars.
3416
* [40] STag ::= '<' Name (S Attribute)* S? '>'
3418
* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3422
* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3424
* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3426
* Returns 0 in case of success, -1 in case of error and 1 if discarded
3430
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3431
const xmlChar *name;
3432
const xmlChar *attname;
3434
const xmlChar **atts;
3441
if ((ctxt == NULL) || (ctxt->input == NULL)) {
3442
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3443
"htmlParseStartTag: context error\n", NULL, NULL);
3446
if (CUR != '<') return -1;
3450
maxatts = ctxt->maxatts;
3453
name = htmlParseHTMLName(ctxt);
3455
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3456
"htmlParseStartTag: invalid element name\n",
3458
/* Dump the bogus tag like browsers do */
3459
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3463
if (xmlStrEqual(name, BAD_CAST"meta"))
3467
* Check for auto-closure of HTML elements.
3469
htmlAutoClose(ctxt, name);
3472
* Check for implied HTML elements.
3474
htmlCheckImplied(ctxt, name);
3477
* Avoid html at any level > 0, head at any level != 1
3478
* or any attempt to recurse body
3480
if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3481
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3482
"htmlParseStartTag: misplaced <html> tag\n",
3486
if ((ctxt->nameNr != 1) &&
3487
(xmlStrEqual(name, BAD_CAST"head"))) {
3488
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3489
"htmlParseStartTag: misplaced <head> tag\n",
3493
if (xmlStrEqual(name, BAD_CAST"body")) {
3495
for (indx = 0;indx < ctxt->nameNr;indx++) {
3496
if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3497
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3498
"htmlParseStartTag: misplaced <body> tag\n",
3506
* Now parse the attributes, it ends up with the ending
3511
while ((IS_CHAR_CH(CUR)) &&
3513
((CUR != '/') || (NXT(1) != '>'))) {
3514
long cons = ctxt->nbChars;
3517
attname = htmlParseAttribute(ctxt, &attvalue);
3518
if (attname != NULL) {
3521
* Well formedness requires at most one declaration of an attribute
3523
for (i = 0; i < nbatts;i += 2) {
3524
if (xmlStrEqual(atts[i], attname)) {
3525
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3526
"Attribute %s redefined\n", attname, NULL);
3527
if (attvalue != NULL)
3534
* Add the pair to atts
3537
maxatts = 22; /* allow for 10 attrs by default */
3538
atts = (const xmlChar **)
3539
xmlMalloc(maxatts * sizeof(xmlChar *));
3541
htmlErrMemory(ctxt, NULL);
3542
if (attvalue != NULL)
3547
ctxt->maxatts = maxatts;
3548
} else if (nbatts + 4 > maxatts) {
3552
n = (const xmlChar **) xmlRealloc((void *) atts,
3553
maxatts * sizeof(const xmlChar *));
3555
htmlErrMemory(ctxt, NULL);
3556
if (attvalue != NULL)
3562
ctxt->maxatts = maxatts;
3564
atts[nbatts++] = attname;
3565
atts[nbatts++] = attvalue;
3566
atts[nbatts] = NULL;
3567
atts[nbatts + 1] = NULL;
3570
if (attvalue != NULL)
3572
/* Dump the bogus attribute string up to the next blank or
3573
* the end of the tag. */
3574
while ((IS_CHAR_CH(CUR)) &&
3575
!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3576
((CUR != '/') || (NXT(1) != '>')))
3582
if (cons == ctxt->nbChars) {
3583
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3584
"htmlParseStartTag: problem parsing attributes\n",
3591
* Handle specific association to the META tag
3593
if (meta && (nbatts != 0))
3594
htmlCheckMeta(ctxt, atts);
3597
* SAX: Start of Element !
3600
htmlnamePush(ctxt, name);
3601
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3603
ctxt->sax->startElement(ctxt->userData, name, atts);
3605
ctxt->sax->startElement(ctxt->userData, name, NULL);
3610
for (i = 1;i < nbatts;i += 2) {
3611
if (atts[i] != NULL)
3612
xmlFree((xmlChar *) atts[i]);
3621
* @ctxt: an HTML parser context
3623
* parse an end of tag
3625
* [42] ETag ::= '</' Name S? '>'
3629
* [NS 9] ETag ::= '</' QName S? '>'
3631
* Returns 1 if the current level should be closed.
3635
htmlParseEndTag(htmlParserCtxtPtr ctxt)
3637
const xmlChar *name;
3638
const xmlChar *oldname;
3641
if ((CUR != '<') || (NXT(1) != '/')) {
3642
htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3643
"htmlParseEndTag: '</' not found\n", NULL, NULL);
3648
name = htmlParseHTMLName(ctxt);
3653
* We should definitely be at the ending "S? '>'" part
3656
if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3657
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3658
"End tag : expected '>'\n", NULL, NULL);
3659
if (ctxt->recovery) {
3661
* We're not at the ending > !!
3662
* Error, unless in recover mode where we search forwards
3665
while (CUR != '\0' && CUR != '>') NEXT;
3672
* If the name read is not one of the elements in the parsing stack
3673
* then return, it's just an error.
3675
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3676
if (xmlStrEqual(name, ctxt->nameTab[i]))
3680
htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3681
"Unexpected end tag : %s\n", name, NULL);
3687
* Check for auto-closure of HTML elements.
3690
htmlAutoCloseOnClose(ctxt, name);
3693
* Well formedness constraints, opening and closing must match.
3694
* With the exception that the autoclose may have popped stuff out
3697
if (!xmlStrEqual(name, ctxt->name)) {
3698
if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3699
htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3700
"Opening and ending tag mismatch: %s and %s\n",
3708
oldname = ctxt->name;
3709
if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3710
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3711
ctxt->sax->endElement(ctxt->userData, name);
3723
* htmlParseReference:
3724
* @ctxt: an HTML parser context
3726
* parse and handle entity references in content,
3727
* this will end up in a call to characters() since this is either a
3728
* CharRef, or a predefined entity.
3731
htmlParseReference(htmlParserCtxtPtr ctxt) {
3732
const htmlEntityDesc * ent;
3734
const xmlChar *name;
3735
if (CUR != '&') return;
3737
if (NXT(1) == '#') {
3741
c = htmlParseCharRef(ctxt);
3745
if (c < 0x80) { out[i++]= c; bits= -6; }
3746
else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3747
else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3748
else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3750
for ( ; bits >= 0; bits-= 6) {
3751
out[i++]= ((c >> bits) & 0x3F) | 0x80;
3755
htmlCheckParagraph(ctxt);
3756
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3757
ctxt->sax->characters(ctxt->userData, out, i);
3759
ent = htmlParseEntityRef(ctxt, &name);
3761
htmlCheckParagraph(ctxt);
3762
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3763
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3766
if ((ent == NULL) || !(ent->value > 0)) {
3767
htmlCheckParagraph(ctxt);
3768
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3769
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3770
ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3771
/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3779
{ out[i++]= c; bits= -6; }
3781
{ out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3782
else if (c < 0x10000)
3783
{ out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3785
{ out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3787
for ( ; bits >= 0; bits-= 6) {
3788
out[i++]= ((c >> bits) & 0x3F) | 0x80;
3792
htmlCheckParagraph(ctxt);
3793
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3794
ctxt->sax->characters(ctxt->userData, out, i);
3801
* @ctxt: an HTML parser context
3803
* Parse a content: comment, sub-element, reference or text.
3807
htmlParseContent(htmlParserCtxtPtr ctxt) {
3808
xmlChar *currentNode;
3810
const xmlChar *name;
3812
currentNode = xmlStrdup(ctxt->name);
3813
depth = ctxt->nameNr;
3815
long cons = ctxt->nbChars;
3819
* Our tag or one of its parents or children is ending.
3821
if ((CUR == '<') && (NXT(1) == '/')) {
3822
if (htmlParseEndTag(ctxt) &&
3823
((currentNode != NULL) || (ctxt->nameNr == 0))) {
3824
if (currentNode != NULL)
3825
xmlFree(currentNode);
3828
continue; /* while */
3831
else if ((CUR == '<') &&
3832
((IS_ASCII_LETTER(NXT(1))) ||
3833
(NXT(1) == '_') || (NXT(1) == ':'))) {
3834
name = htmlParseHTMLName_nonInvasive(ctxt);
3836
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3837
"htmlParseStartTag: invalid element name\n",
3839
/* Dump the bogus tag like browsers do */
3840
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3843
if (currentNode != NULL)
3844
xmlFree(currentNode);
3848
if (ctxt->name != NULL) {
3849
if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3850
htmlAutoClose(ctxt, name);
3857
* Has this node been popped out during parsing of
3860
if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3861
(!xmlStrEqual(currentNode, ctxt->name)))
3863
if (currentNode != NULL) xmlFree(currentNode);
3867
if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3868
(xmlStrEqual(currentNode, BAD_CAST"style")))) {
3870
* Handle SCRIPT/STYLE separately
3872
htmlParseScript(ctxt);
3875
* Sometimes DOCTYPE arrives in the middle of the document
3877
if ((CUR == '<') && (NXT(1) == '!') &&
3878
(UPP(2) == 'D') && (UPP(3) == 'O') &&
3879
(UPP(4) == 'C') && (UPP(5) == 'T') &&
3880
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3882
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3883
"Misplaced DOCTYPE declaration\n",
3884
BAD_CAST "DOCTYPE" , NULL);
3885
htmlParseDocTypeDecl(ctxt);
3889
* First case : a comment
3891
if ((CUR == '<') && (NXT(1) == '!') &&
3892
(NXT(2) == '-') && (NXT(3) == '-')) {
3893
htmlParseComment(ctxt);
3897
* Second case : a Processing Instruction.
3899
else if ((CUR == '<') && (NXT(1) == '?')) {
3904
* Third case : a sub-element.
3906
else if (CUR == '<') {
3907
htmlParseElement(ctxt);
3911
* Fourth case : a reference. If it has not been resolved,
3912
* parsing returns its Name, create the node
3914
else if (CUR == '&') {
3915
htmlParseReference(ctxt);
3919
* Fifth case : end of the resource
3921
else if (CUR == 0) {
3922
htmlAutoCloseOnEnd(ctxt);
3927
* Last case, text. Note that References are handled directly.
3930
htmlParseCharData(ctxt);
3933
if (cons == ctxt->nbChars) {
3934
if (ctxt->node != NULL) {
3935
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3936
"detected an error in element content\n",
3944
if (currentNode != NULL) xmlFree(currentNode);
3949
* @ctxt: an HTML parser context
3951
* Parse a content: comment, sub-element, reference or text.
3955
__htmlParseContent(void *ctxt) {
3957
htmlParseContent((htmlParserCtxtPtr) ctxt);
3962
* @ctxt: an HTML parser context
3964
* parse an HTML element, this is highly recursive
3966
* [39] element ::= EmptyElemTag | STag content ETag
3968
* [41] Attribute ::= Name Eq AttValue
3972
htmlParseElement(htmlParserCtxtPtr ctxt) {
3973
const xmlChar *name;
3974
xmlChar *currentNode = NULL;
3975
const htmlElemDesc * info;
3976
htmlParserNodeInfo node_info;
3979
const xmlChar *oldptr;
3981
if ((ctxt == NULL) || (ctxt->input == NULL)) {
3982
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3983
"htmlParseElement: context error\n", NULL, NULL);
3986
/* Capture start position */
3987
if (ctxt->record_info) {
3988
node_info.begin_pos = ctxt->input->consumed +
3989
(CUR_PTR - ctxt->input->base);
3990
node_info.begin_line = ctxt->input->line;
3993
failed = htmlParseStartTag(ctxt);
3995
if ((failed == -1) || (name == NULL)) {
4002
* Lookup the info for that element.
4004
info = htmlTagLookup(name);
4006
htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4007
"Tag %s invalid\n", name, NULL);
4011
* Check for an Empty Element labeled the XML/SGML way
4013
if ((CUR == '/') && (NXT(1) == '>')) {
4015
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4016
ctxt->sax->endElement(ctxt->userData, name);
4024
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4025
"Couldn't find end of Start Tag %s\n", name, NULL);
4028
* end of parsing of this node.
4030
if (xmlStrEqual(name, ctxt->name)) {
4036
* Capture end position and add node
4038
if (ctxt->record_info) {
4039
node_info.end_pos = ctxt->input->consumed +
4040
(CUR_PTR - ctxt->input->base);
4041
node_info.end_line = ctxt->input->line;
4042
node_info.node = ctxt->node;
4043
xmlParserAddNodeInfo(ctxt, &node_info);
4049
* Check for an Empty Element from DTD definition
4051
if ((info != NULL) && (info->empty)) {
4052
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4053
ctxt->sax->endElement(ctxt->userData, name);
4059
* Parse the content of the element:
4061
currentNode = xmlStrdup(ctxt->name);
4062
depth = ctxt->nameNr;
4063
while (IS_CHAR_CH(CUR)) {
4064
oldptr = ctxt->input->cur;
4065
htmlParseContent(ctxt);
4066
if (oldptr==ctxt->input->cur) break;
4067
if (ctxt->nameNr < depth) break;
4071
* Capture end position and add node
4073
if ( currentNode != NULL && ctxt->record_info ) {
4074
node_info.end_pos = ctxt->input->consumed +
4075
(CUR_PTR - ctxt->input->base);
4076
node_info.end_line = ctxt->input->line;
4077
node_info.node = ctxt->node;
4078
xmlParserAddNodeInfo(ctxt, &node_info);
4080
if (!IS_CHAR_CH(CUR)) {
4081
htmlAutoCloseOnEnd(ctxt);
4084
if (currentNode != NULL)
4085
xmlFree(currentNode);
4089
* htmlParseDocument:
4090
* @ctxt: an HTML parser context
4092
* parse an HTML document (and build a tree if using the standard SAX
4095
* Returns 0, -1 in case of error. The parser context is augmented
4096
* as a result of the parsing.
4100
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4105
htmlDefaultSAXHandlerInit();
4107
if ((ctxt == NULL) || (ctxt->input == NULL)) {
4108
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4109
"htmlParseDocument: context error\n", NULL, NULL);
4110
return(XML_ERR_INTERNAL_ERROR);
4115
* SAX: beginning of the document processing.
4117
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4118
ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4121
* Wipe out everything which is before the first '<'
4125
htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4126
"Document is empty\n", NULL, NULL);
4129
if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4130
ctxt->sax->startDocument(ctxt->userData);
4134
* Parse possible comments and PIs before any content
4136
while (((CUR == '<') && (NXT(1) == '!') &&
4137
(NXT(2) == '-') && (NXT(3) == '-')) ||
4138
((CUR == '<') && (NXT(1) == '?'))) {
4139
htmlParseComment(ctxt);
4146
* Then possibly doc type declaration(s) and more Misc
4147
* (doctypedecl Misc*)?
4149
if ((CUR == '<') && (NXT(1) == '!') &&
4150
(UPP(2) == 'D') && (UPP(3) == 'O') &&
4151
(UPP(4) == 'C') && (UPP(5) == 'T') &&
4152
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4154
htmlParseDocTypeDecl(ctxt);
4159
* Parse possible comments and PIs before any content
4161
while (((CUR == '<') && (NXT(1) == '!') &&
4162
(NXT(2) == '-') && (NXT(3) == '-')) ||
4163
((CUR == '<') && (NXT(1) == '?'))) {
4164
htmlParseComment(ctxt);
4170
* Time to start parsing the tree itself
4172
htmlParseContent(ctxt);
4178
htmlAutoCloseOnEnd(ctxt);
4182
* SAX: end of the document processing.
4184
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4185
ctxt->sax->endDocument(ctxt->userData);
4187
if (ctxt->myDoc != NULL) {
4188
dtd = xmlGetIntSubset(ctxt->myDoc);
4190
ctxt->myDoc->intSubset =
4191
xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4192
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4193
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4195
if (! ctxt->wellFormed) return(-1);
4200
/************************************************************************
4202
* Parser contexts handling *
4204
************************************************************************/
4207
* htmlInitParserCtxt:
4208
* @ctxt: an HTML parser context
4210
* Initialize a parser context
4212
* Returns 0 in case of success and -1 in case of error
4216
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4218
htmlSAXHandler *sax;
4220
if (ctxt == NULL) return(-1);
4221
memset(ctxt, 0, sizeof(htmlParserCtxt));
4223
ctxt->dict = xmlDictCreate();
4224
if (ctxt->dict == NULL) {
4225
htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4228
sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4230
htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4234
memset(sax, 0, sizeof(htmlSAXHandler));
4236
/* Allocate the Input stack */
4237
ctxt->inputTab = (htmlParserInputPtr *)
4238
xmlMalloc(5 * sizeof(htmlParserInputPtr));
4239
if (ctxt->inputTab == NULL) {
4240
htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4249
ctxt->version = NULL;
4250
ctxt->encoding = NULL;
4251
ctxt->standalone = -1;
4252
ctxt->instate = XML_PARSER_START;
4254
/* Allocate the Node stack */
4255
ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4256
if (ctxt->nodeTab == NULL) {
4257
htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4270
/* Allocate the Name stack */
4271
ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4272
if (ctxt->nameTab == NULL) {
4273
htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4289
if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4292
memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4294
ctxt->userData = ctxt;
4296
ctxt->wellFormed = 1;
4297
ctxt->replaceEntities = 0;
4298
ctxt->linenumbers = xmlLineNumbersDefaultValue;
4300
ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4301
ctxt->vctxt.userData = ctxt;
4302
ctxt->vctxt.error = xmlParserValidityError;
4303
ctxt->vctxt.warning = xmlParserValidityWarning;
4304
ctxt->record_info = 0;
4307
ctxt->checkIndex = 0;
4308
ctxt->catalogs = NULL;
4309
xmlInitNodeInfoSeq(&ctxt->node_seq);
4314
* htmlFreeParserCtxt:
4315
* @ctxt: an HTML parser context
4317
* Free all the memory used by a parser context. However the parsed
4318
* document in ctxt->myDoc is not freed.
4322
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4324
xmlFreeParserCtxt(ctxt);
4328
* htmlNewParserCtxt:
4330
* Allocate and initialize a new parser context.
4332
* Returns the htmlParserCtxtPtr or NULL in case of allocation error
4336
htmlNewParserCtxt(void)
4338
xmlParserCtxtPtr ctxt;
4340
ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4342
htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4345
memset(ctxt, 0, sizeof(xmlParserCtxt));
4346
if (htmlInitParserCtxt(ctxt) < 0) {
4347
htmlFreeParserCtxt(ctxt);
4354
* htmlCreateMemoryParserCtxt:
4355
* @buffer: a pointer to a char array
4356
* @size: the size of the array
4358
* Create a parser context for an HTML in-memory document.
4360
* Returns the new parser context or NULL
4363
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4364
xmlParserCtxtPtr ctxt;
4365
xmlParserInputPtr input;
4366
xmlParserInputBufferPtr buf;
4373
ctxt = htmlNewParserCtxt();
4377
buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4378
if (buf == NULL) return(NULL);
4380
input = xmlNewInputStream(ctxt);
4381
if (input == NULL) {
4382
xmlFreeParserCtxt(ctxt);
4386
input->filename = NULL;
4388
input->base = input->buf->buffer->content;
4389
input->cur = input->buf->buffer->content;
4390
input->end = &input->buf->buffer->content[input->buf->buffer->use];
4392
inputPush(ctxt, input);
4397
* htmlCreateDocParserCtxt:
4398
* @cur: a pointer to an array of xmlChar
4399
* @encoding: a free form C string describing the HTML document encoding, or NULL
4401
* Create a parser context for an HTML document.
4403
* TODO: check the need to add encoding handling there
4405
* Returns the new parser context or NULL
4407
static htmlParserCtxtPtr
4408
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4410
htmlParserCtxtPtr ctxt;
4414
len = xmlStrlen(cur);
4415
ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4419
if (encoding != NULL) {
4420
xmlCharEncoding enc;
4421
xmlCharEncodingHandlerPtr handler;
4423
if (ctxt->input->encoding != NULL)
4424
xmlFree((xmlChar *) ctxt->input->encoding);
4425
ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4427
enc = xmlParseCharEncoding(encoding);
4429
* registered set of known encodings
4431
if (enc != XML_CHAR_ENCODING_ERROR) {
4432
xmlSwitchEncoding(ctxt, enc);
4433
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4434
htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4435
"Unsupported encoding %s\n",
4436
(const xmlChar *) encoding, NULL);
4440
* fallback for unknown encodings
4442
handler = xmlFindCharEncodingHandler((const char *) encoding);
4443
if (handler != NULL) {
4444
xmlSwitchToEncoding(ctxt, handler);
4446
htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4447
"Unsupported encoding %s\n",
4448
(const xmlChar *) encoding, NULL);
4455
#ifdef LIBXML_PUSH_ENABLED
4456
/************************************************************************
4458
* Progressive parsing interfaces *
4460
************************************************************************/
4463
* htmlParseLookupSequence:
4464
* @ctxt: an HTML parser context
4465
* @first: the first char to lookup
4466
* @next: the next char to lookup or zero
4467
* @third: the next char to lookup or zero
4468
* @iscomment: flag to force checking inside comments
4470
* Try to find if a sequence (first, next, third) or just (first next) or
4471
* (first) is available in the input stream.
4472
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
4473
* to avoid rescanning sequences of bytes, it DOES change the state of the
4474
* parser, do not use liberally.
4475
* This is basically similar to xmlParseLookupSequence()
4477
* Returns the index to the current parsing point if the full sequence
4478
* is available, -1 otherwise.
4481
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4482
xmlChar next, xmlChar third, int iscomment) {
4484
htmlParserInputPtr in;
4489
if (in == NULL) return(-1);
4490
base = in->cur - in->base;
4491
if (base < 0) return(-1);
4492
if (ctxt->checkIndex > base)
4493
base = ctxt->checkIndex;
4494
if (in->buf == NULL) {
4498
buf = in->buf->buffer->content;
4499
len = in->buf->buffer->use;
4501
/* take into account the sequence length */
4502
if (third) len -= 2;
4503
else if (next) len --;
4504
for (;base < len;base++) {
4505
if (!incomment && (base + 4 < len) && !iscomment) {
4506
if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4507
(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4509
/* do not increment past <! - some people use <!--> */
4516
if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4517
(buf[base + 2] == '>')) {
4523
if (buf[base] == first) {
4525
if ((buf[base + 1] != next) ||
4526
(buf[base + 2] != third)) continue;
4527
} else if (next != 0) {
4528
if (buf[base + 1] != next) continue;
4530
ctxt->checkIndex = 0;
4533
xmlGenericError(xmlGenericErrorContext,
4534
"HPP: lookup '%c' found at %d\n",
4536
else if (third == 0)
4537
xmlGenericError(xmlGenericErrorContext,
4538
"HPP: lookup '%c%c' found at %d\n",
4541
xmlGenericError(xmlGenericErrorContext,
4542
"HPP: lookup '%c%c%c' found at %d\n",
4543
first, next, third, base);
4545
return(base - (in->cur - in->base));
4548
ctxt->checkIndex = base;
4551
xmlGenericError(xmlGenericErrorContext,
4552
"HPP: lookup '%c' failed\n", first);
4553
else if (third == 0)
4554
xmlGenericError(xmlGenericErrorContext,
4555
"HPP: lookup '%c%c' failed\n", first, next);
4557
xmlGenericError(xmlGenericErrorContext,
4558
"HPP: lookup '%c%c%c' failed\n", first, next, third);
4564
* htmlParseTryOrFinish:
4565
* @ctxt: an HTML parser context
4566
* @terminate: last chunk indicator
4568
* Try to progress on parsing
4570
* Returns zero if no parsing was possible
4573
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4575
htmlParserInputPtr in;
4580
switch (ctxt->instate) {
4581
case XML_PARSER_EOF:
4582
xmlGenericError(xmlGenericErrorContext,
4583
"HPP: try EOF\n"); break;
4584
case XML_PARSER_START:
4585
xmlGenericError(xmlGenericErrorContext,
4586
"HPP: try START\n"); break;
4587
case XML_PARSER_MISC:
4588
xmlGenericError(xmlGenericErrorContext,
4589
"HPP: try MISC\n");break;
4590
case XML_PARSER_COMMENT:
4591
xmlGenericError(xmlGenericErrorContext,
4592
"HPP: try COMMENT\n");break;
4593
case XML_PARSER_PROLOG:
4594
xmlGenericError(xmlGenericErrorContext,
4595
"HPP: try PROLOG\n");break;
4596
case XML_PARSER_START_TAG:
4597
xmlGenericError(xmlGenericErrorContext,
4598
"HPP: try START_TAG\n");break;
4599
case XML_PARSER_CONTENT:
4600
xmlGenericError(xmlGenericErrorContext,
4601
"HPP: try CONTENT\n");break;
4602
case XML_PARSER_CDATA_SECTION:
4603
xmlGenericError(xmlGenericErrorContext,
4604
"HPP: try CDATA_SECTION\n");break;
4605
case XML_PARSER_END_TAG:
4606
xmlGenericError(xmlGenericErrorContext,
4607
"HPP: try END_TAG\n");break;
4608
case XML_PARSER_ENTITY_DECL:
4609
xmlGenericError(xmlGenericErrorContext,
4610
"HPP: try ENTITY_DECL\n");break;
4611
case XML_PARSER_ENTITY_VALUE:
4612
xmlGenericError(xmlGenericErrorContext,
4613
"HPP: try ENTITY_VALUE\n");break;
4614
case XML_PARSER_ATTRIBUTE_VALUE:
4615
xmlGenericError(xmlGenericErrorContext,
4616
"HPP: try ATTRIBUTE_VALUE\n");break;
4617
case XML_PARSER_DTD:
4618
xmlGenericError(xmlGenericErrorContext,
4619
"HPP: try DTD\n");break;
4620
case XML_PARSER_EPILOG:
4621
xmlGenericError(xmlGenericErrorContext,
4622
"HPP: try EPILOG\n");break;
4624
xmlGenericError(xmlGenericErrorContext,
4625
"HPP: try PI\n");break;
4626
case XML_PARSER_SYSTEM_LITERAL:
4627
xmlGenericError(xmlGenericErrorContext,
4628
"HPP: try SYSTEM_LITERAL\n");break;
4635
if (in == NULL) break;
4636
if (in->buf == NULL)
4637
avail = in->length - (in->cur - in->base);
4639
avail = in->buf->buffer->use - (in->cur - in->base);
4640
if ((avail == 0) && (terminate)) {
4641
htmlAutoCloseOnEnd(ctxt);
4642
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4644
* SAX: end of the document processing.
4646
ctxt->instate = XML_PARSER_EOF;
4647
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4648
ctxt->sax->endDocument(ctxt->userData);
4659
switch (ctxt->instate) {
4660
case XML_PARSER_EOF:
4662
* Document parsing is done !
4665
case XML_PARSER_START:
4667
* Very first chars read from the document flow.
4670
if (IS_BLANK_CH(cur)) {
4672
if (in->buf == NULL)
4673
avail = in->length - (in->cur - in->base);
4675
avail = in->buf->buffer->use - (in->cur - in->base);
4677
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4678
ctxt->sax->setDocumentLocator(ctxt->userData,
4679
&xmlDefaultSAXLocator);
4680
if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4681
(!ctxt->disableSAX))
4682
ctxt->sax->startDocument(ctxt->userData);
4686
if ((cur == '<') && (next == '!') &&
4687
(UPP(2) == 'D') && (UPP(3) == 'O') &&
4688
(UPP(4) == 'C') && (UPP(5) == 'T') &&
4689
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4692
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4695
xmlGenericError(xmlGenericErrorContext,
4696
"HPP: Parsing internal subset\n");
4698
htmlParseDocTypeDecl(ctxt);
4699
ctxt->instate = XML_PARSER_PROLOG;
4701
xmlGenericError(xmlGenericErrorContext,
4702
"HPP: entering PROLOG\n");
4705
ctxt->instate = XML_PARSER_MISC;
4707
xmlGenericError(xmlGenericErrorContext,
4708
"HPP: entering MISC\n");
4712
case XML_PARSER_MISC:
4714
if (in->buf == NULL)
4715
avail = in->length - (in->cur - in->base);
4717
avail = in->buf->buffer->use - (in->cur - in->base);
4722
if ((cur == '<') && (next == '!') &&
4723
(in->cur[2] == '-') && (in->cur[3] == '-')) {
4725
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4728
xmlGenericError(xmlGenericErrorContext,
4729
"HPP: Parsing Comment\n");
4731
htmlParseComment(ctxt);
4732
ctxt->instate = XML_PARSER_MISC;
4733
} else if ((cur == '<') && (next == '?')) {
4735
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4738
xmlGenericError(xmlGenericErrorContext,
4739
"HPP: Parsing PI\n");
4742
ctxt->instate = XML_PARSER_MISC;
4743
} else if ((cur == '<') && (next == '!') &&
4744
(UPP(2) == 'D') && (UPP(3) == 'O') &&
4745
(UPP(4) == 'C') && (UPP(5) == 'T') &&
4746
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4749
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4752
xmlGenericError(xmlGenericErrorContext,
4753
"HPP: Parsing internal subset\n");
4755
htmlParseDocTypeDecl(ctxt);
4756
ctxt->instate = XML_PARSER_PROLOG;
4758
xmlGenericError(xmlGenericErrorContext,
4759
"HPP: entering PROLOG\n");
4761
} else if ((cur == '<') && (next == '!') &&
4765
ctxt->instate = XML_PARSER_START_TAG;
4767
xmlGenericError(xmlGenericErrorContext,
4768
"HPP: entering START_TAG\n");
4772
case XML_PARSER_PROLOG:
4774
if (in->buf == NULL)
4775
avail = in->length - (in->cur - in->base);
4777
avail = in->buf->buffer->use - (in->cur - in->base);
4782
if ((cur == '<') && (next == '!') &&
4783
(in->cur[2] == '-') && (in->cur[3] == '-')) {
4785
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4788
xmlGenericError(xmlGenericErrorContext,
4789
"HPP: Parsing Comment\n");
4791
htmlParseComment(ctxt);
4792
ctxt->instate = XML_PARSER_PROLOG;
4793
} else if ((cur == '<') && (next == '?')) {
4795
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4798
xmlGenericError(xmlGenericErrorContext,
4799
"HPP: Parsing PI\n");
4802
ctxt->instate = XML_PARSER_PROLOG;
4803
} else if ((cur == '<') && (next == '!') &&
4807
ctxt->instate = XML_PARSER_START_TAG;
4809
xmlGenericError(xmlGenericErrorContext,
4810
"HPP: entering START_TAG\n");
4814
case XML_PARSER_EPILOG:
4815
if (in->buf == NULL)
4816
avail = in->length - (in->cur - in->base);
4818
avail = in->buf->buffer->use - (in->cur - in->base);
4822
if (IS_BLANK_CH(cur)) {
4823
htmlParseCharData(ctxt);
4829
if ((cur == '<') && (next == '!') &&
4830
(in->cur[2] == '-') && (in->cur[3] == '-')) {
4832
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4835
xmlGenericError(xmlGenericErrorContext,
4836
"HPP: Parsing Comment\n");
4838
htmlParseComment(ctxt);
4839
ctxt->instate = XML_PARSER_EPILOG;
4840
} else if ((cur == '<') && (next == '?')) {
4842
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4845
xmlGenericError(xmlGenericErrorContext,
4846
"HPP: Parsing PI\n");
4849
ctxt->instate = XML_PARSER_EPILOG;
4850
} else if ((cur == '<') && (next == '!') &&
4854
ctxt->errNo = XML_ERR_DOCUMENT_END;
4855
ctxt->wellFormed = 0;
4856
ctxt->instate = XML_PARSER_EOF;
4858
xmlGenericError(xmlGenericErrorContext,
4859
"HPP: entering EOF\n");
4861
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4862
ctxt->sax->endDocument(ctxt->userData);
4866
case XML_PARSER_START_TAG: {
4867
const xmlChar *name;
4869
const htmlElemDesc * info;
4875
ctxt->instate = XML_PARSER_CONTENT;
4877
xmlGenericError(xmlGenericErrorContext,
4878
"HPP: entering CONTENT\n");
4882
if (in->cur[1] == '/') {
4883
ctxt->instate = XML_PARSER_END_TAG;
4884
ctxt->checkIndex = 0;
4886
xmlGenericError(xmlGenericErrorContext,
4887
"HPP: entering END_TAG\n");
4892
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4895
failed = htmlParseStartTag(ctxt);
4897
if ((failed == -1) ||
4905
* Lookup the info for that element.
4907
info = htmlTagLookup(name);
4909
htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4910
"Tag %s invalid\n", name, NULL);
4914
* Check for an Empty Element labeled the XML/SGML way
4916
if ((CUR == '/') && (NXT(1) == '>')) {
4918
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4919
ctxt->sax->endElement(ctxt->userData, name);
4921
ctxt->instate = XML_PARSER_CONTENT;
4923
xmlGenericError(xmlGenericErrorContext,
4924
"HPP: entering CONTENT\n");
4932
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4933
"Couldn't find end of Start Tag %s\n",
4937
* end of parsing of this node.
4939
if (xmlStrEqual(name, ctxt->name)) {
4944
ctxt->instate = XML_PARSER_CONTENT;
4946
xmlGenericError(xmlGenericErrorContext,
4947
"HPP: entering CONTENT\n");
4953
* Check for an Empty Element from DTD definition
4955
if ((info != NULL) && (info->empty)) {
4956
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4957
ctxt->sax->endElement(ctxt->userData, name);
4960
ctxt->instate = XML_PARSER_CONTENT;
4962
xmlGenericError(xmlGenericErrorContext,
4963
"HPP: entering CONTENT\n");
4967
case XML_PARSER_CONTENT: {
4970
* Handle preparsed entities and charRef
4972
if (ctxt->token != 0) {
4973
xmlChar chr[2] = { 0 , 0 } ;
4975
chr[0] = (xmlChar) ctxt->token;
4976
htmlCheckParagraph(ctxt);
4977
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4978
ctxt->sax->characters(ctxt->userData, chr, 1);
4980
ctxt->checkIndex = 0;
4982
if ((avail == 1) && (terminate)) {
4984
if ((cur != '<') && (cur != '&')) {
4985
if (ctxt->sax != NULL) {
4986
if (IS_BLANK_CH(cur)) {
4987
if (ctxt->sax->ignorableWhitespace != NULL)
4988
ctxt->sax->ignorableWhitespace(
4989
ctxt->userData, &cur, 1);
4991
htmlCheckParagraph(ctxt);
4992
if (ctxt->sax->characters != NULL)
4993
ctxt->sax->characters(
4994
ctxt->userData, &cur, 1);
4998
ctxt->checkIndex = 0;
5007
cons = ctxt->nbChars;
5008
if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5009
(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5011
* Handle SCRIPT/STYLE separately
5017
idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5020
val = in->cur[idx + 2];
5021
if (val == 0) /* bad cut of input */
5024
htmlParseScript(ctxt);
5025
if ((cur == '<') && (next == '/')) {
5026
ctxt->instate = XML_PARSER_END_TAG;
5027
ctxt->checkIndex = 0;
5029
xmlGenericError(xmlGenericErrorContext,
5030
"HPP: entering END_TAG\n");
5036
* Sometimes DOCTYPE arrives in the middle of the document
5038
if ((cur == '<') && (next == '!') &&
5039
(UPP(2) == 'D') && (UPP(3) == 'O') &&
5040
(UPP(4) == 'C') && (UPP(5) == 'T') &&
5041
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5044
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5046
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5047
"Misplaced DOCTYPE declaration\n",
5048
BAD_CAST "DOCTYPE" , NULL);
5049
htmlParseDocTypeDecl(ctxt);
5050
} else if ((cur == '<') && (next == '!') &&
5051
(in->cur[2] == '-') && (in->cur[3] == '-')) {
5053
(htmlParseLookupSequence(
5054
ctxt, '-', '-', '>', 1) < 0))
5057
xmlGenericError(xmlGenericErrorContext,
5058
"HPP: Parsing Comment\n");
5060
htmlParseComment(ctxt);
5061
ctxt->instate = XML_PARSER_CONTENT;
5062
} else if ((cur == '<') && (next == '?')) {
5064
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5067
xmlGenericError(xmlGenericErrorContext,
5068
"HPP: Parsing PI\n");
5071
ctxt->instate = XML_PARSER_CONTENT;
5072
} else if ((cur == '<') && (next == '!') && (avail < 4)) {
5074
} else if ((cur == '<') && (next == '/')) {
5075
ctxt->instate = XML_PARSER_END_TAG;
5076
ctxt->checkIndex = 0;
5078
xmlGenericError(xmlGenericErrorContext,
5079
"HPP: entering END_TAG\n");
5082
} else if (cur == '<') {
5083
ctxt->instate = XML_PARSER_START_TAG;
5084
ctxt->checkIndex = 0;
5086
xmlGenericError(xmlGenericErrorContext,
5087
"HPP: entering START_TAG\n");
5090
} else if (cur == '&') {
5092
(htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5095
xmlGenericError(xmlGenericErrorContext,
5096
"HPP: Parsing Reference\n");
5098
/* TODO: check generation of subtrees if noent !!! */
5099
htmlParseReference(ctxt);
5102
* check that the text sequence is complete
5103
* before handing out the data to the parser
5104
* to avoid problems with erroneous end of
5108
(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5110
ctxt->checkIndex = 0;
5112
xmlGenericError(xmlGenericErrorContext,
5113
"HPP: Parsing char data\n");
5115
htmlParseCharData(ctxt);
5118
if (cons == ctxt->nbChars) {
5119
if (ctxt->node != NULL) {
5120
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5121
"detected an error in element content\n",
5130
case XML_PARSER_END_TAG:
5134
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5136
htmlParseEndTag(ctxt);
5137
if (ctxt->nameNr == 0) {
5138
ctxt->instate = XML_PARSER_EPILOG;
5140
ctxt->instate = XML_PARSER_CONTENT;
5142
ctxt->checkIndex = 0;
5144
xmlGenericError(xmlGenericErrorContext,
5145
"HPP: entering CONTENT\n");
5148
case XML_PARSER_CDATA_SECTION:
5149
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5150
"HPP: internal error, state == CDATA\n",
5152
ctxt->instate = XML_PARSER_CONTENT;
5153
ctxt->checkIndex = 0;
5155
xmlGenericError(xmlGenericErrorContext,
5156
"HPP: entering CONTENT\n");
5159
case XML_PARSER_DTD:
5160
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5161
"HPP: internal error, state == DTD\n",
5163
ctxt->instate = XML_PARSER_CONTENT;
5164
ctxt->checkIndex = 0;
5166
xmlGenericError(xmlGenericErrorContext,
5167
"HPP: entering CONTENT\n");
5170
case XML_PARSER_COMMENT:
5171
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5172
"HPP: internal error, state == COMMENT\n",
5174
ctxt->instate = XML_PARSER_CONTENT;
5175
ctxt->checkIndex = 0;
5177
xmlGenericError(xmlGenericErrorContext,
5178
"HPP: entering CONTENT\n");
5182
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5183
"HPP: internal error, state == PI\n",
5185
ctxt->instate = XML_PARSER_CONTENT;
5186
ctxt->checkIndex = 0;
5188
xmlGenericError(xmlGenericErrorContext,
5189
"HPP: entering CONTENT\n");
5192
case XML_PARSER_ENTITY_DECL:
5193
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5194
"HPP: internal error, state == ENTITY_DECL\n",
5196
ctxt->instate = XML_PARSER_CONTENT;
5197
ctxt->checkIndex = 0;
5199
xmlGenericError(xmlGenericErrorContext,
5200
"HPP: entering CONTENT\n");
5203
case XML_PARSER_ENTITY_VALUE:
5204
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5205
"HPP: internal error, state == ENTITY_VALUE\n",
5207
ctxt->instate = XML_PARSER_CONTENT;
5208
ctxt->checkIndex = 0;
5210
xmlGenericError(xmlGenericErrorContext,
5211
"HPP: entering DTD\n");
5214
case XML_PARSER_ATTRIBUTE_VALUE:
5215
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5216
"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5218
ctxt->instate = XML_PARSER_START_TAG;
5219
ctxt->checkIndex = 0;
5221
xmlGenericError(xmlGenericErrorContext,
5222
"HPP: entering START_TAG\n");
5225
case XML_PARSER_SYSTEM_LITERAL:
5226
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5227
"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5229
ctxt->instate = XML_PARSER_CONTENT;
5230
ctxt->checkIndex = 0;
5232
xmlGenericError(xmlGenericErrorContext,
5233
"HPP: entering CONTENT\n");
5236
case XML_PARSER_IGNORE:
5237
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5238
"HPP: internal error, state == XML_PARSER_IGNORE\n",
5240
ctxt->instate = XML_PARSER_CONTENT;
5241
ctxt->checkIndex = 0;
5243
xmlGenericError(xmlGenericErrorContext,
5244
"HPP: entering CONTENT\n");
5247
case XML_PARSER_PUBLIC_LITERAL:
5248
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5249
"HPP: internal error, state == XML_PARSER_LITERAL\n",
5251
ctxt->instate = XML_PARSER_CONTENT;
5252
ctxt->checkIndex = 0;
5254
xmlGenericError(xmlGenericErrorContext,
5255
"HPP: entering CONTENT\n");
5262
if ((avail == 0) && (terminate)) {
5263
htmlAutoCloseOnEnd(ctxt);
5264
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5266
* SAX: end of the document processing.
5268
ctxt->instate = XML_PARSER_EOF;
5269
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5270
ctxt->sax->endDocument(ctxt->userData);
5273
if ((ctxt->myDoc != NULL) &&
5274
((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5275
(ctxt->instate == XML_PARSER_EPILOG))) {
5277
dtd = xmlGetIntSubset(ctxt->myDoc);
5279
ctxt->myDoc->intSubset =
5280
xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5281
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5282
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5285
xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5292
* @ctxt: an HTML parser context
5293
* @chunk: an char array
5294
* @size: the size in byte of the chunk
5295
* @terminate: last chunk indicator
5297
* Parse a Chunk of memory
5299
* Returns zero if no error, the xmlParserErrors otherwise.
5302
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5304
if ((ctxt == NULL) || (ctxt->input == NULL)) {
5305
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5306
"htmlParseChunk: context error\n", NULL, NULL);
5307
return(XML_ERR_INTERNAL_ERROR);
5309
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5310
(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5311
int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5312
int cur = ctxt->input->cur - ctxt->input->base;
5315
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5317
ctxt->errNo = XML_PARSER_EOF;
5318
ctxt->disableSAX = 1;
5319
return (XML_PARSER_EOF);
5321
ctxt->input->base = ctxt->input->buf->buffer->content + base;
5322
ctxt->input->cur = ctxt->input->base + cur;
5324
&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5326
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5330
if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5331
htmlParseTryOrFinish(ctxt, terminate);
5333
} else if (ctxt->instate != XML_PARSER_EOF) {
5334
if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5335
xmlParserInputBufferPtr in = ctxt->input->buf;
5336
if ((in->encoder != NULL) && (in->buffer != NULL) &&
5337
(in->raw != NULL)) {
5340
nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5342
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5343
"encoder error\n", NULL, NULL);
5344
return(XML_ERR_INVALID_ENCODING);
5349
htmlParseTryOrFinish(ctxt, terminate);
5351
if ((ctxt->instate != XML_PARSER_EOF) &&
5352
(ctxt->instate != XML_PARSER_EPILOG) &&
5353
(ctxt->instate != XML_PARSER_MISC)) {
5354
ctxt->errNo = XML_ERR_DOCUMENT_END;
5355
ctxt->wellFormed = 0;
5357
if (ctxt->instate != XML_PARSER_EOF) {
5358
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5359
ctxt->sax->endDocument(ctxt->userData);
5361
ctxt->instate = XML_PARSER_EOF;
5363
return((xmlParserErrors) ctxt->errNo);
5366
/************************************************************************
5368
* User entry points *
5370
************************************************************************/
5373
* htmlCreatePushParserCtxt:
5374
* @sax: a SAX handler
5375
* @user_data: The user data returned on SAX callbacks
5376
* @chunk: a pointer to an array of chars
5377
* @size: number of chars in the array
5378
* @filename: an optional file name or URI
5379
* @enc: an optional encoding
5381
* Create a parser context for using the HTML parser in push mode
5382
* The value of @filename is used for fetching external entities
5383
* and error/warning reports.
5385
* Returns the new parser context or NULL
5388
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5389
const char *chunk, int size, const char *filename,
5390
xmlCharEncoding enc) {
5391
htmlParserCtxtPtr ctxt;
5392
htmlParserInputPtr inputStream;
5393
xmlParserInputBufferPtr buf;
5397
buf = xmlAllocParserInputBuffer(enc);
5398
if (buf == NULL) return(NULL);
5400
ctxt = htmlNewParserCtxt();
5402
xmlFreeParserInputBuffer(buf);
5405
if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5406
ctxt->charset=XML_CHAR_ENCODING_UTF8;
5408
if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5410
ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5411
if (ctxt->sax == NULL) {
5416
memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5417
if (user_data != NULL)
5418
ctxt->userData = user_data;
5420
if (filename == NULL) {
5421
ctxt->directory = NULL;
5423
ctxt->directory = xmlParserGetDirectory(filename);
5426
inputStream = htmlNewInputStream(ctxt);
5427
if (inputStream == NULL) {
5428
xmlFreeParserCtxt(ctxt);
5433
if (filename == NULL)
5434
inputStream->filename = NULL;
5436
inputStream->filename = (char *)
5437
xmlCanonicPath((const xmlChar *) filename);
5438
inputStream->buf = buf;
5439
inputStream->base = inputStream->buf->buffer->content;
5440
inputStream->cur = inputStream->buf->buffer->content;
5442
&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5444
inputPush(ctxt, inputStream);
5446
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5447
(ctxt->input->buf != NULL)) {
5448
int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5449
int cur = ctxt->input->cur - ctxt->input->base;
5451
xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5453
ctxt->input->base = ctxt->input->buf->buffer->content + base;
5454
ctxt->input->cur = ctxt->input->base + cur;
5456
&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5458
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5461
ctxt->progressive = 1;
5465
#endif /* LIBXML_PUSH_ENABLED */
5469
* @cur: a pointer to an array of xmlChar
5470
* @encoding: a free form C string describing the HTML document encoding, or NULL
5471
* @sax: the SAX handler block
5472
* @userData: if using SAX, this pointer will be provided on callbacks.
5474
* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5475
* to handle parse events. If sax is NULL, fallback to the default DOM
5476
* behavior and return a tree.
5478
* Returns the resulting document tree unless SAX is NULL or the document is
5483
htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5485
htmlParserCtxtPtr ctxt;
5489
if (cur == NULL) return(NULL);
5492
ctxt = htmlCreateDocParserCtxt(cur, encoding);
5493
if (ctxt == NULL) return(NULL);
5495
if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5497
ctxt->userData = userData;
5500
htmlParseDocument(ctxt);
5504
ctxt->userData = NULL;
5506
htmlFreeParserCtxt(ctxt);
5513
* @cur: a pointer to an array of xmlChar
5514
* @encoding: a free form C string describing the HTML document encoding, or NULL
5516
* parse an HTML in-memory document and build a tree.
5518
* Returns the resulting document tree
5522
htmlParseDoc(xmlChar *cur, const char *encoding) {
5523
return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5528
* htmlCreateFileParserCtxt:
5529
* @filename: the filename
5530
* @encoding: a free form C string describing the HTML document encoding, or NULL
5532
* Create a parser context for a file content.
5533
* Automatic support for ZLIB/Compress compressed document is provided
5534
* by default if found at compile-time.
5536
* Returns the new parser context or NULL
5539
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5541
htmlParserCtxtPtr ctxt;
5542
htmlParserInputPtr inputStream;
5543
char *canonicFilename;
5544
/* htmlCharEncoding enc; */
5545
xmlChar *content, *content_line = (xmlChar *) "charset=";
5547
if (filename == NULL)
5550
ctxt = htmlNewParserCtxt();
5554
canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5555
if (canonicFilename == NULL) {
5556
#ifdef LIBXML_SAX1_ENABLED
5557
if (xmlDefaultSAXHandler.error != NULL) {
5558
xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5561
xmlFreeParserCtxt(ctxt);
5565
inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5566
xmlFree(canonicFilename);
5567
if (inputStream == NULL) {
5568
xmlFreeParserCtxt(ctxt);
5572
inputPush(ctxt, inputStream);
5576
content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5578
strcpy ((char *)content, (char *)content_line);
5579
strcat ((char *)content, (char *)encoding);
5580
htmlCheckEncoding (ctxt, content);
5590
* @filename: the filename
5591
* @encoding: a free form C string describing the HTML document encoding, or NULL
5592
* @sax: the SAX handler block
5593
* @userData: if using SAX, this pointer will be provided on callbacks.
5595
* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5596
* compressed document is provided by default if found at compile-time.
5597
* It use the given SAX function block to handle the parsing callback.
5598
* If sax is NULL, fallback to the default DOM tree building routines.
5600
* Returns the resulting document tree unless SAX is NULL or the document is
5605
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5608
htmlParserCtxtPtr ctxt;
5609
htmlSAXHandlerPtr oldsax = NULL;
5613
ctxt = htmlCreateFileParserCtxt(filename, encoding);
5614
if (ctxt == NULL) return(NULL);
5618
ctxt->userData = userData;
5621
htmlParseDocument(ctxt);
5626
ctxt->userData = NULL;
5628
htmlFreeParserCtxt(ctxt);
5635
* @filename: the filename
5636
* @encoding: a free form C string describing the HTML document encoding, or NULL
5638
* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5639
* compressed document is provided by default if found at compile-time.
5641
* Returns the resulting document tree
5645
htmlParseFile(const char *filename, const char *encoding) {
5646
return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5650
* htmlHandleOmittedElem:
5653
* Set and return the previous value for handling HTML omitted tags.
5655
* Returns the last value for 0 for no handling, 1 for auto insertion.
5659
htmlHandleOmittedElem(int val) {
5660
int old = htmlOmittedDefaultValue;
5662
htmlOmittedDefaultValue = val;
5667
* htmlElementAllowedHere:
5668
* @parent: HTML parent element
5669
* @elt: HTML element
5671
* Checks whether an HTML element may be a direct child of a parent element.
5672
* Note - doesn't check for deprecated elements
5674
* Returns 1 if allowed; 0 otherwise.
5677
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5680
if ( ! elt || ! parent || ! parent->subelts )
5683
for ( p = parent->subelts; *p; ++p )
5684
if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5690
* htmlElementStatusHere:
5691
* @parent: HTML parent element
5692
* @elt: HTML element
5694
* Checks whether an HTML element may be a direct child of a parent element.
5695
* and if so whether it is valid or deprecated.
5697
* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5700
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5701
if ( ! parent || ! elt )
5702
return HTML_INVALID ;
5703
if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5704
return HTML_INVALID ;
5706
return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5710
* @elt: HTML element
5711
* @attr: HTML attribute
5712
* @legacy: whether to allow deprecated attributes
5714
* Checks whether an attribute is valid for an element
5715
* Has full knowledge of Required and Deprecated attributes
5717
* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5720
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5723
if ( !elt || ! attr )
5724
return HTML_INVALID ;
5726
if ( elt->attrs_req )
5727
for ( p = elt->attrs_req; *p; ++p)
5728
if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5729
return HTML_REQUIRED ;
5731
if ( elt->attrs_opt )
5732
for ( p = elt->attrs_opt; *p; ++p)
5733
if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5736
if ( legacy && elt->attrs_depr )
5737
for ( p = elt->attrs_depr; *p; ++p)
5738
if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5739
return HTML_DEPRECATED ;
5741
return HTML_INVALID ;
5745
* @node: an htmlNodePtr in a tree
5746
* @legacy: whether to allow deprecated elements (YES is faster here
5747
* for Element nodes)
5749
* Checks whether the tree node is valid. Experimental (the author
5750
* only uses the HTML enhancements in a SAX parser)
5752
* Return: for Element nodes, a return from htmlElementAllowedHere (if
5753
* legacy allowed) or htmlElementStatusHere (otherwise).
5754
* for Attribute nodes, a return from htmlAttrAllowed
5755
* for other nodes, HTML_NA (no checks performed)
5758
htmlNodeStatus(const htmlNodePtr node, int legacy) {
5760
return HTML_INVALID ;
5762
switch ( node->type ) {
5763
case XML_ELEMENT_NODE:
5765
? ( htmlElementAllowedHere (
5766
htmlTagLookup(node->parent->name) , node->name
5767
) ? HTML_VALID : HTML_INVALID )
5768
: htmlElementStatusHere(
5769
htmlTagLookup(node->parent->name) ,
5770
htmlTagLookup(node->name) )
5772
case XML_ATTRIBUTE_NODE:
5773
return htmlAttrAllowed(
5774
htmlTagLookup(node->parent->name) , node->name, legacy) ;
5775
default: return HTML_NA ;
5778
/************************************************************************
5780
* New set (2.6.0) of simpler and more flexible APIs *
5782
************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. Expects a variable named dict to be visible where the
 * macro is expanded. Wrapped in do { } while (0) so that it behaves as a
 * single statement, including in front of an else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
		(xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
		xmlFree((char *)(str));				\
	} while (0)
5797
* @ctxt: an HTML parser context
5799
* Reset a parser context
5802
htmlCtxtReset(htmlParserCtxtPtr ctxt)
5804
xmlParserInputPtr input;
5813
while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5814
xmlFreeInputStream(input);
5820
if (ctxt->spaceTab != NULL) {
5821
ctxt->spaceTab[0] = -1;
5822
ctxt->space = &ctxt->spaceTab[0];
5834
DICT_FREE(ctxt->version);
5835
ctxt->version = NULL;
5836
DICT_FREE(ctxt->encoding);
5837
ctxt->encoding = NULL;
5838
DICT_FREE(ctxt->directory);
5839
ctxt->directory = NULL;
5840
DICT_FREE(ctxt->extSubURI);
5841
ctxt->extSubURI = NULL;
5842
DICT_FREE(ctxt->extSubSystem);
5843
ctxt->extSubSystem = NULL;
5844
if (ctxt->myDoc != NULL)
5845
xmlFreeDoc(ctxt->myDoc);
5848
ctxt->standalone = -1;
5849
ctxt->hasExternalSubset = 0;
5850
ctxt->hasPErefs = 0;
5853
ctxt->instate = XML_PARSER_START;
5856
ctxt->wellFormed = 1;
5857
ctxt->nsWellFormed = 1;
5859
ctxt->vctxt.userData = ctxt;
5860
ctxt->vctxt.error = xmlParserValidityError;
5861
ctxt->vctxt.warning = xmlParserValidityWarning;
5862
ctxt->record_info = 0;
5864
ctxt->checkIndex = 0;
5866
ctxt->errNo = XML_ERR_OK;
5868
ctxt->charset = XML_CHAR_ENCODING_NONE;
5869
ctxt->catalogs = NULL;
5870
xmlInitNodeInfoSeq(&ctxt->node_seq);
5872
if (ctxt->attsDefault != NULL) {
5873
xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5874
ctxt->attsDefault = NULL;
5876
if (ctxt->attsSpecial != NULL) {
5877
xmlHashFree(ctxt->attsSpecial, NULL);
5878
ctxt->attsSpecial = NULL;
5883
* htmlCtxtUseOptions:
5884
* @ctxt: an HTML parser context
5885
* @options: a combination of htmlParserOption(s)
5887
* Applies the options to the parser context
5889
* Returns 0 in case of success, the set of unknown or unimplemented options
5893
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5898
if (options & HTML_PARSE_NOWARNING) {
5899
ctxt->sax->warning = NULL;
5900
ctxt->vctxt.warning = NULL;
5901
options -= XML_PARSE_NOWARNING;
5902
ctxt->options |= XML_PARSE_NOWARNING;
5904
if (options & HTML_PARSE_NOERROR) {
5905
ctxt->sax->error = NULL;
5906
ctxt->vctxt.error = NULL;
5907
ctxt->sax->fatalError = NULL;
5908
options -= XML_PARSE_NOERROR;
5909
ctxt->options |= XML_PARSE_NOERROR;
5911
if (options & HTML_PARSE_PEDANTIC) {
5913
options -= XML_PARSE_PEDANTIC;
5914
ctxt->options |= XML_PARSE_PEDANTIC;
5917
if (options & XML_PARSE_NOBLANKS) {
5918
ctxt->keepBlanks = 0;
5919
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5920
options -= XML_PARSE_NOBLANKS;
5921
ctxt->options |= XML_PARSE_NOBLANKS;
5923
ctxt->keepBlanks = 1;
5924
if (options & HTML_PARSE_RECOVER) {
5926
options -= HTML_PARSE_RECOVER;
5929
if (options & HTML_PARSE_COMPACT) {
5930
ctxt->options |= HTML_PARSE_COMPACT;
5931
options -= HTML_PARSE_COMPACT;
5933
ctxt->dictNames = 0;
5939
* @ctxt: an HTML parser context
5940
* @URL: the base URL to use for the document
5941
* @encoding: the document encoding, or NULL
5942
* @options: a combination of htmlParserOption(s)
5943
* @reuse: keep the context for reuse
5945
* Common front-end for the htmlRead functions
5947
* Returns the resulting document tree or NULL
5950
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5951
int options, int reuse)
5955
htmlCtxtUseOptions(ctxt, options);
5957
if (encoding != NULL) {
5958
xmlCharEncodingHandlerPtr hdlr;
5960
hdlr = xmlFindCharEncodingHandler(encoding);
5962
xmlSwitchToEncoding(ctxt, hdlr);
5964
if ((URL != NULL) && (ctxt->input != NULL) &&
5965
(ctxt->input->filename == NULL))
5966
ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5967
htmlParseDocument(ctxt);
5971
if ((ctxt->dictNames) &&
5973
(ret->dict == ctxt->dict))
5975
xmlFreeParserCtxt(ctxt);
5982
* @cur: a pointer to a zero terminated string
5983
* @URL: the base URL to use for the document
5984
* @encoding: the document encoding, or NULL
5985
* @options: a combination of htmlParserOption(s)
5987
* parse an XML in-memory document and build a tree.
5989
* Returns the resulting document tree
5992
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5994
htmlParserCtxtPtr ctxt;
6000
ctxt = htmlCreateDocParserCtxt(cur, NULL);
6003
return (htmlDoRead(ctxt, URL, encoding, options, 0));
6008
* @filename: a file or URL
6009
* @encoding: the document encoding, or NULL
6010
* @options: a combination of htmlParserOption(s)
6012
* parse an XML file from the filesystem or the network.
6014
* Returns the resulting document tree
6017
htmlReadFile(const char *filename, const char *encoding, int options)
6019
htmlParserCtxtPtr ctxt;
6022
ctxt = htmlCreateFileParserCtxt(filename, encoding);
6025
return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6030
* @buffer: a pointer to a char array
6031
* @size: the size of the array
6032
* @URL: the base URL to use for the document
6033
* @encoding: the document encoding, or NULL
6034
* @options: a combination of htmlParserOption(s)
6036
* parse an XML in-memory document and build a tree.
6038
* Returns the resulting document tree
6041
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6043
htmlParserCtxtPtr ctxt;
6046
ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6049
htmlDefaultSAXHandlerInit();
6050
if (ctxt->sax != NULL)
6051
memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6052
return (htmlDoRead(ctxt, URL, encoding, options, 0));
6057
* @fd: an open file descriptor
6058
* @URL: the base URL to use for the document
6059
* @encoding: the document encoding, or NULL
6060
* @options: a combination of htmlParserOption(s)
6062
* parse an XML from a file descriptor and build a tree.
6064
* Returns the resulting document tree
6067
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6069
htmlParserCtxtPtr ctxt;
6070
xmlParserInputBufferPtr input;
6071
xmlParserInputPtr stream;
6077
input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6080
ctxt = xmlNewParserCtxt();
6082
xmlFreeParserInputBuffer(input);
6085
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6086
if (stream == NULL) {
6087
xmlFreeParserInputBuffer(input);
6088
xmlFreeParserCtxt(ctxt);
6091
inputPush(ctxt, stream);
6092
return (htmlDoRead(ctxt, URL, encoding, options, 0));
6097
* @ioread: an I/O read function
6098
* @ioclose: an I/O close function
6099
* @ioctx: an I/O handler
6100
* @URL: the base URL to use for the document
6101
* @encoding: the document encoding, or NULL
6102
* @options: a combination of htmlParserOption(s)
6104
* parse an HTML document from I/O functions and source and build a tree.
6106
* Returns the resulting document tree
6109
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6110
void *ioctx, const char *URL, const char *encoding, int options)
6112
htmlParserCtxtPtr ctxt;
6113
xmlParserInputBufferPtr input;
6114
xmlParserInputPtr stream;
6120
input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6121
XML_CHAR_ENCODING_NONE);
6124
ctxt = htmlNewParserCtxt();
6126
xmlFreeParserInputBuffer(input);
6129
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6130
if (stream == NULL) {
6131
xmlFreeParserInputBuffer(input);
6132
xmlFreeParserCtxt(ctxt);
6135
inputPush(ctxt, stream);
6136
return (htmlDoRead(ctxt, URL, encoding, options, 0));
6141
* @ctxt: an HTML parser context
6142
* @cur: a pointer to a zero terminated string
6143
* @URL: the base URL to use for the document
6144
* @encoding: the document encoding, or NULL
6145
* @options: a combination of htmlParserOption(s)
6147
* parse an XML in-memory document and build a tree.
6148
* This reuses the existing @ctxt parser context
6150
* Returns the resulting document tree
6153
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6154
const char *URL, const char *encoding, int options)
6156
xmlParserInputPtr stream;
6163
htmlCtxtReset(ctxt);
6165
stream = xmlNewStringInputStream(ctxt, cur);
6166
if (stream == NULL) {
6169
inputPush(ctxt, stream);
6170
return (htmlDoRead(ctxt, URL, encoding, options, 1));
6175
* @ctxt: an HTML parser context
6176
* @filename: a file or URL
6177
* @encoding: the document encoding, or NULL
6178
* @options: a combination of htmlParserOption(s)
6180
* parse an XML file from the filesystem or the network.
6181
* This reuses the existing @ctxt parser context
6183
* Returns the resulting document tree
6186
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6187
const char *encoding, int options)
6189
xmlParserInputPtr stream;
6191
if (filename == NULL)
6196
htmlCtxtReset(ctxt);
6198
stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6199
if (stream == NULL) {
6202
inputPush(ctxt, stream);
6203
return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6207
* htmlCtxtReadMemory:
6208
* @ctxt: an HTML parser context
6209
* @buffer: a pointer to a char array
6210
* @size: the size of the array
6211
* @URL: the base URL to use for the document
6212
* @encoding: the document encoding, or NULL
6213
* @options: a combination of htmlParserOption(s)
6215
* parse an XML in-memory document and build a tree.
6216
* This reuses the existing @ctxt parser context
6218
* Returns the resulting document tree
6221
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6222
const char *URL, const char *encoding, int options)
6224
xmlParserInputBufferPtr input;
6225
xmlParserInputPtr stream;
6232
htmlCtxtReset(ctxt);
6234
input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6235
if (input == NULL) {
6239
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6240
if (stream == NULL) {
6241
xmlFreeParserInputBuffer(input);
6245
inputPush(ctxt, stream);
6246
return (htmlDoRead(ctxt, URL, encoding, options, 1));
6251
* @ctxt: an HTML parser context
6252
* @fd: an open file descriptor
6253
* @URL: the base URL to use for the document
6254
* @encoding: the document encoding, or NULL
6255
* @options: a combination of htmlParserOption(s)
6257
* parse an XML from a file descriptor and build a tree.
6258
* This reuses the existing @ctxt parser context
6260
* Returns the resulting document tree
6263
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6264
const char *URL, const char *encoding, int options)
6266
xmlParserInputBufferPtr input;
6267
xmlParserInputPtr stream;
6274
htmlCtxtReset(ctxt);
6277
input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6280
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6281
if (stream == NULL) {
6282
xmlFreeParserInputBuffer(input);
6285
inputPush(ctxt, stream);
6286
return (htmlDoRead(ctxt, URL, encoding, options, 1));
6291
* @ctxt: an HTML parser context
6292
* @ioread: an I/O read function
6293
* @ioclose: an I/O close function
6294
* @ioctx: an I/O handler
6295
* @URL: the base URL to use for the document
6296
* @encoding: the document encoding, or NULL
6297
* @options: a combination of htmlParserOption(s)
6299
* parse an HTML document from I/O functions and source and build a tree.
6300
* This reuses the existing @ctxt parser context
6302
* Returns the resulting document tree
6305
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6306
xmlInputCloseCallback ioclose, void *ioctx,
6308
const char *encoding, int options)
6310
xmlParserInputBufferPtr input;
6311
xmlParserInputPtr stream;
6318
htmlCtxtReset(ctxt);
6320
input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6321
XML_CHAR_ENCODING_NONE);
6324
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6325
if (stream == NULL) {
6326
xmlFreeParserInputBuffer(input);
6329
inputPush(ctxt, stream);
6330
return (htmlDoRead(ctxt, URL, encoding, options, 1));
6333
#define bottom_HTMLparser
6334
#include "elfgcchack.h"
6335
#endif /* LIBXML_HTML_ENABLED */