~ubuntu-branches/ubuntu/wily/opencollada/wily-proposed

/*
 * Element-name lists used to build the content-model arrays below.
 * Each NB_<name> macro is the number of strings <name> expands to.
 * The macros expand lazily, so HEADING and LIST may be defined after BLOCK.
 */
/* A comma is required after LIST, otherwise "menu" "pre" would be
 * concatenated into the single bogus name "menupre". */
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL

498

499

500

static const char* const html_flow[] = { FLOW, NULL } ;

501

static const char* const html_inline[] = { INLINE, NULL } ;

502

503

/* placeholders: elts with content but no subelements */

504

static const char* const html_pcdata[] = { NULL } ;

505

#define html_cdata html_pcdata

506

507

508

/* ... and for HTML Attributes */

/*
 * Attribute-name lists; each NB_<name> macro is the number of strings
 * <name> expands to.
 * NOTE(review): EVENTS has no "onmousemove" although htmlScriptAttributes
 * lists it - confirm whether the omission is intentional.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute lists shared by many elements. */
static const char *const html_attrs[] = { ATTRS, NULL };
static const char *const core_i18n_attrs[] = { COREATTRS, I18N, NULL };
static const char *const core_attrs[] = { COREATTRS, NULL };
static const char *const i18n_attrs[] = { I18N, NULL };

527

528

529

/* Other declarations that should go inline ... */

530

static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",

531

"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",

532

"tabindex", "onfocus", "onblur", NULL } ;

533

static const char* const target_attr[] = { "target", NULL } ;

534

static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;

535

static const char* const alt_attr[] = { "alt", NULL } ;

536

static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;

537

static const char* const href_attrs[] = { "href", NULL } ;

538

static const char* const clear_attrs[] = { "clear", NULL } ;

539

static const char* const inline_p[] = { INLINE, "p", NULL } ;

540

541

static const char* const flow_param[] = { FLOW, "param", NULL } ;

542

static const char* const applet_attrs[] = { COREATTRS , "codebase",

543

"archive", "alt", "name", "height", "width", "align",

544

"hspace", "vspace", NULL } ;

545

static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",

546

"tabindex", "accesskey", "onfocus", "onblur", NULL } ;

547

static const char* const basefont_attrs[] =

548

{ "id", "size", "color", "face", NULL } ;

549

static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;

550

static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;

551

static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;

552

static const char* const body_depr[] = { "background", "bgcolor", "text",

553

"link", "vlink", "alink", NULL } ;

554

static const char* const button_attrs[] = { ATTRS, "name", "value", "type",

555

"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;

556

557

558

static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;

559

static const char* const col_elt[] = { "col", NULL } ;

560

static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;

561

static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;

562

static const char* const dl_contents[] = { "dt", "dd", NULL } ;

563

static const char* const compact_attr[] = { "compact", NULL } ;

564

static const char* const label_attr[] = { "label", NULL } ;

565

/* Allowed children of <fieldset>; the list must end with NULL like every other content list. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;

566

static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;

567

static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;

568

static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;

569

static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;

570

static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;

571

static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;

572

static const char* const head_attrs[] = { I18N, "profile", NULL } ;

573

static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;

574

static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;

575

static const char* const version_attr[] = { "version", NULL } ;

576

static const char* const html_content[] = { "head", "body", "frameset", NULL } ;

577

static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;

578

static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;

579

static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;

580

static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;

581

static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;

582

static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;

583

static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;

584

static const char* const align_attr[] = { "align", NULL } ;

585

static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;

586

static const char* const map_contents[] = { BLOCK, "area", NULL } ;

587

static const char* const name_attr[] = { "name", NULL } ;

588

static const char* const action_attr[] = { "action", NULL } ;

589

static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;

590

static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;

591

static const char* const content_attr[] = { "content", NULL } ;

592

static const char* const type_attr[] = { "type", NULL } ;

593

static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;

594

static const char* const object_contents[] = { FLOW, "param", NULL } ;

595

static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;

596

static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;

597

static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;

598

static const char* const option_elt[] = { "option", NULL } ;

599

static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;

600

static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;

601

static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;

602

static const char* const width_attr[] = { "width", NULL } ;

603

static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;

604

static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;

605

static const char* const language_attr[] = { "language", NULL } ;

606

static const char* const select_content[] = { "optgroup", "option", NULL } ;

607

static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;

608

static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;

609

/* Optional <table> attributes. The comma after ATTRS matters: without it the
 * last ATTRS string and "summary" are pasted into one bogus attribute name. */
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;

610

static const char* const table_depr[] = { "align", "bgcolor", NULL } ;

611

static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;

612

static const char* const tr_elt[] = { "tr", NULL } ;

613

static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;

614

static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;

615

static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;

616

static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;

617

static const char* const tr_contents[] = { "th", "td", NULL } ;

618

static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;

619

static const char* const li_elt[] = { "li", NULL } ;

620

static const char* const ul_depr[] = { "type", "compact", NULL} ;

621

static const char* const dir_attr[] = { "dir", NULL} ;

622

623

#define DECL (const char**)

624

625

static const htmlElemDesc

626

html40ElementTable[] = {

627

{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",

628

DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL

629

630

{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",

631

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

632

633

{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",

634

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

635

636

{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",

637

DECL inline_p , NULL , DECL html_attrs, NULL, NULL

638

639

{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",

640

DECL flow_param , NULL , NULL , DECL applet_attrs, NULL

641

642

{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",

643

EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr

644

645

{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",

646

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

647

648

{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",

649

EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs

650

651

{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,

652

EMPTY , NULL , NULL, DECL basefont_attrs, NULL

653

654

{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",

655

DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr

656

657

{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",

658

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

659

660

{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",

661

DECL html_flow , NULL , DECL quote_attrs , NULL, NULL

662

663

{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",

664

DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL

665

666

{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",

667

EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL

668

669

{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",

670

DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL

671

672

{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",

673

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

674

675

{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",

676

DECL html_flow , NULL , NULL, DECL html_attrs, NULL

677

678

{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",

679

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

680

681

{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",

682

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

683

684

{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",

685

EMPTY , NULL , DECL col_attrs , NULL, NULL

686

687

{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",

688

DECL col_elt , "col" , DECL col_attrs , NULL, NULL

689

690

{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",

691

DECL html_flow , NULL , DECL html_attrs, NULL, NULL

692

693

{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",

694

DECL html_flow , NULL , DECL edit_attrs , NULL, NULL

695

696

{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",

697

DECL html_inline , NULL , DECL html_attrs, NULL, NULL

698

699

{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",

700

DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL

701

702

{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",

703

DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL

704

705

{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",

706

DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL

707

708

{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",

709

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

710

711

{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",

712

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

713

714

{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",

715

EMPTY, NULL, DECL embed_attrs, NULL, NULL

716

717

{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",

718

DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL

719

720

{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",

721

DECL html_inline, NULL, NULL, DECL font_attrs, NULL

722

723

{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",

724

DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr

725

726

{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,

727

EMPTY, NULL, NULL, DECL frame_attrs, NULL

728

729

{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,

730

DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL

731

732

{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",

733

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

734

735

{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",

736

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

737

738

{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",

739

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

740

741

{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",

742

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

743

744

{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",

745

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

746

747

{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",

748

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

749

750

{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",

751

DECL head_contents, NULL, DECL head_attrs, NULL, NULL

752

753

{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,

754

EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL

755

756

{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",

757

DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL

758

759

{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",

760

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

761

762

{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",

763

DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL

764

765

{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",

766

EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs

767

768

{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",

769

EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL

770

771

{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",

772

DECL html_flow, NULL, DECL edit_attrs, NULL, NULL

773

774

{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",

775

EMPTY, NULL, NULL, DECL prompt_attrs, NULL

776

777

{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",

778

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

779

780

{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",

781

DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL

782

783

{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",

784

DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL

785

786

{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",

787

DECL html_flow, NULL, DECL html_attrs, NULL, NULL

788

789

{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",

790

EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL

791

792

{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",

793

DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr

794

795

{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",

796

DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL

797

798

{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",

799

EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr

800

801

{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",

802

DECL noframes_content, "body" , DECL html_attrs, NULL, NULL

803

804

{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",

805

DECL html_flow, "div", DECL html_attrs, NULL, NULL

806

807

{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",

808

DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL

809

810

{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",

811

DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL

812

813

{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",

814

DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr

815

816

{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,

817

DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL

818

819

{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",

820

DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL

821

822

{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",

823

EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr

824

825

{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",

826

DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL

827

828

{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",

829

DECL html_inline, NULL, DECL quote_attrs, NULL, NULL

830

831

{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",

832

DECL html_inline, NULL, NULL, DECL html_attrs, NULL

833

834

{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",

835

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

836

837

{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",

838

DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr

839

840

{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",

841

DECL select_content, NULL, DECL select_attrs, NULL, NULL

842

843

{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",

844

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

845

846

{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",

847

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

848

849

{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",

850

DECL html_inline, NULL, NULL, DECL html_attrs, NULL

851

852

{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",

853

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

854

855

{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",

856

DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr

857

858

{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",

859

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

860

861

{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",

862

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

863

864

{ "table", 0, 0, 0, 0, 0, 0, 0, "",

865

DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL

866

867

{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",

868

DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL

869

870

{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",

871

DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL

872

873

{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",

874

DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr

875

876

{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",

877

DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL

878

879

{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",

880

DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL

881

882

{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",

883

DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL

884

885

{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",

886

DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL

887

888

{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",

889

DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL

890

891

{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",

892

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

893

894

{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",

895

DECL html_inline, NULL, NULL, DECL html_attrs, NULL

896

897

{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",

898

DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL

899

900

{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",

901

DECL html_inline, NULL, DECL html_attrs, NULL, NULL

902

}

903

};

904

905

906

/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * a group is the new start tag; the remaining strings are the open
 * elements that tag closes. An extra NULL marks the end of the table.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};

965

966

967

/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};

978

979

980

/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; its length must be taken with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so the real attribute never matched */
    "onchange",
    "onselect"
};

1004

1005

1006

/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One element name and the priority of its end tag. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The final entry has a NULL name: it ends the table and carries the
 * priority returned for any element not listed above it.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};

1032

1033

/*
 * Lookup index over htmlStartClose: each non-NULL slot points at the first
 * string of one NULL-terminated group. Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;

1035

1036

/************************************************************************

1037

* *

1038

* functions to handle HTML specific data *

1039

* *

1040

************************************************************************/

1041

1042

/**

1043

* htmlInitAutoClose:

1044

1045

* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.

1046

* This is not reentrant. Call xmlInitParser() once before processing in

1047

* case of use in multithreaded programs.

1048

1049

void

1050

htmlInitAutoClose(void) {

1051

int indx, i = 0;

1052

1053

if (htmlStartCloseIndexinitialized) return;

1054

1055

for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;

1056

indx = 0;

1057

while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {

1058

htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];

1059

while (htmlStartClose[i] != NULL) i++;

1060

i++;

1061

}

1062

htmlStartCloseIndexinitialized = 1;

1063

}

1064

1065

/**

1066

* htmlTagLookup:

1067

* @tag: The tag name in lowercase

1068

1069

* Lookup the HTML tag in the ElementTable

1070

1071

* Returns the related htmlElemDescPtr or NULL if not found.

1072

1073

const htmlElemDesc *

1074

htmlTagLookup(const xmlChar *tag) {

1075

unsigned int i;

1076

1077

for (i = 0; i < (sizeof(html40ElementTable) /

1078

sizeof(html40ElementTable[0]));i++) {

1079

if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))

1080

return((htmlElemDescPtr) &html40ElementTable[i]);

1081

}

1082

return(NULL);

1083

}

1084

1085

/**

1086

* htmlGetEndPriority:

1087

* @name: The name of the element to look up the priority for.

1088

1089

* Return value: The "endtag" priority.

1090

**/

1091

static int

1092

htmlGetEndPriority (const xmlChar *name) {

1093

int i = 0;

1094

1095

while ((htmlEndPriority[i].name != NULL) &&

1096

(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))

1097

i++;

1098

1099

return(htmlEndPriority[i].priority);

1100

}

1101

1102

1103

/**

1104

* htmlCheckAutoClose:

1105

* @newtag: The new tag name

1106

* @oldtag: The old tag name

1107

1108

* Checks whether the new tag is one of the registered valid tags for

1109

* closing old.

1110

* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.

1111

1112

* Returns 0 if no, 1 if yes.

1113

1114

static int

1115

htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)

1116

{

1117

int i, indx;

1118

const char **closed = NULL;

1119

1120

if (htmlStartCloseIndexinitialized == 0)

1121

htmlInitAutoClose();

1122

1123

/* inefficient, but not a big deal */

1124

for (indx = 0; indx < 100; indx++) {

1125

closed = htmlStartCloseIndex[indx];

1126

if (closed == NULL)

1127

return (0);

1128

if (xmlStrEqual(BAD_CAST * closed, newtag))

1129

break;

1130

}

1131

1132

i = closed - htmlStartClose;

1133

i++;

1134

while (htmlStartClose[i] != NULL) {

1135

if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {

1136

return (1);

1137

}

1138

i++;

1139

}

1140

return (0);

1141

}

1142

1143

/**

1144

* htmlAutoCloseOnClose:

1145

* @ctxt: an HTML parser context

1146

* @newtag: The new tag name

1147

* @force: force the tag closure

1148

1149

* The HTML DTD allows an ending tag to implicitly close other tags.

1150

1151

static void

1152

htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)

1153

{

1154

const htmlElemDesc *info;

1155

int i, priority;

1156

1157

priority = htmlGetEndPriority(newtag);

1158

1159

for (i = (ctxt->nameNr - 1); i >= 0; i--) {

1160

1161

if (xmlStrEqual(newtag, ctxt->nameTab[i]))

1162

break;

1163

1164

* A missplaced endtag can only close elements with lower

1165

* or equal priority, so if we find an element with higher

1166

* priority before we find an element with

1167

* matching name, we just ignore this endtag

1168

1169

if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)

1170

return;

1171

}

1172

if (i < 0)

1173

return;

1174

1175

while (!xmlStrEqual(newtag, ctxt->name)) {

1176

info = htmlTagLookup(ctxt->name);

1177

if ((info != NULL) && (info->endTag == 3)) {

1178

htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,

1179

"Opening and ending tag mismatch: %s and %s\n",

1180

newtag, ctxt->name);

1181

}

1182

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

1183

ctxt->sax->endElement(ctxt->userData, ctxt->name);

1184

htmlnamePop(ctxt);

1185

}

1186

}

1187

1188

/**

1189

* htmlAutoCloseOnEnd:

1190

* @ctxt: an HTML parser context

1191

1192

* Close all remaining tags at the end of the stream

1193

1194

static void

1195

htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)

1196

{

1197

int i;

1198

1199

if (ctxt->nameNr == 0)

1200

return;

1201

for (i = (ctxt->nameNr - 1); i >= 0; i--) {

1202

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

1203

ctxt->sax->endElement(ctxt->userData, ctxt->name);

1204

htmlnamePop(ctxt);

1205

}

1206

}

1207

1208

/**

1209

* htmlAutoClose:

1210

* @ctxt: an HTML parser context

1211

* @newtag: The new tag name or NULL

1212

1213

* The HTML DTD allows a tag to implicitly close other tags.

1214

* The list is kept in htmlStartClose array. This function is

1215

* called when a new tag has been detected and generates the

1216

* appropriates closes if possible/needed.

1217

* If newtag is NULL this mean we are at the end of the resource

1218

* and we should check

1219

1220

static void

1221

htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)

1222

{

1223

while ((newtag != NULL) && (ctxt->name != NULL) &&

1224

(htmlCheckAutoClose(newtag, ctxt->name))) {

1225

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

1226

ctxt->sax->endElement(ctxt->userData, ctxt->name);

1227

htmlnamePop(ctxt);

1228

}

1229

if (newtag == NULL) {

1230

htmlAutoCloseOnEnd(ctxt);

1231

return;

1232

}

1233

while ((newtag == NULL) && (ctxt->name != NULL) &&

1234

((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||

1235

(xmlStrEqual(ctxt->name, BAD_CAST "body")) ||

1236

(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {

1237

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

1238

ctxt->sax->endElement(ctxt->userData, ctxt->name);

1239

htmlnamePop(ctxt);

1240

}

1241

}

1242

1243

/**

1244

* htmlAutoCloseTag:

1245

* @doc: the HTML document

1246

* @name: The tag name

1247

* @elem: the HTML element

1248

1249

* The HTML DTD allows a tag to implicitly close other tags.

1250

* The list is kept in htmlStartClose array. This function checks

1251

* if the element or one of it's children would autoclose the

1252

* given tag.

1253

1254

* Returns 1 if autoclose, 0 otherwise

1255

1256

int

1257

htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {

1258

htmlNodePtr child;

1259

1260

if (elem == NULL) return(1);

1261

if (xmlStrEqual(name, elem->name)) return(0);

1262

if (htmlCheckAutoClose(elem->name, name)) return(1);

1263

child = elem->children;

1264

while (child != NULL) {

1265

if (htmlAutoCloseTag(doc, name, child)) return(1);

1266

child = child->next;

1267

}

1268

return(0);

1269

}

1270

1271

/**

1272

* htmlIsAutoClosed:

1273

* @doc: the HTML document

1274

* @elem: the HTML element

1275

1276

* The HTML DTD allows a tag to implicitly close other tags.

1277

* The list is kept in htmlStartClose array. This function checks

1278

* if a tag is autoclosed by one of it's child

1279

1280

* Returns 1 if autoclosed, 0 otherwise

1281

1282

int

1283

htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {

1284

htmlNodePtr child;

1285

1286

if (elem == NULL) return(1);

1287

child = elem->children;

1288

while (child != NULL) {

1289

if (htmlAutoCloseTag(doc, elem->name, child)) return(1);

1290

child = child->next;

1291

}

1292

return(0);

1293

}

1294

1295

/**

1296

* htmlCheckImplied:

1297

* @ctxt: an HTML parser context

1298

* @newtag: The new tag name

1299

1300

* The HTML DTD allows a tag to exists only implicitly

1301

* called when a new tag has been detected and generates the

1302

* appropriates implicit tags if missing

1303

1304

static void

1305

htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {

1306

if (!htmlOmittedDefaultValue)

1307

return;

1308

if (xmlStrEqual(newtag, BAD_CAST"html"))

1309

return;

1310

if (ctxt->nameNr <= 0) {

1311

htmlnamePush(ctxt, BAD_CAST"html");

1312

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

1313

ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);

1314

}

1315

if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))

1316

return;

1317

if ((ctxt->nameNr <= 1) &&

1318

((xmlStrEqual(newtag, BAD_CAST"script")) ||

1319

(xmlStrEqual(newtag, BAD_CAST"style")) ||

1320

(xmlStrEqual(newtag, BAD_CAST"meta")) ||

1321

(xmlStrEqual(newtag, BAD_CAST"link")) ||

1322

(xmlStrEqual(newtag, BAD_CAST"title")) ||

1323

(xmlStrEqual(newtag, BAD_CAST"base")))) {

1324

1325

* dropped OBJECT ... i you put it first BODY will be

1326

* assumed !

1327

1328

htmlnamePush(ctxt, BAD_CAST"head");

1329

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

1330

ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);

1331

} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&

1332

(!xmlStrEqual(newtag, BAD_CAST"frame")) &&

1333

(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {

1334

int i;

1335

for (i = 0;i < ctxt->nameNr;i++) {

1336

if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {

1337

return;

1338

}

1339

if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {

1340

return;

1341

}

1342

}

1343

1344

htmlnamePush(ctxt, BAD_CAST"body");

1345

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

1346

ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);

1347

}

1348

}

1349

1350

/**

1351

* htmlCheckParagraph

1352

* @ctxt: an HTML parser context

1353

1354

* Check whether a p element need to be implied before inserting

1355

* characters in the current element.

1356

1357

* Returns 1 if a paragraph has been inserted, 0 if not and -1

1358

* in case of error.

1359

1360

1361

static int

1362

htmlCheckParagraph(htmlParserCtxtPtr ctxt) {

1363

const xmlChar *tag;

1364

int i;

1365

1366

if (ctxt == NULL)

1367

return(-1);

1368

tag = ctxt->name;

1369

if (tag == NULL) {

1370

htmlAutoClose(ctxt, BAD_CAST"p");

1371

htmlCheckImplied(ctxt, BAD_CAST"p");

1372

htmlnamePush(ctxt, BAD_CAST"p");

1373

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

1374

ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);

1375

return(1);

1376

}

1377

if (!htmlOmittedDefaultValue)

1378

return(0);

1379

for (i = 0; htmlNoContentElements[i] != NULL; i++) {

1380

if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {

1381

htmlAutoClose(ctxt, BAD_CAST"p");

1382

htmlCheckImplied(ctxt, BAD_CAST"p");

1383

htmlnamePush(ctxt, BAD_CAST"p");

1384

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

1385

ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);

1386

return(1);

1387

}

1388

}

1389

return(0);

1390

}

1391

1392

/**

1393

* htmlIsScriptAttribute:

1394

* @name: an attribute name

1395

1396

* Check if an attribute is of content type Script

1397

1398

* Returns 1 is the attribute is a script 0 otherwise

1399

1400

int

1401

htmlIsScriptAttribute(const xmlChar *name) {

1402

unsigned int i;

1403

1404

if (name == NULL)

1405

return(0);

1406

1407

* all script attributes start with 'on'

1408

1409

if ((name[0] != 'o') || (name[1] != 'n'))

1410

return(0);

1411

for (i = 0;

1412

i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);

1413

i++) {

1414

if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))

1415

return(1);

1416

}

1417

return(0);

1418

}

1419

1420

/************************************************************************

1421

* *

1422

* The list of HTML predefined entities *

1423

* *

1424

************************************************************************/

1425

1426

1427

static const htmlEntityDesc html40EntitiesTable[] = {

1428

1429

/*
 * the 4 absolute ones, plus apostrophe.
 */

1430

1431

{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },

1432

{ 38, "amp", "ampersand, U+0026 ISOnum" },

1433

{ 39, "apos", "single quote" },

1434

{ 60, "lt", "less-than sign, U+003C ISOnum" },

1435

{ 62, "gt", "greater-than sign, U+003E ISOnum" },

1436

1437

1438

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */

1440

1441

{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },

1442

{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },

1443

{ 162, "cent", "cent sign, U+00A2 ISOnum" },

1444

{ 163, "pound","pound sign, U+00A3 ISOnum" },

1445

{ 164, "curren","currency sign, U+00A4 ISOnum" },

1446

{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },

1447

{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },

1448

{ 167, "sect", "section sign, U+00A7 ISOnum" },

1449

{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },

1450

{ 169, "copy", "copyright sign, U+00A9 ISOnum" },

1451

{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },

1452

{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },

1453

{ 172, "not", "not sign, U+00AC ISOnum" },

1454

{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },

1455

{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },

1456

{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },

1457

{ 176, "deg", "degree sign, U+00B0 ISOnum" },

1458

{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },

1459

{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },

1460

{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },

1461

{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },

1462

{ 181, "micro","micro sign, U+00B5 ISOnum" },

1463

{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },

1464

{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },

1465

{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },

1466

{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },

1467

{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },

1468

{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },

1469

{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },

1470

{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },

1471

{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },

1472

{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },

1473

{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },

1474

{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },

1475

{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },

1476

{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },

1477

{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },

1478

{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },

1479

{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },

1480

{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },

1481

{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },

1482

{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },

1483

{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },

1484

{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },

1485

{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },

1486

{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },

1487

{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },

1488

{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },

1489

{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },

1490

{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },

1491

{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },

1492

{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },

1493

{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },

1494

{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },

1495

{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },

1496

{ 215, "times","multiplication sign, U+00D7 ISOnum" },

1497

{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },

1498

{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },

1499

{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },

1500

{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },

1501

{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },

1502

{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },

1503

{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },

1504

{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },

1505

{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },

1506

{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },

1507

{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },

1508

{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },

1509

{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },

1510

{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },

1511

{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },

1512

{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },

1513

{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },

1514

{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },

1515

{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },

1516

{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },

1517

{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },

1518

{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },

1519

{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },

1520

{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },

1521

{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },

1522

{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },

1523

{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },

1524

{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },

1525

{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },

1526

{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },

1527

{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },

1528

{ 247, "divide","division sign, U+00F7 ISOnum" },

1529

{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },

1530

{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },

1531

{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },

1532

{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },

1533

{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },

1534

{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },

1535

{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },

1536

{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

1537

1538

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },

1539

{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },

1540

{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },

1541

{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },

1542

{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

1543

1544

1545

/*
 * Anything below should really be kept as entity references
 */

1546

1547

{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

1548

1549

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },

1550

{ 732, "tilde","small tilde, U+02DC ISOdia" },

1551

1552

{ 913, "Alpha","greek capital letter alpha, U+0391" },

1553

{ 914, "Beta", "greek capital letter beta, U+0392" },

1554

{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },

1555

{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },

1556

{ 917, "Epsilon","greek capital letter epsilon, U+0395" },

1557

{ 918, "Zeta", "greek capital letter zeta, U+0396" },

1558

{ 919, "Eta", "greek capital letter eta, U+0397" },

1559

{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },

1560

{ 921, "Iota", "greek capital letter iota, U+0399" },

1561

{ 922, "Kappa","greek capital letter kappa, U+039A" },

1562

{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },

1563

{ 924, "Mu", "greek capital letter mu, U+039C" },

1564

{ 925, "Nu", "greek capital letter nu, U+039D" },

1565

{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },

1566

{ 927, "Omicron","greek capital letter omicron, U+039F" },

1567

{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },

1568

{ 929, "Rho", "greek capital letter rho, U+03A1" },

1569

{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },

1570

{ 932, "Tau", "greek capital letter tau, U+03A4" },

1571

{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },

1572

{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },

1573

{ 935, "Chi", "greek capital letter chi, U+03A7" },

1574

{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },

1575

{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

1576

1577

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },

1578

{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },

1579

{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },

1580

{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },

1581

{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },

1582

{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },

1583

{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },

1584

{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },

1585

{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },

1586

{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },

1587

{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },

1588

{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },

1589

{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },

1590

{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },

1591

{ 959, "omicron","greek small letter omicron, U+03BF NEW" },

1592

{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },

1593

{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },

1594

{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },

1595

{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },

1596

{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },

1597

{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },

1598

{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },

1599

{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },

1600

{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },

1601

{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },

1602

{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },

1603

{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },

1604

{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

1605

1606

{ 8194, "ensp", "en space, U+2002 ISOpub" },

1607

{ 8195, "emsp", "em space, U+2003 ISOpub" },

1608

{ 8201, "thinsp","thin space, U+2009 ISOpub" },

1609

{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },

1610

{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },

1611

{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },

1612

{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },

1613

{ 8211, "ndash","en dash, U+2013 ISOpub" },

1614

{ 8212, "mdash","em dash, U+2014 ISOpub" },

1615

{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },

1616

{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },

1617

{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },

1618

{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },

1619

{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },

1620

{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },

1621

{ 8224, "dagger","dagger, U+2020 ISOpub" },

1622

{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

1623

1624

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },

1625

{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

1626

1627

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

1628

1629

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },

1630

{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

1631

1632

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },

1633

{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

1634

1635

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },

1636

{ 8260, "frasl","fraction slash, U+2044 NEW" },

1637

1638

{ 8364, "euro", "euro sign, U+20AC NEW" },

1639

1640

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },

1641

{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },

1642

{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },

1643

{ 8482, "trade","trade mark sign, U+2122 ISOnum" },

1644

{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },

1645

{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },

1646

{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },

1647

{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },

1648

{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },

1649

{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },

1650

{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },

1651

{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },

1652

{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },

1653

{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },

1654

{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },

1655

{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

1656

1657

{ 8704, "forall","for all, U+2200 ISOtech" },

1658

{ 8706, "part", "partial differential, U+2202 ISOtech" },

1659

{ 8707, "exist","there exists, U+2203 ISOtech" },

1660

{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },

1661

{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },

1662

{ 8712, "isin", "element of, U+2208 ISOtech" },

1663

{ 8713, "notin","not an element of, U+2209 ISOtech" },

1664

{ 8715, "ni", "contains as member, U+220B ISOtech" },

1665

{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },

1666

{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },

1667

{ 8722, "minus","minus sign, U+2212 ISOtech" },

1668

{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },

1669

{ 8730, "radic","square root = radical sign, U+221A ISOtech" },

1670

{ 8733, "prop", "proportional to, U+221D ISOtech" },

1671

{ 8734, "infin","infinity, U+221E ISOtech" },

1672

{ 8736, "ang", "angle, U+2220 ISOamso" },

1673

{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },

1674

{ 8744, "or", "logical or = vee, U+2228 ISOtech" },

1675

{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },

1676

{ 8746, "cup", "union = cup, U+222A ISOtech" },

1677

{ 8747, "int", "integral, U+222B ISOtech" },

1678

{ 8756, "there4","therefore, U+2234 ISOtech" },

1679

{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },

1680

{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },

1681

{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },

1682

{ 8800, "ne", "not equal to, U+2260 ISOtech" },

1683

{ 8801, "equiv","identical to, U+2261 ISOtech" },

1684

{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },

1685

{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },

1686

{ 8834, "sub", "subset of, U+2282 ISOtech" },

1687

{ 8835, "sup", "superset of, U+2283 ISOtech" },

1688

{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },

1689

{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },

1690

{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },

1691

{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },

1692

{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },

1693

{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },

1694

{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },

1695

{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },

1696

{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },

1697

{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },

1698

{ 8971, "rfloor","right floor, U+230B ISOamsc" },

1699

{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },

1700

{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },

1701

{ 9674, "loz", "lozenge, U+25CA ISOpub" },

1702

1703

{ 9824, "spades","black spade suit, U+2660 ISOpub" },

1704

{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },

1705

{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },

1706

{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

1707

1708

};

1709

1710

/************************************************************************

1711

* *

1712

* Commodity functions to handle entities *

1713

* *

1714

************************************************************************/

1715

1716

1717

/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * If the reallocation fails, the error is reported through
 * htmlErrMemory(ctxt, ...), the old buffer is freed and the enclosing
 * function returns NULL. The expansion site must therefore have `ctxt`
 * and `buffer##_size` in scope and must return a pointer type.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}

1730

1731

/**

1732

* htmlEntityLookup:

1733

* @name: the entity name

1734

1735

* Lookup the given entity in EntitiesTable

1736

1737

* TODO: the linear scan is really ugly, an hash table is really needed.

1738

1739

* Returns the associated htmlEntityDescPtr if found, NULL otherwise.

1740

1741

const htmlEntityDesc *

1742

htmlEntityLookup(const xmlChar *name) {

1743

unsigned int i;

1744

1745

for (i = 0;i < (sizeof(html40EntitiesTable)/

1746

sizeof(html40EntitiesTable[0]));i++) {

1747

if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {

1748

return((htmlEntityDescPtr) &html40EntitiesTable[i]);

1749

}

1750

}

1751

return(NULL);

1752

}

1753

1754

/**

1755

* htmlEntityValueLookup:

1756

* @value: the entity's unicode value

1757

1758

* Lookup the given entity in EntitiesTable

1759

1760

* TODO: the linear scan is really ugly, an hash table is really needed.

1761

1762

* Returns the associated htmlEntityDescPtr if found, NULL otherwise.

1763

1764

const htmlEntityDesc *

1765

htmlEntityValueLookup(unsigned int value) {

1766

unsigned int i;

1767

1768

for (i = 0;i < (sizeof(html40EntitiesTable)/

1769

sizeof(html40EntitiesTable[0]));i++) {

1770

if (html40EntitiesTable[i].value >= value) {

1771

if (html40EntitiesTable[i].value > value)

1772

break;

1773

return((htmlEntityDescPtr) &html40EntitiesTable[i]);

1774

}

1775

}

1776

return(NULL);

1777

}

1778

1779

/**

1780

* UTF8ToHtml:

1781

* @out: a pointer to an array of bytes to store the result

1782

* @outlen: the length of @out

1783

* @in: a pointer to an array of UTF-8 chars

1784

* @inlen: the length of @in

1785

1786

* Take a block of UTF-8 chars in and try to convert it to an ASCII

1787

* plus HTML entities block of chars out.

1788

1789

* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise

1790

* The value of @inlen after return is the number of octets consumed

1791

* as the return value is positive, else unpredictable.

1792

* The value of @outlen after return is the number of octets consumed.

1793

1794

int

1795

UTF8ToHtml(unsigned char* out, int *outlen,

1796

const unsigned char* in, int *inlen) {

1797

const unsigned char* processed = in;

1798

const unsigned char* outend;

1799

const unsigned char* outstart = out;

1800

const unsigned char* instart = in;

1801

const unsigned char* inend;

1802

unsigned int c, d;

1803

int trailing;

1804

1805

if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);

1806

if (in == NULL) {

1807

1808

* initialization nothing to do

1809

1810

*outlen = 0;

1811

*inlen = 0;

1812

return(0);

1813

}

1814

inend = in + (*inlen);

1815

outend = out + (*outlen);

1816

while (in < inend) {

1817

d = *in++;

1818

if (d < 0x80) { c= d; trailing= 0; }

1819

else if (d < 0xC0) {

1820

/* trailing byte in leading position */

1821

*outlen = out - outstart;

1822

*inlen = processed - instart;

1823

return(-2);

1824

} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }

1825

else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }

1826

else if (d < 0xF8) { c= d & 0x07; trailing= 3; }

1827

else {

1828

/* no chance for this in Ascii */

1829

*outlen = out - outstart;

1830

*inlen = processed - instart;

1831

return(-2);

1832

}

1833

1834

if (inend - in < trailing) {

1835

break;

1836

}

1837

1838

for ( ; trailing; trailing--) {

1839

if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))

1840

break;

1841

c <<= 6;

1842

c |= d & 0x3F;

1843

}

1844

1845

/* assertion: c is a single UTF-4 value */

1846

if (c < 0x80) {

1847

if (out + 1 >= outend)

1848

break;

1849

*out++ = c;

1850

} else {

1851

int len;

1852

const htmlEntityDesc * ent;

1853

const char *cp;

1854

char nbuf[16];

1855

1856

1857

* Try to lookup a predefined HTML entity for it

1858

1859

1860

ent = htmlEntityValueLookup(c);

1861

if (ent == NULL) {

1862

snprintf(nbuf, sizeof(nbuf), "#%u", c);

1863

cp = nbuf;

1864

}

1865

else

1866

cp = ent->name;

1867

len = strlen(cp);

1868

if (out + 2 + len >= outend)

1869

break;

1870

*out++ = '&';

1871

memcpy(out, cp, len);

1872

out += len;

1873

*out++ = ';';

1874

}

1875

processed = in;

1876

}

1877

*outlen = out - outstart;

1878

*inlen = processed - instart;

1879

return(0);

1880

}

1881

1882

/**

1883

* htmlEncodeEntities:

1884

* @out: a pointer to an array of bytes to store the result

1885

* @outlen: the length of @out

1886

* @in: a pointer to an array of UTF-8 chars

1887

* @inlen: the length of @in

1888

* @quoteChar: the quote character to escape (' or ") or zero.

1889

1890

* Take a block of UTF-8 chars in and try to convert it to an ASCII

1891

* plus HTML entities block of chars out.

1892

1893

* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise

1894

* The value of @inlen after return is the number of octets consumed

1895

* as the return value is positive, else unpredictable.

1896

* The value of @outlen after return is the number of octets consumed.

1897

1898

int

1899

htmlEncodeEntities(unsigned char* out, int *outlen,

1900

const unsigned char* in, int *inlen, int quoteChar) {

1901

const unsigned char* processed = in;

1902

const unsigned char* outend;

1903

const unsigned char* outstart = out;

1904

const unsigned char* instart = in;

1905

const unsigned char* inend;

1906

unsigned int c, d;

1907

int trailing;

1908

1909

if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))

1910

return(-1);

1911

outend = out + (*outlen);

1912

inend = in + (*inlen);

1913

while (in < inend) {

1914

d = *in++;

1915

if (d < 0x80) { c= d; trailing= 0; }

1916

else if (d < 0xC0) {

1917

/* trailing byte in leading position */

1918

*outlen = out - outstart;

1919

*inlen = processed - instart;

1920

return(-2);

1921

} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }

1922

else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }

1923

else if (d < 0xF8) { c= d & 0x07; trailing= 3; }

1924

else {

1925

/* no chance for this in Ascii */

1926

*outlen = out - outstart;

1927

*inlen = processed - instart;

1928

return(-2);

1929

}

1930

1931

if (inend - in < trailing)

1932

break;

1933

1934

while (trailing--) {

1935

if (((d= *in++) & 0xC0) != 0x80) {

1936

*outlen = out - outstart;

1937

*inlen = processed - instart;

1938

return(-2);

1939

}

1940

c <<= 6;

1941

c |= d & 0x3F;

1942

}

1943

1944

/* assertion: c is a single UTF-4 value */

1945

if ((c < 0x80) && (c != (unsigned int) quoteChar) &&

1946

(c != '&') && (c != '<') && (c != '>')) {

1947

if (out >= outend)

1948

break;

1949

*out++ = c;

1950

} else {

1951

const htmlEntityDesc * ent;

1952

const char *cp;

1953

char nbuf[16];

1954

int len;

1955

1956

1957

* Try to lookup a predefined HTML entity for it

1958

1959

ent = htmlEntityValueLookup(c);

1960

if (ent == NULL) {

1961

snprintf(nbuf, sizeof(nbuf), "#%u", c);

1962

cp = nbuf;

1963

}

1964

else

1965

cp = ent->name;

1966

len = strlen(cp);

1967

if (out + 2 + len > outend)

1968

break;

1969

*out++ = '&';

1970

memcpy(out, cp, len);

1971

out += len;

1972

*out++ = ';';

1973

}

1974

processed = in;

1975

}

1976

*outlen = out - outstart;

1977

*inlen = processed - instart;

1978

return(0);

1979

}

1980

1981

/************************************************************************

1982

* *

1983

* Commodity functions to handle streams *

1984

* *

1985

************************************************************************/

1986

1987

/**

1988

* htmlNewInputStream:

1989

* @ctxt: an HTML parser context

1990

1991

* Create a new input stream structure

1992

* Returns the new input stream or NULL

1993

1994

static htmlParserInputPtr

1995

htmlNewInputStream(htmlParserCtxtPtr ctxt) {

1996

htmlParserInputPtr input;

1997

1998

input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));

1999

if (input == NULL) {

2000

htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");

2001

return(NULL);

2002

}

2003

memset(input, 0, sizeof(htmlParserInput));

2004

input->filename = NULL;

2005

input->directory = NULL;

2006

input->base = NULL;

2007

input->cur = NULL;

2008

input->buf = NULL;

2009

input->line = 1;

2010

input->col = 1;

2011

input->buf = NULL;

2012

input->free = NULL;

2013

input->version = NULL;

2014

input->consumed = 0;

2015

input->length = 0;

2016

return(input);

2017

}

2018

2019

2020

/************************************************************************

2021

* *

2022

* Commodity functions, cleanup needed ? *

2023

* *

2024

************************************************************************/

2025

2026

/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};

2039

2040

/**

2041

* areBlanks:

2042

* @ctxt: an HTML parser context

2043

* @str: a xmlChar *

2044

* @len: the size of @str

2045

2046

* Is this a sequence of blank chars that one can ignore ?

2047

2048

* Returns 1 if ignorable 0 otherwise.

2049

2050

2051

static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {

2052

unsigned int i;

2053

int j;

2054

xmlNodePtr lastChild;

2055

xmlDtdPtr dtd;

2056

2057

for (j = 0;j < len;j++)

2058

if (!(IS_BLANK_CH(str[j]))) return(0);

2059

2060

if (CUR == 0) return(1);

2061

if (CUR != '<') return(0);

2062

if (ctxt->name == NULL)

2063

return(1);

2064

if (xmlStrEqual(ctxt->name, BAD_CAST"html"))

2065

return(1);

2066

if (xmlStrEqual(ctxt->name, BAD_CAST"head"))

2067

return(1);

2068

2069

/* Only strip CDATA children of the body tag for strict HTML DTDs */

2070

if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {

2071

dtd = xmlGetIntSubset(ctxt->myDoc);

2072

if (dtd != NULL && dtd->ExternalID != NULL) {

2073

if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||

2074

!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))

2075

return(1);

2076

}

2077

}

2078

2079

if (ctxt->node == NULL) return(0);

2080

lastChild = xmlGetLastChild(ctxt->node);

2081

while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))

2082

lastChild = lastChild->prev;

2083

if (lastChild == NULL) {

2084

if ((ctxt->node->type != XML_ELEMENT_NODE) &&

2085

(ctxt->node->content != NULL)) return(0);

2086

/* keep ws in constructs like ... ...

2087

for all tags "b" allowing PCDATA */

2088

for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {

2089

if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {

2090

return(0);

2091

}

2092

}

2093

} else if (xmlNodeIsText(lastChild)) {

2094

return(0);

2095

} else {

2096

/* keep ws in constructs like xy z

2097

for all tags "p" allowing PCDATA */

2098

for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {

2099

if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {

2100

return(0);

2101

}

2102

}

2103

}

2104

return(1);

2105

}

2106

2107

/**

2108

* htmlNewDocNoDtD:

2109

* @URI: URI for the dtd, or NULL

2110

* @ExternalID: the external ID of the DTD, or NULL

2111

2112

* Creates a new HTML document without a DTD node if @URI and @ExternalID

2113

* are NULL

2114

2115

* Returns a new document, do not initialize the DTD if not provided

2116

2117

htmlDocPtr

2118

htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {

2119

xmlDocPtr cur;

2120

2121

2122

* Allocate a new document and fill the fields.

2123

2124

cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));

2125

if (cur == NULL) {

2126

htmlErrMemory(NULL, "HTML document creation failed\n");

2127

return(NULL);

2128

}

2129

memset(cur, 0, sizeof(xmlDoc));

2130

2131

cur->type = XML_HTML_DOCUMENT_NODE;

2132

cur->version = NULL;

2133

cur->intSubset = NULL;

2134

cur->doc = cur;

2135

cur->name = NULL;

2136

cur->children = NULL;

2137

cur->extSubset = NULL;

2138

cur->oldNs = NULL;

2139

cur->encoding = NULL;

2140

cur->standalone = 1;

2141

cur->compression = 0;

2142

cur->ids = NULL;

2143

cur->refs = NULL;

2144

cur->_private = NULL;

2145

cur->charset = XML_CHAR_ENCODING_UTF8;

2146

if ((ExternalID != NULL) ||

2147

(URI != NULL))

2148

xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);

2149

return(cur);

2150

}

2151

2152

/**

2153

* htmlNewDoc:

2154

* @URI: URI for the dtd, or NULL

2155

* @ExternalID: the external ID of the DTD, or NULL

2156

2157

* Creates a new HTML document

2158

2159

* Returns a new document

2160

2161

htmlDocPtr

2162

htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {

2163

if ((URI == NULL) && (ExternalID == NULL))

2164

return(htmlNewDocNoDtD(

2165

BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",

2166

BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));

2167

2168

return(htmlNewDocNoDtD(URI, ExternalID));

2169

}

2170

2171

2172

/************************************************************************

2173

* *

2174

* The parser itself *

2175

* Relates to http://www.w3.org/TR/html40 *

2176

* *

2177

************************************************************************/

2178

2179

/************************************************************************

2180

* *

2181

* The parser itself *

2182

* *

2183

************************************************************************/

2184

2185

/* Forward declaration: htmlParseName() falls back to this routine for
 * names that its plain-ASCII fast path cannot handle. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);

2186

2187

/**

2188

* htmlParseHTMLName:

2189

* @ctxt: an HTML parser context

2190

2191

* parse an HTML tag or attribute name, note that we convert it to lowercase

2192

* since HTML names are not case-sensitive.

2193

2194

* Returns the Tag Name parsed or NULL

2195

2196

2197

static const xmlChar *

2198

htmlParseHTMLName(htmlParserCtxtPtr ctxt) {

2199

int i = 0;

2200

xmlChar loc[HTML_PARSER_BUFFER_SIZE];

2201

2202

if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&

2203

(CUR != ':')) return(NULL);

2204

2205

while ((i < HTML_PARSER_BUFFER_SIZE) &&

2206

((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||

2207

(CUR == ':') || (CUR == '-') || (CUR == '_'))) {

2208

if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;

2209

else loc[i] = CUR;

2210

i++;

2211

2212

NEXT;

2213

}

2214

2215

return(xmlDictLookup(ctxt->dict, loc, i));

2216

}

2217

2218

2219

/**

2220

* htmlParseHTMLName_nonInvasive:

2221

* @ctxt: an HTML parser context

2222

2223

* parse an HTML tag or attribute name, note that we convert it to lowercase

2224

* since HTML names are not case-sensitive, this doesn't consume the data

2225

* from the stream, it's a look-ahead

2226

2227

* Returns the Tag Name parsed or NULL

2228

2229

2230

static const xmlChar *

2231

htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {

2232

int i = 0;

2233

xmlChar loc[HTML_PARSER_BUFFER_SIZE];

2234

2235

if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&

2236

(NXT(1) != ':')) return(NULL);

2237

2238

while ((i < HTML_PARSER_BUFFER_SIZE) &&

2239

((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||

2240

(NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {

2241

if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;

2242

else loc[i] = NXT(1+i);

2243

i++;

2244

}

2245

2246

return(xmlDictLookup(ctxt->dict, loc, i));

2247

}

2248

2249

2250

/**

2251

* htmlParseName:

2252

* @ctxt: an HTML parser context

2253

2254

* parse an HTML name, this routine is case sensitive.

2255

2256

* Returns the Name parsed or NULL

2257

2258

2259

static const xmlChar *

2260

htmlParseName(htmlParserCtxtPtr ctxt) {

2261

const xmlChar *in;

2262

const xmlChar *ret;

2263

int count = 0;

2264

2265

GROW;

2266

2267

2268

* Accelerator for simple ASCII names

2269

2270

in = ctxt->input->cur;

2271

if (((*in >= 0x61) && (*in <= 0x7A)) ||

2272

((*in >= 0x41) && (*in <= 0x5A)) ||

2273

(*in == '_') || (*in == ':')) {

2274

in++;

2275

while (((*in >= 0x61) && (*in <= 0x7A)) ||

2276

((*in >= 0x41) && (*in <= 0x5A)) ||

2277

((*in >= 0x30) && (*in <= 0x39)) ||

2278

(*in == '_') || (*in == '-') ||

2279

(*in == ':') || (*in == '.'))

2280

in++;

2281

if ((*in > 0) && (*in < 0x80)) {

2282

count = in - ctxt->input->cur;

2283

ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);

2284

ctxt->input->cur = in;

2285

ctxt->nbChars += count;

2286

ctxt->input->col += count;

2287

return(ret);

2288

}

2289

}

2290

return(htmlParseNameComplex(ctxt));

2291

}

2292

2293

static const xmlChar *

2294

htmlParseNameComplex(xmlParserCtxtPtr ctxt) {

2295

int len = 0, l;

2296

int c;

2297

int count = 0;

2298

2299

2300

* Handler for more complex cases

2301

2302

GROW;

2303

c = CUR_CHAR(l);

2304

if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */

2305

(!IS_LETTER(c) && (c != '_') &&

2306

(c != ':'))) {

2307

return(NULL);

2308

}

2309

2310

while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */

2311

((IS_LETTER(c)) || (IS_DIGIT(c)) ||

2312

(c == '.') || (c == '-') ||

2313

(c == '_') || (c == ':') ||

2314

(IS_COMBINING(c)) ||

2315

(IS_EXTENDER(c)))) {

2316

if (count++ > 100) {

2317

count = 0;

2318

GROW;

2319

}

2320

len += l;

2321

NEXTL(l);

2322

c = CUR_CHAR(l);

2323

}

2324

return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));

2325

}

2326

2327

2328

/**

2329

* htmlParseHTMLAttribute:

2330

* @ctxt: an HTML parser context

2331

* @stop: a char stop value

2332

2333

* parse an HTML attribute value till the stop (quote), if

2334

* stop is 0 then it stops at the first space

2335

2336

* Returns the attribute parsed or NULL

2337

2338

2339

static xmlChar *

2340

htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {

2341

xmlChar *buffer = NULL;

2342

int buffer_size = 0;

2343

xmlChar *out = NULL;

2344

const xmlChar *name = NULL;

2345

const xmlChar *cur = NULL;

2346

const htmlEntityDesc * ent;

2347

2348

2349

* allocate a translation buffer.

2350

2351

buffer_size = HTML_PARSER_BUFFER_SIZE;

2352

buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));

2353

if (buffer == NULL) {

2354

htmlErrMemory(ctxt, "buffer allocation failed\n");

2355

return(NULL);

2356

}

2357

out = buffer;

2358

2359

2360

* Ok loop until we reach one of the ending chars

2361

2362

while ((CUR != 0) && (CUR != stop)) {

2363

if ((stop == 0) && (CUR == '>')) break;

2364

if ((stop == 0) && (IS_BLANK_CH(CUR))) break;

2365

if (CUR == '&') {

2366

if (NXT(1) == '#') {

2367

unsigned int c;

2368

int bits;

2369

2370

c = htmlParseCharRef(ctxt);

2371

if (c < 0x80)

2372

{ *out++ = c; bits= -6; }

2373

else if (c < 0x800)

2374

{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }

2375

else if (c < 0x10000)

2376

{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }

2377

else

2378

{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }

2379

2380

for ( ; bits >= 0; bits-= 6) {

2381

*out++ = ((c >> bits) & 0x3F) | 0x80;

2382

}

2383

2384

if (out - buffer > buffer_size - 100) {

2385

int indx = out - buffer;

2386

2387

growBuffer(buffer);

2388

out = &buffer[indx];

2389

}

2390

} else {

2391

ent = htmlParseEntityRef(ctxt, &name);

2392

if (name == NULL) {

2393

*out++ = '&';

2394

if (out - buffer > buffer_size - 100) {

2395

int indx = out - buffer;

2396

2397

growBuffer(buffer);

2398

out = &buffer[indx];

2399

}

2400

} else if (ent == NULL) {

2401

*out++ = '&';

2402

cur = name;

2403

while (*cur != 0) {

2404

if (out - buffer > buffer_size - 100) {

2405

int indx = out - buffer;

2406

2407

growBuffer(buffer);

2408

out = &buffer[indx];

2409

}

2410

*out++ = *cur++;

2411

}

2412

} else {

2413

unsigned int c;

2414

int bits;

2415

2416

if (out - buffer > buffer_size - 100) {

2417

int indx = out - buffer;

2418

2419

growBuffer(buffer);

2420

out = &buffer[indx];

2421

}

2422

c = ent->value;

2423

if (c < 0x80)

2424

{ *out++ = c; bits= -6; }

2425

else if (c < 0x800)

2426

{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }

2427

else if (c < 0x10000)

2428

{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }

2429

else

2430

{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }

2431

2432

for ( ; bits >= 0; bits-= 6) {

2433

*out++ = ((c >> bits) & 0x3F) | 0x80;

2434

}

2435

}

2436

}

2437

} else {

2438

unsigned int c;

2439

int bits, l;

2440

2441

if (out - buffer > buffer_size - 100) {

2442

int indx = out - buffer;

2443

2444

growBuffer(buffer);

2445

out = &buffer[indx];

2446

}

2447

c = CUR_CHAR(l);

2448

if (c < 0x80)

2449

{ *out++ = c; bits= -6; }

2450

else if (c < 0x800)

2451

{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }

2452

else if (c < 0x10000)

2453

{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }

2454

else

2455

{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }

2456

2457

for ( ; bits >= 0; bits-= 6) {

2458

*out++ = ((c >> bits) & 0x3F) | 0x80;

2459

}

2460

NEXT;

2461

}

2462

}

2463

*out++ = 0;

2464

return(buffer);

2465

}

2466

2467

/**

2468

* htmlParseEntityRef:

2469

* @ctxt: an HTML parser context

2470

* @str: location to store the entity name

2471

2472

* parse an HTML ENTITY references

2473

2474

* [68] EntityRef ::= '&' Name ';'

2475

2476

* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,

2477

* if non-NULL *str will have to be freed by the caller.

2478

2479

const htmlEntityDesc *

2480

htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {

2481

const xmlChar *name;

2482

const htmlEntityDesc * ent = NULL;

2483

2484

if (str != NULL) *str = NULL;

2485

if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);

2486

2487

if (CUR == '&') {

2488

NEXT;

2489

name = htmlParseName(ctxt);

2490

if (name == NULL) {

2491

htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

2492

"htmlParseEntityRef: no name\n", NULL, NULL);

2493

} else {

2494

GROW;

2495

if (CUR == ';') {

2496

if (str != NULL)

2497

*str = name;

2498

2499

2500

* Lookup the entity in the table.

2501

2502

ent = htmlEntityLookup(name);

2503

if (ent != NULL) /* OK that's ugly !!! */

2504

NEXT;

2505

} else {

2506

htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,

2507

"htmlParseEntityRef: expecting ';'\n",

2508

NULL, NULL);

2509

if (str != NULL)

2510

*str = name;

2511

}

2512

}

2513

}

2514

return(ent);

2515

}

2516

2517

/**

2518

* htmlParseAttValue:

2519

* @ctxt: an HTML parser context

2520

2521

* parse a value for an attribute

2522

* Note: the parser won't do substitution of entities here, this

2523

* will be handled later in xmlStringGetNodeList, unless it was

2524

* asked for ctxt->replaceEntities != 0

2525

2526

* Returns the AttValue parsed or NULL.

2527

2528

2529

static xmlChar *

2530

htmlParseAttValue(htmlParserCtxtPtr ctxt) {

2531

xmlChar *ret = NULL;

2532

2533

if (CUR == '"') {

2534

NEXT;

2535

ret = htmlParseHTMLAttribute(ctxt, '"');

2536

if (CUR != '"') {

2537

htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,

2538

"AttValue: \" expected\n", NULL, NULL);

2539

} else

2540

NEXT;

2541

} else if (CUR == '\'') {

2542

NEXT;

2543

ret = htmlParseHTMLAttribute(ctxt, '\'');

2544

if (CUR != '\'') {

2545

htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,

2546

"AttValue: ' expected\n", NULL, NULL);

2547

} else

2548

NEXT;

2549

} else {

2550

2551

* That's an HTMLism, the attribute value may not be quoted

2552

2553

ret = htmlParseHTMLAttribute(ctxt, 0);

2554

if (ret == NULL) {

2555

htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,

2556

"AttValue: no value found\n", NULL, NULL);

2557

}

2558

}

2559

return(ret);

2560

}

2561

2562

/**

2563

* htmlParseSystemLiteral:

2564

* @ctxt: an HTML parser context

2565

2566

* parse an HTML Literal

2567

2568

* [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")

2569

2570

* Returns the SystemLiteral parsed or NULL

2571

2572

2573

static xmlChar *

2574

htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {

2575

const xmlChar *q;

2576

xmlChar *ret = NULL;

2577

2578

if (CUR == '"') {

2579

NEXT;

2580

q = CUR_PTR;

2581

while ((IS_CHAR_CH(CUR)) && (CUR != '"'))

2582

NEXT;

2583

if (!IS_CHAR_CH(CUR)) {

2584

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,

2585

"Unfinished SystemLiteral\n", NULL, NULL);

2586

} else {

2587

ret = xmlStrndup(q, CUR_PTR - q);

2588

NEXT;

2589

}

2590

} else if (CUR == '\'') {

2591

NEXT;

2592

q = CUR_PTR;

2593

while ((IS_CHAR_CH(CUR)) && (CUR != '\''))

2594

NEXT;

2595

if (!IS_CHAR_CH(CUR)) {

2596

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,

2597

"Unfinished SystemLiteral\n", NULL, NULL);

2598

} else {

2599

ret = xmlStrndup(q, CUR_PTR - q);

2600

NEXT;

2601

}

2602

} else {

2603

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,

2604

" or ' expected\n", NULL, NULL);

2605

}

2606

2607

return(ret);

2608

}

2609

2610

/**

2611

* htmlParsePubidLiteral:

2612

* @ctxt: an HTML parser context

2613

2614

* parse an HTML public literal

2615

2616

* [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"

2617

2618

* Returns the PubidLiteral parsed or NULL.

2619

2620

2621

static xmlChar *

2622

htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {

2623

const xmlChar *q;

2624

xmlChar *ret = NULL;

2625

2626

* Name ::= (Letter | '_') (NameChar)*

2627

2628

if (CUR == '"') {

2629

NEXT;

2630

q = CUR_PTR;

2631

while (IS_PUBIDCHAR_CH(CUR)) NEXT;

2632

if (CUR != '"') {

2633

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,

2634

"Unfinished PubidLiteral\n", NULL, NULL);

2635

} else {

2636

ret = xmlStrndup(q, CUR_PTR - q);

2637

NEXT;

2638

}

2639

} else if (CUR == '\'') {

2640

NEXT;

2641

q = CUR_PTR;

2642

while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))

2643

NEXT;

2644

if (CUR != '\'') {

2645

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,

2646

"Unfinished PubidLiteral\n", NULL, NULL);

2647

} else {

2648

ret = xmlStrndup(q, CUR_PTR - q);

2649

NEXT;

2650

}

2651

} else {

2652

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,

2653

"PubidLiteral \" or ' expected\n", NULL, NULL);

2654

}

2655

2656

return(ret);

2657

}

2658

2659

/**

2660

* htmlParseScript:

2661

* @ctxt: an HTML parser context

2662

2663

* parse the content of an HTML SCRIPT or STYLE element

2664

* http://www.w3.org/TR/html4/sgml/dtd.html#Script

2665

* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet

2666

* http://www.w3.org/TR/html4/types.html#type-script

2667

* http://www.w3.org/TR/html4/types.html#h-6.15

2668

* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1

2669

2670

* Script data ( %Script; in the DTD) can be the content of the SCRIPT

2671

* element and the value of intrinsic event attributes. User agents must

2672

* not evaluate script data as HTML markup but instead must pass it on as

2673

* data to a script engine.

2674

* NOTES:

2675

* - The content is passed like CDATA

2676

* - the attributes for style and scripting "onXXX" are also described

2677

* as CDATA but SGML allows entities references in attributes so their

2678

* processing is identical as other attributes

2679

2680

static void

2681

htmlParseScript(htmlParserCtxtPtr ctxt) {

2682

xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];

2683

int nbchar = 0;

2684

int cur,l;

2685

2686

SHRINK;

2687

cur = CUR_CHAR(l);

2688

while (IS_CHAR_CH(cur)) {

2689

if ((cur == '<') && (NXT(1) == '/')) {

2690

2691

* One should break here, the specification is clear:

2692

* Authors should therefore escape "</" within the content.

2693

* Escape mechanisms are specific to each scripting or

2694

* style sheet language.

2695

2696

* In recovery mode, only break if end tag match the

2697

* current tag, effectively ignoring all tags inside the

2698

* script/style block and treating the entire block as

2699

* CDATA.

2700

2701

if (ctxt->recovery) {

2702

if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,

2703

xmlStrlen(ctxt->name)) == 0)

2704

{

2705

break; /* while */

2706

} else {

2707

htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,

2708

"Element %s embeds close tag\n",

2709

ctxt->name, NULL);

2710

}

2711

} else {

2712

if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||

2713

((NXT(2) >= 'a') && (NXT(2) <= 'z')))

2714

{

2715

break; /* while */

2716

}

2717

}

2718

}

2719

COPY_BUF(l,buf,nbchar,cur);

2720

if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {

2721

if (ctxt->sax->cdataBlock!= NULL) {

2722

2723

* Insert as CDATA, which is the same as HTML_PRESERVE_NODE

2724

2725

ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);

2726

} else if (ctxt->sax->characters != NULL) {

2727

ctxt->sax->characters(ctxt->userData, buf, nbchar);

2728

}

2729

nbchar = 0;

2730

}

2731

GROW;

2732

NEXTL(l);

2733

cur = CUR_CHAR(l);

2734

}

2735

2736

if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {

2737

htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,

2738

"Invalid char in CDATA 0x%X\n", cur);

2739

NEXT;

2740

}

2741

2742

if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {

2743

if (ctxt->sax->cdataBlock!= NULL) {

2744

2745

* Insert as CDATA, which is the same as HTML_PRESERVE_NODE

2746

2747

ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);

2748

} else if (ctxt->sax->characters != NULL) {

2749

ctxt->sax->characters(ctxt->userData, buf, nbchar);

2750

}

2751

}

2752

}

2753

2754

2755

/**

2756

* htmlParseCharData:

2757

* @ctxt: an HTML parser context

2758

2759

* parse a CharData section.

2760

* if we are within a CDATA section ']]>' marks an end of section.

2761

2762

* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)

2763

2764

2765

static void

2766

htmlParseCharData(htmlParserCtxtPtr ctxt) {

2767

xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];

2768

int nbchar = 0;

2769

int cur, l;

2770

2771

SHRINK;

2772

cur = CUR_CHAR(l);

2773

while (((cur != '<') || (ctxt->token == '<')) &&

2774

((cur != '&') || (ctxt->token == '&')) &&

2775

(cur != 0)) {

2776

if (!(IS_CHAR(cur))) {

2777

htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,

2778

"Invalid char in CDATA 0x%X\n", cur);

2779

} else {

2780

COPY_BUF(l,buf,nbchar,cur);

2781

}

2782

if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {

2783

2784

* Ok the segment is to be consumed as chars.

2785

2786

if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {

2787

if (areBlanks(ctxt, buf, nbchar)) {

2788

if (ctxt->sax->ignorableWhitespace != NULL)

2789

ctxt->sax->ignorableWhitespace(ctxt->userData,

2790

buf, nbchar);

2791

} else {

2792

htmlCheckParagraph(ctxt);

2793

if (ctxt->sax->characters != NULL)

2794

ctxt->sax->characters(ctxt->userData, buf, nbchar);

2795

}

2796

}

2797

nbchar = 0;

2798

}

2799

NEXTL(l);

2800

cur = CUR_CHAR(l);

2801

if (cur == 0) {

2802

SHRINK;

2803

GROW;

2804

cur = CUR_CHAR(l);

2805

}

2806

}

2807

if (nbchar != 0) {

2808

buf[nbchar] = 0;

2809

2810

2811

* Ok the segment is to be consumed as chars.

2812

2813

if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {

2814

if (areBlanks(ctxt, buf, nbchar)) {

2815

if (ctxt->sax->ignorableWhitespace != NULL)

2816

ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);

2817

} else {

2818

htmlCheckParagraph(ctxt);

2819

if (ctxt->sax->characters != NULL)

2820

ctxt->sax->characters(ctxt->userData, buf, nbchar);

2821

}

2822

}

2823

} else {

2824

2825

* Loop detection

2826

2827

if (cur == 0)

2828

ctxt->instate = XML_PARSER_EOF;

2829

}

2830

}

2831

2832

/**

2833

* htmlParseExternalID:

2834

* @ctxt: an HTML parser context

2835

* @publicID: a xmlChar** receiving PubidLiteral

2836

2837

* Parse an External ID or a Public ID

2838

2839

* [75] ExternalID ::= 'SYSTEM' S SystemLiteral

2840

* | 'PUBLIC' S PubidLiteral S SystemLiteral

2841

2842

* [83] PublicID ::= 'PUBLIC' S PubidLiteral

2843

2844

* Returns the function returns SystemLiteral and in the second

2845

* case publicID receives PubidLiteral, is strict is off

2846

* it is possible to return NULL and have publicID set.

2847

2848

2849

static xmlChar *

2850

htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {

2851

xmlChar *URI = NULL;

2852

2853

if ((UPPER == 'S') && (UPP(1) == 'Y') &&

2854

(UPP(2) == 'S') && (UPP(3) == 'T') &&

2855

(UPP(4) == 'E') && (UPP(5) == 'M')) {

2856

SKIP(6);

2857

if (!IS_BLANK_CH(CUR)) {

2858

htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,

2859

"Space required after 'SYSTEM'\n", NULL, NULL);

2860

}

2861

SKIP_BLANKS;

2862

URI = htmlParseSystemLiteral(ctxt);

2863

if (URI == NULL) {

2864

htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,

2865

"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);

2866

}

2867

} else if ((UPPER == 'P') && (UPP(1) == 'U') &&

2868

(UPP(2) == 'B') && (UPP(3) == 'L') &&

2869

(UPP(4) == 'I') && (UPP(5) == 'C')) {

2870

SKIP(6);

2871

if (!IS_BLANK_CH(CUR)) {

2872

htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,

2873

"Space required after 'PUBLIC'\n", NULL, NULL);

2874

}

2875

SKIP_BLANKS;

2876

*publicID = htmlParsePubidLiteral(ctxt);

2877

if (*publicID == NULL) {

2878

htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,

2879

"htmlParseExternalID: PUBLIC, no Public Identifier\n",

2880

NULL, NULL);

2881

}

2882

SKIP_BLANKS;

2883

if ((CUR == '"') || (CUR == '\'')) {

2884

URI = htmlParseSystemLiteral(ctxt);

2885

}

2886

}

2887

return(URI);

2888

}

2889

2890

/**

2891

* xmlParsePI:

2892

* @ctxt: an XML parser context

2893

2894

* parse an XML Processing Instruction.

2895

2896

* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

2897

2898

static void

2899

htmlParsePI(htmlParserCtxtPtr ctxt) {

2900

xmlChar *buf = NULL;

2901

int len = 0;

2902

int size = HTML_PARSER_BUFFER_SIZE;

2903

int cur, l;

2904

const xmlChar *target;

2905

xmlParserInputState state;

2906

int count = 0;

2907

2908

if ((RAW == '<') && (NXT(1) == '?')) {

2909

state = ctxt->instate;

2910

ctxt->instate = XML_PARSER_PI;

2911

2912

* this is a Processing Instruction.

2913

2914

SKIP(2);

2915

SHRINK;

2916

2917

2918

* Parse the target name and check for special support like

2919

* namespace.

2920

2921

target = htmlParseName(ctxt);

2922

if (target != NULL) {

2923

if (RAW == '>') {

2924

SKIP(1);

2925

2926

2927

* SAX: PI detected.

2928

2929

if ((ctxt->sax) && (!ctxt->disableSAX) &&

2930

(ctxt->sax->processingInstruction != NULL))

2931

ctxt->sax->processingInstruction(ctxt->userData,

2932

target, NULL);

2933

ctxt->instate = state;

2934

return;

2935

}

2936

buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));

2937

if (buf == NULL) {

2938

htmlErrMemory(ctxt, NULL);

2939

ctxt->instate = state;

2940

return;

2941

}

2942

cur = CUR;

2943

if (!IS_BLANK(cur)) {

2944

htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,

2945

"ParsePI: PI %s space expected\n", target, NULL);

2946

}

2947

SKIP_BLANKS;

2948

cur = CUR_CHAR(l);

2949

while (IS_CHAR(cur) && (cur != '>')) {

2950

if (len + 5 >= size) {

2951

xmlChar *tmp;

2952

2953

size *= 2;

2954

tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));

2955

if (tmp == NULL) {

2956

htmlErrMemory(ctxt, NULL);

2957

xmlFree(buf);

2958

ctxt->instate = state;

2959

return;

2960

}

2961

buf = tmp;

2962

}

2963

count++;

2964

if (count > 50) {

2965

GROW;

2966

count = 0;

2967

}

2968

COPY_BUF(l,buf,len,cur);

2969

NEXTL(l);

2970

cur = CUR_CHAR(l);

2971

if (cur == 0) {

2972

SHRINK;

2973

GROW;

2974

cur = CUR_CHAR(l);

2975

}

2976

}

2977

buf[len] = 0;

2978

if (cur != '>') {

2979

htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,

2980

"ParsePI: PI %s never end ...\n", target, NULL);

2981

} else {

2982

SKIP(1);

2983

2984

2985

* SAX: PI detected.

2986

2987

if ((ctxt->sax) && (!ctxt->disableSAX) &&

2988

(ctxt->sax->processingInstruction != NULL))

2989

ctxt->sax->processingInstruction(ctxt->userData,

2990

target, buf);

2991

}

2992

xmlFree(buf);

2993

} else {

2994

htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,

2995

"PI is not started correctly", NULL, NULL);

2996

}

2997

ctxt->instate = state;

2998

}

2999

}

3000

3001

/**

3002

* htmlParseComment:

3003

* @ctxt: an HTML parser context

3004

3005

* Parse an XML (SGML) comment

3006

3007

* [15] Comment ::= ''

3008

3009

static void

3010

htmlParseComment(htmlParserCtxtPtr ctxt) {

3011

xmlChar *buf = NULL;

3012

int len;

3013

int size = HTML_PARSER_BUFFER_SIZE;

3014

int q, ql;

3015

int r, rl;

3016

int cur, l;

3017

xmlParserInputState state;

3018

3019

3020

* Check that there is a comment right here.

3021

3022

if ((RAW != '<') || (NXT(1) != '!') ||

3023

(NXT(2) != '-') || (NXT(3) != '-')) return;

3024

3025

state = ctxt->instate;

3026

ctxt->instate = XML_PARSER_COMMENT;

3027

SHRINK;

3028

SKIP(4);

3029

buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));

3030

if (buf == NULL) {

3031

htmlErrMemory(ctxt, "buffer allocation failed\n");

3032

ctxt->instate = state;

3033

return;

3034

}

3035

q = CUR_CHAR(ql);

3036

NEXTL(ql);

3037

r = CUR_CHAR(rl);

3038

NEXTL(rl);

3039

cur = CUR_CHAR(l);

3040

len = 0;

3041

while (IS_CHAR(cur) &&

3042

((cur != '>') ||

3043

(r != '-') || (q != '-'))) {

3044

if (len + 5 >= size) {

3045

xmlChar *tmp;

3046

3047

size *= 2;

3048

tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));

3049

if (tmp == NULL) {

3050

xmlFree(buf);

3051

htmlErrMemory(ctxt, "growing buffer failed\n");

3052

ctxt->instate = state;

3053

return;

3054

}

3055

buf = tmp;

3056

}

3057

COPY_BUF(ql,buf,len,q);

3058

q = r;

3059

ql = rl;

3060

r = cur;

3061

rl = l;

3062

NEXTL(l);

3063

cur = CUR_CHAR(l);

3064

if (cur == 0) {

3065

SHRINK;

3066

GROW;

3067

cur = CUR_CHAR(l);

3068

}

3069

}

3070

buf[len] = 0;

3071

if (!IS_CHAR(cur)) {

3072

htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,

3073

"Comment not terminated \n<!--%.50s\n", buf, NULL);

3074

xmlFree(buf);

3075

} else {

3076

NEXT;

3077

if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&

3078

(!ctxt->disableSAX))

3079

ctxt->sax->comment(ctxt->userData, buf);

3080

xmlFree(buf);

3081

}

3082

ctxt->instate = state;

3083

}

3084

3085

/**

3086

* htmlParseCharRef:

3087

* @ctxt: an HTML parser context

3088

3089

* parse Reference declarations

3090

3091

* [66] CharRef ::= '&#' [0-9]+ ';' |

3092

* '&#x' [0-9a-fA-F]+ ';'

3093

3094

* Returns the value parsed (as an int)

3095

3096

int

3097

htmlParseCharRef(htmlParserCtxtPtr ctxt) {

3098

int val = 0;

3099

3100

if ((ctxt == NULL) || (ctxt->input == NULL)) {

3101

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

3102

"htmlParseCharRef: context error\n",

3103

NULL, NULL);

3104

return(0);

3105

}

3106

if ((CUR == '&') && (NXT(1) == '#') &&

3107

((NXT(2) == 'x') || NXT(2) == 'X')) {

3108

SKIP(3);

3109

while (CUR != ';') {

3110

if ((CUR >= '0') && (CUR <= '9'))

3111

val = val * 16 + (CUR - '0');

3112

else if ((CUR >= 'a') && (CUR <= 'f'))

3113

val = val * 16 + (CUR - 'a') + 10;

3114

else if ((CUR >= 'A') && (CUR <= 'F'))

3115

val = val * 16 + (CUR - 'A') + 10;

3116

else {

3117

htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,

3118

"htmlParseCharRef: missing semicolumn\n",

3119

NULL, NULL);

3120

break;

3121

}

3122

NEXT;

3123

}

3124

if (CUR == ';')

3125

NEXT;

3126

} else if ((CUR == '&') && (NXT(1) == '#')) {

3127

SKIP(2);

3128

while (CUR != ';') {

3129

if ((CUR >= '0') && (CUR <= '9'))

3130

val = val * 10 + (CUR - '0');

3131

else {

3132

htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,

3133

"htmlParseCharRef: missing semicolumn\n",

3134

NULL, NULL);

3135

break;

3136

}

3137

NEXT;

3138

}

3139

if (CUR == ';')

3140

NEXT;

3141

} else {

3142

htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,

3143

"htmlParseCharRef: invalid value\n", NULL, NULL);

3144

}

3145

3146

* Check the value IS_CHAR ...

3147

3148

if (IS_CHAR(val)) {

3149

return(val);

3150

} else {

3151

htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,

3152

"htmlParseCharRef: invalid xmlChar value %d\n",

3153

val);

3154

}

3155

return(0);

3156

}

3157

3158

3159

/**

3160

* htmlParseDocTypeDecl:

3161

* @ctxt: an HTML parser context

3162

3163

* parse a DOCTYPE declaration

3164

3165

* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?

3166

* ('[' (markupdecl | PEReference | S)* ']' S?)? '>'

3167

3168

3169

static void

3170

htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {

3171

const xmlChar *name;

3172

xmlChar *ExternalID = NULL;

3173

xmlChar *URI = NULL;

3174

3175

3176

* We know that '<!DOCTYPE' has been detected.

3177

3178

SKIP(9);

3179

3180

SKIP_BLANKS;

3181

3182

3183

* Parse the DOCTYPE name.

3184

3185

name = htmlParseName(ctxt);

3186

if (name == NULL) {

3187

htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

3188

"htmlParseDocTypeDecl : no DOCTYPE name !\n",

3189

NULL, NULL);

3190

}

3191

3192

* Check that upper(name) == "HTML" !!!!!!!!!!!!!

3193

3194

3195

SKIP_BLANKS;

3196

3197

3198

* Check for SystemID and ExternalID

3199

3200

URI = htmlParseExternalID(ctxt, &ExternalID);

3201

SKIP_BLANKS;

3202

3203

3204

* We should be at the end of the DOCTYPE declaration.

3205

3206

if (CUR != '>') {

3207

htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,

3208

"DOCTYPE improperly terminated\n", NULL, NULL);

3209

/* We shouldn't try to resynchronize ... */

3210

}

3211

NEXT;

3212

3213

3214

* Create or update the document accordingly to the DOCTYPE

3215

3216

if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&

3217

(!ctxt->disableSAX))

3218

ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);

3219

3220

3221

* Cleanup, since we don't use all those identifiers

3222

3223

if (URI != NULL) xmlFree(URI);

3224

if (ExternalID != NULL) xmlFree(ExternalID);

3225

}

3226

3227

/**

3228

* htmlParseAttribute:

3229

* @ctxt: an HTML parser context

3230

* @value: a xmlChar ** used to store the value of the attribute

3231

3232

* parse an attribute

3233

3234

* [41] Attribute ::= Name Eq AttValue

3235

3236

* [25] Eq ::= S? '=' S?

3237

3238

* With namespace:

3239

3240

* [NS 11] Attribute ::= QName Eq AttValue

3241

3242

* Also the case QName == xmlns:??? is handled independently as a namespace

3243

* definition.

3244

3245

* Returns the attribute name, and the value in *value.

3246

3247

3248

static const xmlChar *

3249

htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {

3250

const xmlChar *name;

3251

xmlChar *val = NULL;

3252

3253

*value = NULL;

3254

name = htmlParseHTMLName(ctxt);

3255

if (name == NULL) {

3256

htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

3257

"error parsing attribute name\n", NULL, NULL);

3258

return(NULL);

3259

}

3260

3261

3262

* read the value

3263

3264

SKIP_BLANKS;

3265

if (CUR == '=') {

3266

NEXT;

3267

SKIP_BLANKS;

3268

val = htmlParseAttValue(ctxt);

3269

} else if (htmlIsBooleanAttr(name)) {

3270

3271

* assume a minimized attribute

3272

3273

val = xmlStrdup(name);

3274

}

3275

3276

*value = val;

3277

return(name);

3278

}

3279

3280

/**

3281

* htmlCheckEncoding:

3282

* @ctxt: an HTML parser context

3283

* @attvalue: the attribute value

3284

3285

* Checks an http-equiv attribute from a Meta tag to detect

3286

* the encoding

3287

* If a new encoding is detected the parser is switched to decode

3288

* it and pass UTF8

3289

3290

static void

3291

htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {

3292

const xmlChar *encoding;

3293

3294

if ((ctxt == NULL) || (attvalue == NULL))

3295

return;

3296

3297

/* do not change encoding */

3298

if (ctxt->input->encoding != NULL)

3299

return;

3300

3301

encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");

3302

if (encoding != NULL) {

3303

encoding += 8;

3304

} else {

3305

encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");

3306

if (encoding != NULL)

3307

encoding += 9;

3308

}

3309

if (encoding != NULL) {

3310

xmlCharEncoding enc;

3311

xmlCharEncodingHandlerPtr handler;

3312

3313

while ((*encoding == ' ') || (*encoding == '\t')) encoding++;

3314

3315

if (ctxt->input->encoding != NULL)

3316

xmlFree((xmlChar *) ctxt->input->encoding);

3317

ctxt->input->encoding = xmlStrdup(encoding);

3318

3319

enc = xmlParseCharEncoding((const char *) encoding);

3320

3321

* registered set of known encodings

3322

3323

if (enc != XML_CHAR_ENCODING_ERROR) {

3324

if (((enc == XML_CHAR_ENCODING_UTF16LE) ||

3325

(enc == XML_CHAR_ENCODING_UTF16BE) ||

3326

(enc == XML_CHAR_ENCODING_UCS4LE) ||

3327

(enc == XML_CHAR_ENCODING_UCS4BE)) &&

3328

(ctxt->input->buf != NULL) &&

3329

(ctxt->input->buf->encoder == NULL)) {

3330

htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,

3331

"htmlCheckEncoding: wrong encoding meta\n",

3332

NULL, NULL);

3333

} else {

3334

xmlSwitchEncoding(ctxt, enc);

3335

}

3336

ctxt->charset = XML_CHAR_ENCODING_UTF8;

3337

} else {

3338

3339

* fallback for unknown encodings

3340

3341

handler = xmlFindCharEncodingHandler((const char *) encoding);

3342

if (handler != NULL) {

3343

xmlSwitchToEncoding(ctxt, handler);

3344

ctxt->charset = XML_CHAR_ENCODING_UTF8;

3345

} else {

3346

ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;

3347

}

3348

}

3349

3350

if ((ctxt->input->buf != NULL) &&

3351

(ctxt->input->buf->encoder != NULL) &&

3352

(ctxt->input->buf->raw != NULL) &&

3353

(ctxt->input->buf->buffer != NULL)) {

3354

int nbchars;

3355

int processed;

3356

3357

3358

* convert as much as possible to the parser reading buffer.

3359

3360

processed = ctxt->input->cur - ctxt->input->base;

3361

xmlBufferShrink(ctxt->input->buf->buffer, processed);

3362

nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,

3363

ctxt->input->buf->buffer,

3364

ctxt->input->buf->raw);

3365

if (nbchars < 0) {

3366

htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,

3367

"htmlCheckEncoding: encoder error\n",

3368

NULL, NULL);

3369

}

3370

ctxt->input->base =

3371

ctxt->input->cur = ctxt->input->buf->buffer->content;

3372

}

3373

}

3374

}

3375

3376

/**

3377

* htmlCheckMeta:

3378

* @ctxt: an HTML parser context

3379

* @atts: the attributes values

3380

3381

* Checks an attributes from a Meta tag

3382

3383

static void

3384

htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {

3385

int i;

3386

const xmlChar *att, *value;

3387

int http = 0;

3388

const xmlChar *content = NULL;

3389

3390

if ((ctxt == NULL) || (atts == NULL))

3391

return;

3392

3393

i = 0;

3394

att = atts[i++];

3395

while (att != NULL) {

3396

value = atts[i++];

3397

if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))

3398

&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))

3399

http = 1;

3400

else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))

3401

content = value;

3402

att = atts[i++];

3403

}

3404

if ((http) && (content != NULL))

3405

htmlCheckEncoding(ctxt, content);

3406

3407

}

3408

3409

/**

3410

* htmlParseStartTag:

3411

* @ctxt: an HTML parser context

3412

3413

* parse a start of tag either for rule element or

3414

* EmptyElement. In both case we don't parse the tag closing chars.

3415

3416

* [40] STag ::= '<' Name (S Attribute)* S? '>'

3417

3418

* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'

3419

3420

* With namespace:

3421

3422

* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'

3423

3424

* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'

3425

3426

* Returns 0 in case of success, -1 in case of error and 1 if discarded

3427

3428

3429

static int

3430

htmlParseStartTag(htmlParserCtxtPtr ctxt) {

3431

const xmlChar *name;

3432

const xmlChar *attname;

3433

xmlChar *attvalue;

3434

const xmlChar **atts;

3435

int nbatts = 0;

3436

int maxatts;

3437

int meta = 0;

3438

int i;

3439

int discardtag = 0;

3440

3441

if ((ctxt == NULL) || (ctxt->input == NULL)) {

3442

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

3443

"htmlParseStartTag: context error\n", NULL, NULL);

3444

return -1;

3445

}

3446

if (CUR != '<') return -1;

3447

NEXT;

3448

3449

atts = ctxt->atts;

3450

maxatts = ctxt->maxatts;

3451

3452

GROW;

3453

name = htmlParseHTMLName(ctxt);

3454

if (name == NULL) {

3455

htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

3456

"htmlParseStartTag: invalid element name\n",

3457

NULL, NULL);

3458

/* Dump the bogus tag like browsers do */

3459

while ((IS_CHAR_CH(CUR)) && (CUR != '>'))

3460

NEXT;

3461

return -1;

3462

}

3463

if (xmlStrEqual(name, BAD_CAST"meta"))

3464

meta = 1;

3465

3466

3467

* Check for auto-closure of HTML elements.

3468

3469

htmlAutoClose(ctxt, name);

3470

3471

3472

* Check for implied HTML elements.

3473

3474

htmlCheckImplied(ctxt, name);

3475

3476

3477

* Avoid html at any level > 0, head at any level != 1

3478

* or any attempt to recurse body

3479

3480

if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {

3481

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

3482

"htmlParseStartTag: misplaced <html> tag\n",

3483

name, NULL);

3484

discardtag = 1;

3485

}

3486

if ((ctxt->nameNr != 1) &&

3487

(xmlStrEqual(name, BAD_CAST"head"))) {

3488

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

3489

"htmlParseStartTag: misplaced <head> tag\n",

3490

name, NULL);

3491

discardtag = 1;

3492

}

3493

if (xmlStrEqual(name, BAD_CAST"body")) {

3494

int indx;

3495

for (indx = 0;indx < ctxt->nameNr;indx++) {

3496

if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {

3497

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

3498

"htmlParseStartTag: misplaced <body> tag\n",

3499

name, NULL);

3500

discardtag = 1;

3501

}

3502

}

3503

}

3504

3505

3506

* Now parse the attributes, it ends up with the ending

3507

3508

* (S Attribute)* S?

3509

3510

SKIP_BLANKS;

3511

while ((IS_CHAR_CH(CUR)) &&

3512

(CUR != '>') &&

3513

((CUR != '/') || (NXT(1) != '>'))) {

3514

long cons = ctxt->nbChars;

3515

3516

GROW;

3517

attname = htmlParseAttribute(ctxt, &attvalue);

3518

if (attname != NULL) {

3519

3520

3521

* Well formedness requires at most one declaration of an attribute

3522

3523

for (i = 0; i < nbatts;i += 2) {

3524

if (xmlStrEqual(atts[i], attname)) {

3525

htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,

3526

"Attribute %s redefined\n", attname, NULL);

3527

if (attvalue != NULL)

3528

xmlFree(attvalue);

3529

goto failed;

3530

}

3531

}

3532

3533

3534

* Add the pair to atts

3535

3536

if (atts == NULL) {

3537

maxatts = 22; /* allow for 10 attrs by default */

3538

atts = (const xmlChar **)

3539

xmlMalloc(maxatts * sizeof(xmlChar *));

3540

if (atts == NULL) {

3541

htmlErrMemory(ctxt, NULL);

3542

if (attvalue != NULL)

3543

xmlFree(attvalue);

3544

goto failed;

3545

}

3546

ctxt->atts = atts;

3547

ctxt->maxatts = maxatts;

3548

} else if (nbatts + 4 > maxatts) {

3549

const xmlChar **n;

3550

3551

maxatts *= 2;

3552

n = (const xmlChar **) xmlRealloc((void *) atts,

3553

maxatts * sizeof(const xmlChar *));

3554

if (n == NULL) {

3555

htmlErrMemory(ctxt, NULL);

3556

if (attvalue != NULL)

3557

xmlFree(attvalue);

3558

goto failed;

3559

}

3560

atts = n;

3561

ctxt->atts = atts;

3562

ctxt->maxatts = maxatts;

3563

}

3564

atts[nbatts++] = attname;

3565

atts[nbatts++] = attvalue;

3566

atts[nbatts] = NULL;

3567

atts[nbatts + 1] = NULL;

3568

}

3569

else {

3570

if (attvalue != NULL)

3571

xmlFree(attvalue);

3572

/* Dump the bogus attribute string up to the next blank or

3573

* the end of the tag. */

3574

while ((IS_CHAR_CH(CUR)) &&

3575

!(IS_BLANK_CH(CUR)) && (CUR != '>') &&

3576

((CUR != '/') || (NXT(1) != '>')))

3577

NEXT;

3578

}

3579

3580

failed:

3581

SKIP_BLANKS;

3582

if (cons == ctxt->nbChars) {

3583

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

3584

"htmlParseStartTag: problem parsing attributes\n",

3585

NULL, NULL);

3586

break;

3587

}

3588

}

3589

3590

3591

* Handle specific association to the META tag

3592

3593

if (meta && (nbatts != 0))

3594

htmlCheckMeta(ctxt, atts);

3595

3596

3597

* SAX: Start of Element !

3598

3599

if (!discardtag) {

3600

htmlnamePush(ctxt, name);

3601

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {

3602

if (nbatts != 0)

3603

ctxt->sax->startElement(ctxt->userData, name, atts);

3604

else

3605

ctxt->sax->startElement(ctxt->userData, name, NULL);

3606

}

3607

}

3608

3609

if (atts != NULL) {

3610

for (i = 1;i < nbatts;i += 2) {

3611

if (atts[i] != NULL)

3612

xmlFree((xmlChar *) atts[i]);

3613

}

3614

}

3615

3616

return(discardtag);

3617

}

3618

3619

/**

3620

* htmlParseEndTag:

3621

* @ctxt: an HTML parser context

3622

3623

* parse an end of tag

3624

3625

* [42] ETag ::= '</' Name S? '>'

3626

3627

* With namespace

3628

3629

* [NS 9] ETag ::= '</' QName S? '>'

3630

3631

* Returns 1 if the current level should be closed.

3632

3633

3634

static int

3635

htmlParseEndTag(htmlParserCtxtPtr ctxt)

3636

{

3637

const xmlChar *name;

3638

const xmlChar *oldname;

3639

int i, ret;

3640

3641

if ((CUR != '<') || (NXT(1) != '/')) {

3642

htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,

3643

"htmlParseEndTag: '</' not found\n", NULL, NULL);

3644

return (0);

3645

}

3646

SKIP(2);

3647

3648

name = htmlParseHTMLName(ctxt);

3649

if (name == NULL)

3650

return (0);

3651

3652

3653

* We should definitely be at the ending "S? '>'" part

3654

3655

SKIP_BLANKS;

3656

if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {

3657

htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,

3658

"End tag : expected '>'\n", NULL, NULL);

3659

if (ctxt->recovery) {

3660

3661

* We're not at the ending > !!

3662

* Error, unless in recover mode where we search forwards

3663

* until we find a >

3664

3665

while (CUR != '\0' && CUR != '>') NEXT;

3666

NEXT;

3667

}

3668

} else

3669

NEXT;

3670

3671

3672

* If the name read is not one of the element in the parsing stack

3673

* then return, it's just an error.

3674

3675

for (i = (ctxt->nameNr - 1); i >= 0; i--) {

3676

if (xmlStrEqual(name, ctxt->nameTab[i]))

3677

break;

3678

}

3679

if (i < 0) {

3680

htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,

3681

"Unexpected end tag : %s\n", name, NULL);

3682

return (0);

3683

}

3684

3685

3686

3687

* Check for auto-closure of HTML elements.

3688

3689

3690

htmlAutoCloseOnClose(ctxt, name);

3691

3692

3693

* Well formedness constraints, opening and closing must match.

3694

* With the exception that the autoclose may have popped stuff out

3695

* of the stack.

3696

3697

if (!xmlStrEqual(name, ctxt->name)) {

3698

if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {

3699

htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,

3700

"Opening and ending tag mismatch: %s and %s\n",

3701

name, ctxt->name);

3702

}

3703

}

3704

3705

3706

* SAX: End of Tag

3707

3708

oldname = ctxt->name;

3709

if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {

3710

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

3711

ctxt->sax->endElement(ctxt->userData, name);

3712

htmlnamePop(ctxt);

3713

ret = 1;

3714

} else {

3715

ret = 0;

3716

}

3717

3718

return (ret);

3719

}

3720

3721

3722

/**

3723

* htmlParseReference:

3724

* @ctxt: an HTML parser context

3725

3726

* parse and handle entity references in content,

3727

* this will end-up in a call to character() since this is either a

3728

* CharRef, or a predefined entity.

3729

3730

static void

3731

htmlParseReference(htmlParserCtxtPtr ctxt) {

3732

const htmlEntityDesc * ent;

3733

xmlChar out[6];

3734

const xmlChar *name;

3735

if (CUR != '&') return;

3736

3737

if (NXT(1) == '#') {

3738

unsigned int c;

3739

int bits, i = 0;

3740

3741

c = htmlParseCharRef(ctxt);

3742

if (c == 0)

3743

return;

3744

3745

if (c < 0x80) { out[i++]= c; bits= -6; }

3746

else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }

3747

else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }

3748

else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }

3749

3750

for ( ; bits >= 0; bits-= 6) {

3751

out[i++]= ((c >> bits) & 0x3F) | 0x80;

3752

}

3753

out[i] = 0;

3754

3755

htmlCheckParagraph(ctxt);

3756

if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))

3757

ctxt->sax->characters(ctxt->userData, out, i);

3758

} else {

3759

ent = htmlParseEntityRef(ctxt, &name);

3760

if (name == NULL) {

3761

htmlCheckParagraph(ctxt);

3762

if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))

3763

ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);

3764

return;

3765

}

3766

if ((ent == NULL) || !(ent->value > 0)) {

3767

htmlCheckParagraph(ctxt);

3768

if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {

3769

ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);

3770

ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));

3771

/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */

3772

}

3773

} else {

3774

unsigned int c;

3775

int bits, i = 0;

3776

3777

c = ent->value;

3778

if (c < 0x80)

3779

{ out[i++]= c; bits= -6; }

3780

else if (c < 0x800)

3781

{ out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }

3782

else if (c < 0x10000)

3783

{ out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }

3784

else

3785

{ out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }

3786

3787

for ( ; bits >= 0; bits-= 6) {

3788

out[i++]= ((c >> bits) & 0x3F) | 0x80;

3789

}

3790

out[i] = 0;

3791

3792

htmlCheckParagraph(ctxt);

3793

if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))

3794

ctxt->sax->characters(ctxt->userData, out, i);

3795

}

3796

}

3797

}

3798

3799

/**

3800

* htmlParseContent:

3801

* @ctxt: an HTML parser context

3802

3803

* Parse a content: comment, sub-element, reference or text.

3804

3805

3806

static void

3807

htmlParseContent(htmlParserCtxtPtr ctxt) {

3808

xmlChar *currentNode;

3809

int depth;

3810

const xmlChar *name;

3811

3812

currentNode = xmlStrdup(ctxt->name);

3813

depth = ctxt->nameNr;

3814

while (1) {

3815

long cons = ctxt->nbChars;

3816

3817

GROW;

3818

3819

* Our tag or one of it's parent or children is ending.

3820

3821

if ((CUR == '<') && (NXT(1) == '/')) {

3822

if (htmlParseEndTag(ctxt) &&

3823

((currentNode != NULL) || (ctxt->nameNr == 0))) {

3824

if (currentNode != NULL)

3825

xmlFree(currentNode);

3826

return;

3827

}

3828

continue; /* while */

3829

}

3830

3831

else if ((CUR == '<') &&

3832

((IS_ASCII_LETTER(NXT(1))) ||

3833

(NXT(1) == '_') || (NXT(1) == ':'))) {

3834

name = htmlParseHTMLName_nonInvasive(ctxt);

3835

if (name == NULL) {

3836

htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

3837

"htmlParseStartTag: invalid element name\n",

3838

NULL, NULL);

3839

/* Dump the bogus tag like browsers do */

3840

while ((IS_CHAR_CH(CUR)) && (CUR != '>'))

3841

NEXT;

3842

3843

if (currentNode != NULL)

3844

xmlFree(currentNode);

3845

return;

3846

}

3847

3848

if (ctxt->name != NULL) {

3849

if (htmlCheckAutoClose(name, ctxt->name) == 1) {

3850

htmlAutoClose(ctxt, name);

3851

continue;

3852

}

3853

}

3854

}

3855

3856

3857

* Has this node been popped out during parsing of

3858

* the next element

3859

3860

if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&

3861

(!xmlStrEqual(currentNode, ctxt->name)))

3862

{

3863

if (currentNode != NULL) xmlFree(currentNode);

3864

return;

3865

}

3866

3867

if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||

3868

(xmlStrEqual(currentNode, BAD_CAST"style")))) {

3869

3870

* Handle SCRIPT/STYLE separately

3871

3872

htmlParseScript(ctxt);

3873

} else {

3874

3875

* Sometimes DOCTYPE arrives in the middle of the document

3876

3877

if ((CUR == '<') && (NXT(1) == '!') &&

3878

(UPP(2) == 'D') && (UPP(3) == 'O') &&

3879

(UPP(4) == 'C') && (UPP(5) == 'T') &&

3880

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

3881

(UPP(8) == 'E')) {

3882

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

3883

"Misplaced DOCTYPE declaration\n",

3884

BAD_CAST "DOCTYPE" , NULL);

3885

htmlParseDocTypeDecl(ctxt);

3886

}

3887

3888

3889

* First case : a comment

3890

3891

if ((CUR == '<') && (NXT(1) == '!') &&

3892

(NXT(2) == '-') && (NXT(3) == '-')) {

3893

htmlParseComment(ctxt);

3894

}

3895

3896

3897

* Second case : a Processing Instruction.

3898

3899

else if ((CUR == '<') && (NXT(1) == '?')) {

3900

htmlParsePI(ctxt);

3901

}

3902

3903

3904

* Third case : a sub-element.

3905

3906

else if (CUR == '<') {

3907

htmlParseElement(ctxt);

3908

}

3909

3910

3911

* Fourth case : a reference. If if has not been resolved,

3912

* parsing returns it's Name, create the node

3913

3914

else if (CUR == '&') {

3915

htmlParseReference(ctxt);

3916

}

3917

3918

3919

* Fifth case : end of the resource

3920

3921

else if (CUR == 0) {

3922

htmlAutoCloseOnEnd(ctxt);

3923

break;

3924

}

3925

3926

3927

* Last case, text. Note that References are handled directly.

3928

3929

else {

3930

htmlParseCharData(ctxt);

3931

}

3932

3933

if (cons == ctxt->nbChars) {

3934

if (ctxt->node != NULL) {

3935

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

3936

"detected an error in element content\n",

3937

NULL, NULL);

3938

}

3939

break;

3940

}

3941

}

3942

GROW;

3943

}

3944

if (currentNode != NULL) xmlFree(currentNode);

3945

}

3946

3947

/**

3948

* htmlParseContent:

3949

* @ctxt: an HTML parser context

3950

3951

* Parse a content: comment, sub-element, reference or text.

3952

3953

3954

void

3955

__htmlParseContent(void *ctxt) {

3956

if (ctxt != NULL)

3957

htmlParseContent((htmlParserCtxtPtr) ctxt);

3958

}

3959

3960

/**

3961

* htmlParseElement:

3962

* @ctxt: an HTML parser context

3963

3964

* parse an HTML element, this is highly recursive

3965

3966

* [39] element ::= EmptyElemTag | STag content ETag

3967

3968

* [41] Attribute ::= Name Eq AttValue

3969

3970

3971

void

3972

htmlParseElement(htmlParserCtxtPtr ctxt) {

3973

const xmlChar *name;

3974

xmlChar *currentNode = NULL;

3975

const htmlElemDesc * info;

3976

htmlParserNodeInfo node_info;

3977

int failed;

3978

int depth;

3979

const xmlChar *oldptr;

3980

3981

if ((ctxt == NULL) || (ctxt->input == NULL)) {

3982

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

3983

"htmlParseElement: context error\n", NULL, NULL);

3984

return;

3985

}

3986

/* Capture start position */

3987

if (ctxt->record_info) {

3988

node_info.begin_pos = ctxt->input->consumed +

3989

(CUR_PTR - ctxt->input->base);

3990

node_info.begin_line = ctxt->input->line;

3991

}

3992

3993

failed = htmlParseStartTag(ctxt);

3994

name = ctxt->name;

3995

if ((failed == -1) || (name == NULL)) {

3996

if (CUR == '>')

3997

NEXT;

3998

return;

3999

}

4000

4001

4002

* Lookup the info for that element.

4003

4004

info = htmlTagLookup(name);

4005

if (info == NULL) {

4006

htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,

4007

"Tag %s invalid\n", name, NULL);

4008

}

4009

4010

4011

* Check for an Empty Element labeled the XML/SGML way

4012

4013

if ((CUR == '/') && (NXT(1) == '>')) {

4014

SKIP(2);

4015

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

4016

ctxt->sax->endElement(ctxt->userData, name);

4017

htmlnamePop(ctxt);

4018

return;

4019

}

4020

4021

if (CUR == '>') {

4022

NEXT;

4023

} else {

4024

htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,

4025

"Couldn't find end of Start Tag %s\n", name, NULL);

4026

4027

4028

* end of parsing of this node.

4029

4030

if (xmlStrEqual(name, ctxt->name)) {

4031

nodePop(ctxt);

4032

htmlnamePop(ctxt);

4033

}

4034

4035

4036

* Capture end position and add node

4037

4038

if (ctxt->record_info) {

4039

node_info.end_pos = ctxt->input->consumed +

4040

(CUR_PTR - ctxt->input->base);

4041

node_info.end_line = ctxt->input->line;

4042

node_info.node = ctxt->node;

4043

xmlParserAddNodeInfo(ctxt, &node_info);

4044

}

4045

return;

4046

}

4047

4048

4049

* Check for an Empty Element from DTD definition

4050

4051

if ((info != NULL) && (info->empty)) {

4052

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

4053

ctxt->sax->endElement(ctxt->userData, name);

4054

htmlnamePop(ctxt);

4055

return;

4056

}

4057

4058

4059

* Parse the content of the element:

4060

4061

currentNode = xmlStrdup(ctxt->name);

4062

depth = ctxt->nameNr;

4063

while (IS_CHAR_CH(CUR)) {

4064

oldptr = ctxt->input->cur;

4065

htmlParseContent(ctxt);

4066

if (oldptr==ctxt->input->cur) break;

4067

if (ctxt->nameNr < depth) break;

4068

}

4069

4070

4071

* Capture end position and add node

4072

4073

if ( currentNode != NULL && ctxt->record_info ) {

4074

node_info.end_pos = ctxt->input->consumed +

4075

(CUR_PTR - ctxt->input->base);

4076

node_info.end_line = ctxt->input->line;

4077

node_info.node = ctxt->node;

4078

xmlParserAddNodeInfo(ctxt, &node_info);

4079

}

4080

if (!IS_CHAR_CH(CUR)) {

4081

htmlAutoCloseOnEnd(ctxt);

4082

}

4083

4084

if (currentNode != NULL)

4085

xmlFree(currentNode);

4086

}

4087

4088

/**

4089

* htmlParseDocument:

4090

* @ctxt: an HTML parser context

4091

4092

* parse an HTML document (and build a tree if using the standard SAX

4093

* interface).

4094

4095

* Returns 0, -1 in case of error. the parser context is augmented

4096

* as a result of the parsing.

4097

4098

4099

int

4100

htmlParseDocument(htmlParserCtxtPtr ctxt) {

4101

xmlDtdPtr dtd;

4102

4103

xmlInitParser();

4104

4105

htmlDefaultSAXHandlerInit();

4106

4107

if ((ctxt == NULL) || (ctxt->input == NULL)) {

4108

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

4109

"htmlParseDocument: context error\n", NULL, NULL);

4110

return(XML_ERR_INTERNAL_ERROR);

4111

}

4112

ctxt->html = 1;

4113

GROW;

4114

4115

* SAX: beginning of the document processing.

4116

4117

if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))

4118

ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);

4119

4120

4121

* Wipe out everything which is before the first '<'

4122

4123

SKIP_BLANKS;

4124

if (CUR == 0) {

4125

htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,

4126

"Document is empty\n", NULL, NULL);

4127

}

4128

4129

if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))

4130

ctxt->sax->startDocument(ctxt->userData);

4131

4132

4133

4134

* Parse possible comments and PIs before any content

4135

4136

while (((CUR == '<') && (NXT(1) == '!') &&

4137

(NXT(2) == '-') && (NXT(3) == '-')) ||

4138

((CUR == '<') && (NXT(1) == '?'))) {

4139

htmlParseComment(ctxt);

4140

htmlParsePI(ctxt);

4141

SKIP_BLANKS;

4142

}

4143

4144

4145

4146

* Then possibly doc type declaration(s) and more Misc

4147

* (doctypedecl Misc*)?

4148

4149

if ((CUR == '<') && (NXT(1) == '!') &&

4150

(UPP(2) == 'D') && (UPP(3) == 'O') &&

4151

(UPP(4) == 'C') && (UPP(5) == 'T') &&

4152

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

4153

(UPP(8) == 'E')) {

4154

htmlParseDocTypeDecl(ctxt);

4155

}

4156

SKIP_BLANKS;

4157

4158

4159

* Parse possible comments and PIs before any content

4160

4161

while (((CUR == '<') && (NXT(1) == '!') &&

4162

(NXT(2) == '-') && (NXT(3) == '-')) ||

4163

((CUR == '<') && (NXT(1) == '?'))) {

4164

htmlParseComment(ctxt);

4165

htmlParsePI(ctxt);

4166

SKIP_BLANKS;

4167

}

4168

4169

4170

* Time to start parsing the tree itself

4171

4172

htmlParseContent(ctxt);

4173

4174

4175

* autoclose

4176

4177

if (CUR == 0)

4178

htmlAutoCloseOnEnd(ctxt);

4179

4180

4181

4182

* SAX: end of the document processing.

4183

4184

if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))

4185

ctxt->sax->endDocument(ctxt->userData);

4186

4187

if (ctxt->myDoc != NULL) {

4188

dtd = xmlGetIntSubset(ctxt->myDoc);

4189

if (dtd == NULL)

4190

ctxt->myDoc->intSubset =

4191

xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",

4192

BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",

4193

BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");

4194

}

4195

if (! ctxt->wellFormed) return(-1);

4196

return(0);

4197

}

4198

4199

4200

/************************************************************************

4201

* *

4202

* Parser contexts handling *

4203

* *

4204

************************************************************************/

4205

4206

/**

4207

* htmlInitParserCtxt:

4208

* @ctxt: an HTML parser context

4209

4210

* Initialize a parser context

4211

4212

* Returns 0 in case of success and -1 in case of error

4213

4214

4215

static int

4216

htmlInitParserCtxt(htmlParserCtxtPtr ctxt)

4217

{

4218

htmlSAXHandler *sax;

4219

4220

if (ctxt == NULL) return(-1);

4221

memset(ctxt, 0, sizeof(htmlParserCtxt));

4222

4223

ctxt->dict = xmlDictCreate();

4224

if (ctxt->dict == NULL) {

4225

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

4226

return(-1);

4227

}

4228

sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));

4229

if (sax == NULL) {

4230

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

4231

return(-1);

4232

}

4233

else

4234

memset(sax, 0, sizeof(htmlSAXHandler));

4235

4236

/* Allocate the Input stack */

4237

ctxt->inputTab = (htmlParserInputPtr *)

4238

xmlMalloc(5 * sizeof(htmlParserInputPtr));

4239

if (ctxt->inputTab == NULL) {

4240

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

4241

ctxt->inputNr = 0;

4242

ctxt->inputMax = 0;

4243

ctxt->input = NULL;

4244

return(-1);

4245

}

4246

ctxt->inputNr = 0;

4247

ctxt->inputMax = 5;

4248

ctxt->input = NULL;

4249

ctxt->version = NULL;

4250

ctxt->encoding = NULL;

4251

ctxt->standalone = -1;

4252

ctxt->instate = XML_PARSER_START;

4253

4254

/* Allocate the Node stack */

4255

ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));

4256

if (ctxt->nodeTab == NULL) {

4257

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

4258

ctxt->nodeNr = 0;

4259

ctxt->nodeMax = 0;

4260

ctxt->node = NULL;

4261

ctxt->inputNr = 0;

4262

ctxt->inputMax = 0;

4263

ctxt->input = NULL;

4264

return(-1);

4265

}

4266

ctxt->nodeNr = 0;

4267

ctxt->nodeMax = 10;

4268

ctxt->node = NULL;

4269

4270

/* Allocate the Name stack */

4271

ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));

4272

if (ctxt->nameTab == NULL) {

4273

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

4274

ctxt->nameNr = 0;

4275

ctxt->nameMax = 10;

4276

ctxt->name = NULL;

4277

ctxt->nodeNr = 0;

4278

ctxt->nodeMax = 0;

4279

ctxt->node = NULL;

4280

ctxt->inputNr = 0;

4281

ctxt->inputMax = 0;

4282

ctxt->input = NULL;

4283

return(-1);

4284

}

4285

ctxt->nameNr = 0;

4286

ctxt->nameMax = 10;

4287

ctxt->name = NULL;

4288

4289

if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;

4290

else {

4291

ctxt->sax = sax;

4292

memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));

4293

}

4294

ctxt->userData = ctxt;

4295

ctxt->myDoc = NULL;

4296

ctxt->wellFormed = 1;

4297

ctxt->replaceEntities = 0;

4298

ctxt->linenumbers = xmlLineNumbersDefaultValue;

4299

ctxt->html = 1;

4300

ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;

4301

ctxt->vctxt.userData = ctxt;

4302

ctxt->vctxt.error = xmlParserValidityError;

4303

ctxt->vctxt.warning = xmlParserValidityWarning;

4304

ctxt->record_info = 0;

4305

ctxt->validate = 0;

4306

ctxt->nbChars = 0;

4307

ctxt->checkIndex = 0;

4308

ctxt->catalogs = NULL;

4309

xmlInitNodeInfoSeq(&ctxt->node_seq);

4310

return(0);

4311

}

4312

4313

/**

4314

* htmlFreeParserCtxt:

4315

* @ctxt: an HTML parser context

4316

4317

* Free all the memory used by a parser context. However the parsed

4318

* document in ctxt->myDoc is not freed.

4319

4320

4321

void

4322

htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)

4323

{

4324

xmlFreeParserCtxt(ctxt);

4325

}

4326

4327

/**

4328

* htmlNewParserCtxt:

4329

4330

* Allocate and initialize a new parser context.

4331

4332

* Returns the htmlParserCtxtPtr or NULL in case of allocation error

4333

4334

4335

htmlParserCtxtPtr

4336

htmlNewParserCtxt(void)

4337

{

4338

xmlParserCtxtPtr ctxt;

4339

4340

ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));

4341

if (ctxt == NULL) {

4342

htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");

4343

return(NULL);

4344

}

4345

memset(ctxt, 0, sizeof(xmlParserCtxt));

4346

if (htmlInitParserCtxt(ctxt) < 0) {

4347

htmlFreeParserCtxt(ctxt);

4348

return(NULL);

4349

}

4350

return(ctxt);

4351

}

4352

4353

/**

4354

* htmlCreateMemoryParserCtxt:

4355

* @buffer: a pointer to a char array

4356

* @size: the size of the array

4357

4358

* Create a parser context for an HTML in-memory document.

4359

4360

* Returns the new parser context or NULL

4361

4362

htmlParserCtxtPtr

4363

htmlCreateMemoryParserCtxt(const char *buffer, int size) {

4364

xmlParserCtxtPtr ctxt;

4365

xmlParserInputPtr input;

4366

xmlParserInputBufferPtr buf;

4367

4368

if (buffer == NULL)

4369

return(NULL);

4370

if (size <= 0)

4371

return(NULL);

4372

4373

ctxt = htmlNewParserCtxt();

4374

if (ctxt == NULL)

4375

return(NULL);

4376

4377

buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);

4378

if (buf == NULL) return(NULL);

4379

4380

input = xmlNewInputStream(ctxt);

4381

if (input == NULL) {

4382

xmlFreeParserCtxt(ctxt);

4383

return(NULL);

4384

}

4385

4386

input->filename = NULL;

4387

input->buf = buf;

4388

input->base = input->buf->buffer->content;

4389

input->cur = input->buf->buffer->content;

4390

input->end = &input->buf->buffer->content[input->buf->buffer->use];

4391

4392

inputPush(ctxt, input);

4393

return(ctxt);

4394

}

4395

4396

/**

4397

* htmlCreateDocParserCtxt:

4398

* @cur: a pointer to an array of xmlChar

4399

* @encoding: a free form C string describing the HTML document encoding, or NULL

4400

4401

* Create a parser context for an HTML document.

4402

4403

* TODO: check the need to add encoding handling there

4404

4405

* Returns the new parser context or NULL

4406

4407

static htmlParserCtxtPtr

4408

htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {

4409

int len;

4410

htmlParserCtxtPtr ctxt;

4411

4412

if (cur == NULL)

4413

return(NULL);

4414

len = xmlStrlen(cur);

4415

ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);

4416

if (ctxt == NULL)

4417

return(NULL);

4418

4419

if (encoding != NULL) {

4420

xmlCharEncoding enc;

4421

xmlCharEncodingHandlerPtr handler;

4422

4423

if (ctxt->input->encoding != NULL)

4424

xmlFree((xmlChar *) ctxt->input->encoding);

4425

ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);

4426

4427

enc = xmlParseCharEncoding(encoding);

4428

4429

* registered set of known encodings

4430

4431

if (enc != XML_CHAR_ENCODING_ERROR) {

4432

xmlSwitchEncoding(ctxt, enc);

4433

if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {

4434

htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,

4435

"Unsupported encoding %s\n",

4436

(const xmlChar *) encoding, NULL);

4437

}

4438

} else {

4439

4440

* fallback for unknown encodings

4441

4442

handler = xmlFindCharEncodingHandler((const char *) encoding);

4443

if (handler != NULL) {

4444

xmlSwitchToEncoding(ctxt, handler);

4445

} else {

4446

htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,

4447

"Unsupported encoding %s\n",

4448

(const xmlChar *) encoding, NULL);

4449

}

4450

}

4451

}

4452

return(ctxt);

4453

}

4454

4455

#ifdef LIBXML_PUSH_ENABLED

4456

/************************************************************************

4457

* *

4458

* Progressive parsing interfaces *

4459

* *

4460

************************************************************************/

4461

4462

/**

4463

* htmlParseLookupSequence:

4464

* @ctxt: an HTML parser context

4465

* @first: the first char to lookup

4466

* @next: the next char to lookup or zero

4467

* @third: the next char to lookup or zero

4468

* @comment: flag to force checking inside comments

4469

4470

* Try to find if a sequence (first, next, third) or just (first next) or

4471

* (first) is available in the input stream.

4472

* This function has a side effect of (possibly) incrementing ctxt->checkIndex

4473

* to avoid rescanning sequences of bytes, it DOES change the state of the

4474

* parser, do not use liberally.

4475

* This is basically similar to xmlParseLookupSequence()

4476

4477

* Returns the index to the current parsing point if the full sequence

4478

* is available, -1 otherwise.

4479

4480

static int

4481

htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,

4482

xmlChar next, xmlChar third, int iscomment) {

4483

int base, len;

4484

htmlParserInputPtr in;

4485

const xmlChar *buf;

4486

int incomment = 0;

4487

4488

in = ctxt->input;

4489

if (in == NULL) return(-1);

4490

base = in->cur - in->base;

4491

if (base < 0) return(-1);

4492

if (ctxt->checkIndex > base)

4493

base = ctxt->checkIndex;

4494

if (in->buf == NULL) {

4495

buf = in->base;

4496

len = in->length;

4497

} else {

4498

buf = in->buf->buffer->content;

4499

len = in->buf->buffer->use;

4500

}

4501

/* take into account the sequence length */

4502

if (third) len -= 2;

4503

else if (next) len --;

4504

for (;base < len;base++) {

4505

if (!incomment && (base + 4 < len) && !iscomment) {

4506

if ((buf[base] == '<') && (buf[base + 1] == '!') &&

4507

(buf[base + 2] == '-') && (buf[base + 3] == '-')) {

4508

incomment = 1;

4509

/* do not increment past <! - some people use <!--> */

4510

base += 2;

4511

}

4512

}

4513

if (incomment) {

4514

if (base + 3 > len)

4515

return(-1);

4516

if ((buf[base] == '-') && (buf[base + 1] == '-') &&

4517

(buf[base + 2] == '>')) {

4518

incomment = 0;

4519

base += 2;

4520

}

4521

continue;

4522

}

4523

if (buf[base] == first) {

4524

if (third != 0) {

4525

if ((buf[base + 1] != next) ||

4526

(buf[base + 2] != third)) continue;

4527

} else if (next != 0) {

4528

if (buf[base + 1] != next) continue;

4529

}

4530

ctxt->checkIndex = 0;

4531

#ifdef DEBUG_PUSH

4532

if (next == 0)

4533

xmlGenericError(xmlGenericErrorContext,

4534

"HPP: lookup '%c' found at %d\n",

4535

first, base);

4536

else if (third == 0)

4537

xmlGenericError(xmlGenericErrorContext,

4538

"HPP: lookup '%c%c' found at %d\n",

4539

first, next, base);

4540

else

4541

xmlGenericError(xmlGenericErrorContext,

4542

"HPP: lookup '%c%c%c' found at %d\n",

4543

first, next, third, base);

4544

#endif

4545

return(base - (in->cur - in->base));

4546

}

4547

}

4548

ctxt->checkIndex = base;

4549

#ifdef DEBUG_PUSH

4550

if (next == 0)

4551

xmlGenericError(xmlGenericErrorContext,

4552

"HPP: lookup '%c' failed\n", first);

4553

else if (third == 0)

4554

xmlGenericError(xmlGenericErrorContext,

4555

"HPP: lookup '%c%c' failed\n", first, next);

4556

else

4557

xmlGenericError(xmlGenericErrorContext,

4558

"HPP: lookup '%c%c%c' failed\n", first, next, third);

4559

#endif

4560

return(-1);

4561

}

4562

4563

/**

4564

* htmlParseTryOrFinish:

4565

* @ctxt: an HTML parser context

4566

* @terminate: last chunk indicator

4567

4568

* Try to progress on parsing

4569

4570

* Returns zero if no parsing was possible

4571

4572

static int

4573

htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

4574

int ret = 0;

4575

htmlParserInputPtr in;

4576

int avail = 0;

4577

xmlChar cur, next;

4578

4579

#ifdef DEBUG_PUSH

4580

switch (ctxt->instate) {

4581

case XML_PARSER_EOF:

4582

xmlGenericError(xmlGenericErrorContext,

4583

"HPP: try EOF\n"); break;

4584

case XML_PARSER_START:

4585

xmlGenericError(xmlGenericErrorContext,

4586

"HPP: try START\n"); break;

4587

case XML_PARSER_MISC:

4588

xmlGenericError(xmlGenericErrorContext,

4589

"HPP: try MISC\n");break;

4590

case XML_PARSER_COMMENT:

4591

xmlGenericError(xmlGenericErrorContext,

4592

"HPP: try COMMENT\n");break;

4593

case XML_PARSER_PROLOG:

4594

xmlGenericError(xmlGenericErrorContext,

4595

"HPP: try PROLOG\n");break;

4596

case XML_PARSER_START_TAG:

4597

xmlGenericError(xmlGenericErrorContext,

4598

"HPP: try START_TAG\n");break;

4599

case XML_PARSER_CONTENT:

4600

xmlGenericError(xmlGenericErrorContext,

4601

"HPP: try CONTENT\n");break;

4602

case XML_PARSER_CDATA_SECTION:

4603

xmlGenericError(xmlGenericErrorContext,

4604

"HPP: try CDATA_SECTION\n");break;

4605

case XML_PARSER_END_TAG:

4606

xmlGenericError(xmlGenericErrorContext,

4607

"HPP: try END_TAG\n");break;

4608

case XML_PARSER_ENTITY_DECL:

4609

xmlGenericError(xmlGenericErrorContext,

4610

"HPP: try ENTITY_DECL\n");break;

4611

case XML_PARSER_ENTITY_VALUE:

4612

xmlGenericError(xmlGenericErrorContext,

4613

"HPP: try ENTITY_VALUE\n");break;

4614

case XML_PARSER_ATTRIBUTE_VALUE:

4615

xmlGenericError(xmlGenericErrorContext,

4616

"HPP: try ATTRIBUTE_VALUE\n");break;

4617

case XML_PARSER_DTD:

4618

xmlGenericError(xmlGenericErrorContext,

4619

"HPP: try DTD\n");break;

4620

case XML_PARSER_EPILOG:

4621

xmlGenericError(xmlGenericErrorContext,

4622

"HPP: try EPILOG\n");break;

4623

case XML_PARSER_PI:

4624

xmlGenericError(xmlGenericErrorContext,

4625

"HPP: try PI\n");break;

4626

case XML_PARSER_SYSTEM_LITERAL:

4627

xmlGenericError(xmlGenericErrorContext,

4628

"HPP: try SYSTEM_LITERAL\n");break;

4629

}

4630

#endif

4631

4632

while (1) {

4633

4634

in = ctxt->input;

4635

if (in == NULL) break;

4636

if (in->buf == NULL)

4637

avail = in->length - (in->cur - in->base);

4638

else

4639

avail = in->buf->buffer->use - (in->cur - in->base);

4640

if ((avail == 0) && (terminate)) {

4641

htmlAutoCloseOnEnd(ctxt);

4642

if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {

4643

4644

* SAX: end of the document processing.

4645

4646

ctxt->instate = XML_PARSER_EOF;

4647

if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))

4648

ctxt->sax->endDocument(ctxt->userData);

4649

}

4650

}

4651

if (avail < 1)

4652

goto done;

4653

cur = in->cur[0];

4654

if (cur == 0) {

4655

SKIP(1);

4656

continue;

4657

}

4658

4659

switch (ctxt->instate) {

4660

case XML_PARSER_EOF:

4661

4662

* Document parsing is done !

4663

4664

goto done;

4665

case XML_PARSER_START:

4666

4667

* Very first chars read from the document flow.

4668

4669

cur = in->cur[0];

4670

if (IS_BLANK_CH(cur)) {

4671

SKIP_BLANKS;

4672

if (in->buf == NULL)

4673

avail = in->length - (in->cur - in->base);

4674

else

4675

avail = in->buf->buffer->use - (in->cur - in->base);

4676

}

4677

if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))

4678

ctxt->sax->setDocumentLocator(ctxt->userData,

4679

&xmlDefaultSAXLocator);

4680

if ((ctxt->sax) && (ctxt->sax->startDocument) &&

4681

(!ctxt->disableSAX))

4682

ctxt->sax->startDocument(ctxt->userData);

4683

4684

cur = in->cur[0];

4685

next = in->cur[1];

4686

if ((cur == '<') && (next == '!') &&

4687

(UPP(2) == 'D') && (UPP(3) == 'O') &&

4688

(UPP(4) == 'C') && (UPP(5) == 'T') &&

4689

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

4690

(UPP(8) == 'E')) {

4691

if ((!terminate) &&

4692

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

4693

goto done;

4694

#ifdef DEBUG_PUSH

4695

xmlGenericError(xmlGenericErrorContext,

4696

"HPP: Parsing internal subset\n");

4697

#endif

4698

htmlParseDocTypeDecl(ctxt);

4699

ctxt->instate = XML_PARSER_PROLOG;

4700

#ifdef DEBUG_PUSH

4701

xmlGenericError(xmlGenericErrorContext,

4702

"HPP: entering PROLOG\n");

4703

#endif

4704

} else {

4705

ctxt->instate = XML_PARSER_MISC;

4706

#ifdef DEBUG_PUSH

4707

xmlGenericError(xmlGenericErrorContext,

4708

"HPP: entering MISC\n");

4709

#endif

4710

}

4711

break;

4712

case XML_PARSER_MISC:

4713

SKIP_BLANKS;

4714

if (in->buf == NULL)

4715

avail = in->length - (in->cur - in->base);

4716

else

4717

avail = in->buf->buffer->use - (in->cur - in->base);

4718

if (avail < 2)

4719

goto done;

4720

cur = in->cur[0];

4721

next = in->cur[1];

4722

if ((cur == '<') && (next == '!') &&

4723

(in->cur[2] == '-') && (in->cur[3] == '-')) {

4724

if ((!terminate) &&

4725

(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))

4726

goto done;

4727

#ifdef DEBUG_PUSH

4728

xmlGenericError(xmlGenericErrorContext,

4729

"HPP: Parsing Comment\n");

4730

#endif

4731

htmlParseComment(ctxt);

4732

ctxt->instate = XML_PARSER_MISC;

4733

} else if ((cur == '<') && (next == '?')) {

4734

if ((!terminate) &&

4735

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

4736

goto done;

4737

#ifdef DEBUG_PUSH

4738

xmlGenericError(xmlGenericErrorContext,

4739

"HPP: Parsing PI\n");

4740

#endif

4741

htmlParsePI(ctxt);

4742

ctxt->instate = XML_PARSER_MISC;

4743

} else if ((cur == '<') && (next == '!') &&

4744

(UPP(2) == 'D') && (UPP(3) == 'O') &&

4745

(UPP(4) == 'C') && (UPP(5) == 'T') &&

4746

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

4747

(UPP(8) == 'E')) {

4748

if ((!terminate) &&

4749

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

4750

goto done;

4751

#ifdef DEBUG_PUSH

4752

xmlGenericError(xmlGenericErrorContext,

4753

"HPP: Parsing internal subset\n");

4754

#endif

4755

htmlParseDocTypeDecl(ctxt);

4756

ctxt->instate = XML_PARSER_PROLOG;

4757

#ifdef DEBUG_PUSH

4758

xmlGenericError(xmlGenericErrorContext,

4759

"HPP: entering PROLOG\n");

4760

#endif

4761

} else if ((cur == '<') && (next == '!') &&

4762

(avail < 9)) {

4763

goto done;

4764

} else {

4765

ctxt->instate = XML_PARSER_START_TAG;

4766

#ifdef DEBUG_PUSH

4767

xmlGenericError(xmlGenericErrorContext,

4768

"HPP: entering START_TAG\n");

4769

#endif

4770

}

4771

break;

4772

case XML_PARSER_PROLOG:

4773

SKIP_BLANKS;

4774

if (in->buf == NULL)

4775

avail = in->length - (in->cur - in->base);

4776

else

4777

avail = in->buf->buffer->use - (in->cur - in->base);

4778

if (avail < 2)

4779

goto done;

4780

cur = in->cur[0];

4781

next = in->cur[1];

4782

if ((cur == '<') && (next == '!') &&

4783

(in->cur[2] == '-') && (in->cur[3] == '-')) {

4784

if ((!terminate) &&

4785

(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))

4786

goto done;

4787

#ifdef DEBUG_PUSH

4788

xmlGenericError(xmlGenericErrorContext,

4789

"HPP: Parsing Comment\n");

4790

#endif

4791

htmlParseComment(ctxt);

4792

ctxt->instate = XML_PARSER_PROLOG;

4793

} else if ((cur == '<') && (next == '?')) {

4794

if ((!terminate) &&

4795

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

4796

goto done;

4797

#ifdef DEBUG_PUSH

4798

xmlGenericError(xmlGenericErrorContext,

4799

"HPP: Parsing PI\n");

4800

#endif

4801

htmlParsePI(ctxt);

4802

ctxt->instate = XML_PARSER_PROLOG;

4803

} else if ((cur == '<') && (next == '!') &&

4804

(avail < 4)) {

4805

goto done;

4806

} else {

4807

ctxt->instate = XML_PARSER_START_TAG;

4808

#ifdef DEBUG_PUSH

4809

xmlGenericError(xmlGenericErrorContext,

4810

"HPP: entering START_TAG\n");

4811

#endif

4812

}

4813

break;

4814

case XML_PARSER_EPILOG:

4815

if (in->buf == NULL)

4816

avail = in->length - (in->cur - in->base);

4817

else

4818

avail = in->buf->buffer->use - (in->cur - in->base);

4819

if (avail < 1)

4820

goto done;

4821

cur = in->cur[0];

4822

if (IS_BLANK_CH(cur)) {

4823

htmlParseCharData(ctxt);

4824

goto done;

4825

}

4826

if (avail < 2)

4827

goto done;

4828

next = in->cur[1];

4829

if ((cur == '<') && (next == '!') &&

4830

(in->cur[2] == '-') && (in->cur[3] == '-')) {

4831

if ((!terminate) &&

4832

(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))

4833

goto done;

4834

#ifdef DEBUG_PUSH

4835

xmlGenericError(xmlGenericErrorContext,

4836

"HPP: Parsing Comment\n");

4837

#endif

4838

htmlParseComment(ctxt);

4839

ctxt->instate = XML_PARSER_EPILOG;

4840

} else if ((cur == '<') && (next == '?')) {

4841

if ((!terminate) &&

4842

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

4843

goto done;

4844

#ifdef DEBUG_PUSH

4845

xmlGenericError(xmlGenericErrorContext,

4846

"HPP: Parsing PI\n");

4847

#endif

4848

htmlParsePI(ctxt);

4849

ctxt->instate = XML_PARSER_EPILOG;

4850

} else if ((cur == '<') && (next == '!') &&

4851

(avail < 4)) {

4852

goto done;

4853

} else {

4854

ctxt->errNo = XML_ERR_DOCUMENT_END;

4855

ctxt->wellFormed = 0;

4856

ctxt->instate = XML_PARSER_EOF;

4857

#ifdef DEBUG_PUSH

4858

xmlGenericError(xmlGenericErrorContext,

4859

"HPP: entering EOF\n");

4860

#endif

4861

if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))

4862

ctxt->sax->endDocument(ctxt->userData);

4863

goto done;

4864

}

4865

break;

4866

case XML_PARSER_START_TAG: {

4867

const xmlChar *name;

4868

int failed;

4869

const htmlElemDesc * info;

4870

4871

if (avail < 2)

4872

goto done;

4873

cur = in->cur[0];

4874

if (cur != '<') {

4875

ctxt->instate = XML_PARSER_CONTENT;

4876

#ifdef DEBUG_PUSH

4877

xmlGenericError(xmlGenericErrorContext,

4878

"HPP: entering CONTENT\n");

4879

#endif

4880

break;

4881

}

4882

if (in->cur[1] == '/') {

4883

ctxt->instate = XML_PARSER_END_TAG;

4884

ctxt->checkIndex = 0;

4885

#ifdef DEBUG_PUSH

4886

xmlGenericError(xmlGenericErrorContext,

4887

"HPP: entering END_TAG\n");

4888

#endif

4889

break;

4890

}

4891

if ((!terminate) &&

4892

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

4893

goto done;

4894

4895

failed = htmlParseStartTag(ctxt);

4896

name = ctxt->name;

4897

if ((failed == -1) ||

4898

(name == NULL)) {

4899

if (CUR == '>')

4900

NEXT;

4901

break;

4902

}

4903

4904

4905

* Lookup the info for that element.

4906

4907

info = htmlTagLookup(name);

4908

if (info == NULL) {

4909

htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,

4910

"Tag %s invalid\n", name, NULL);

4911

}

4912

4913

4914

* Check for an Empty Element labeled the XML/SGML way

4915

4916

if ((CUR == '/') && (NXT(1) == '>')) {

4917

SKIP(2);

4918

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

4919

ctxt->sax->endElement(ctxt->userData, name);

4920

htmlnamePop(ctxt);

4921

ctxt->instate = XML_PARSER_CONTENT;

4922

#ifdef DEBUG_PUSH

4923

xmlGenericError(xmlGenericErrorContext,

4924

"HPP: entering CONTENT\n");

4925

#endif

4926

break;

4927

}

4928

4929

if (CUR == '>') {

4930

NEXT;

4931

} else {

4932

htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,

4933

"Couldn't find end of Start Tag %s\n",

4934

name, NULL);

4935

4936

4937

* end of parsing of this node.

4938

4939

if (xmlStrEqual(name, ctxt->name)) {

4940

nodePop(ctxt);

4941

htmlnamePop(ctxt);

4942

}

4943

4944

ctxt->instate = XML_PARSER_CONTENT;

4945

#ifdef DEBUG_PUSH

4946

xmlGenericError(xmlGenericErrorContext,

4947

"HPP: entering CONTENT\n");

4948

#endif

4949

break;

4950

}

4951

4952

4953

* Check for an Empty Element from DTD definition

4954

4955

if ((info != NULL) && (info->empty)) {

4956

if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

4957

ctxt->sax->endElement(ctxt->userData, name);

4958

htmlnamePop(ctxt);

4959

}

4960

ctxt->instate = XML_PARSER_CONTENT;

4961

#ifdef DEBUG_PUSH

4962

xmlGenericError(xmlGenericErrorContext,

4963

"HPP: entering CONTENT\n");

4964

#endif

4965

break;

4966

}

4967

case XML_PARSER_CONTENT: {

4968

long cons;

4969

4970

* Handle preparsed entities and charRef

4971

4972

if (ctxt->token != 0) {

4973

xmlChar chr[2] = { 0 , 0 } ;

4974

4975

chr[0] = (xmlChar) ctxt->token;

4976

htmlCheckParagraph(ctxt);

4977

if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))

4978

ctxt->sax->characters(ctxt->userData, chr, 1);

4979

ctxt->token = 0;

4980

ctxt->checkIndex = 0;

4981

}

4982

if ((avail == 1) && (terminate)) {

4983

cur = in->cur[0];

4984

if ((cur != '<') && (cur != '&')) {

4985

if (ctxt->sax != NULL) {

4986

if (IS_BLANK_CH(cur)) {

4987

if (ctxt->sax->ignorableWhitespace != NULL)

4988

ctxt->sax->ignorableWhitespace(

4989

ctxt->userData, &cur, 1);

4990

} else {

4991

htmlCheckParagraph(ctxt);

4992

if (ctxt->sax->characters != NULL)

4993

ctxt->sax->characters(

4994

ctxt->userData, &cur, 1);

4995

}

4996

}

4997

ctxt->token = 0;

4998

ctxt->checkIndex = 0;

4999

in->cur++;

5000

break;

5001

}

5002

}

5003

if (avail < 2)

5004

goto done;

5005

cur = in->cur[0];

5006

next = in->cur[1];

5007

cons = ctxt->nbChars;

5008

if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||

5009

(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {

5010

5011

* Handle SCRIPT/STYLE separately

5012

5013

if (!terminate) {

5014

int idx;

5015

xmlChar val;

5016

5017

idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);

5018

if (idx < 0)

5019

goto done;

5020

val = in->cur[idx + 2];

5021

if (val == 0) /* bad cut of input */

5022

goto done;

5023

}

5024

htmlParseScript(ctxt);

5025

if ((cur == '<') && (next == '/')) {

5026

ctxt->instate = XML_PARSER_END_TAG;

5027

ctxt->checkIndex = 0;

5028

#ifdef DEBUG_PUSH

5029

xmlGenericError(xmlGenericErrorContext,

5030

"HPP: entering END_TAG\n");

5031

#endif

5032

break;

5033

}

5034

} else {

5035

5036

* Sometimes DOCTYPE arrives in the middle of the document

5037

5038

if ((cur == '<') && (next == '!') &&

5039

(UPP(2) == 'D') && (UPP(3) == 'O') &&

5040

(UPP(4) == 'C') && (UPP(5) == 'T') &&

5041

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

5042

(UPP(8) == 'E')) {

5043

if ((!terminate) &&

5044

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

5045

goto done;

5046

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

5047

"Misplaced DOCTYPE declaration\n",

5048

BAD_CAST "DOCTYPE" , NULL);

5049

htmlParseDocTypeDecl(ctxt);

5050

} else if ((cur == '<') && (next == '!') &&

5051

(in->cur[2] == '-') && (in->cur[3] == '-')) {

5052

if ((!terminate) &&

5053

(htmlParseLookupSequence(

5054

ctxt, '-', '-', '>', 1) < 0))

5055

goto done;

5056

#ifdef DEBUG_PUSH

5057

xmlGenericError(xmlGenericErrorContext,

5058

"HPP: Parsing Comment\n");

5059

#endif

5060

htmlParseComment(ctxt);

5061

ctxt->instate = XML_PARSER_CONTENT;

5062

} else if ((cur == '<') && (next == '?')) {

5063

if ((!terminate) &&

5064

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

5065

goto done;

5066

#ifdef DEBUG_PUSH

5067

xmlGenericError(xmlGenericErrorContext,

5068

"HPP: Parsing PI\n");

5069

#endif

5070

htmlParsePI(ctxt);

5071

ctxt->instate = XML_PARSER_CONTENT;

5072

} else if ((cur == '<') && (next == '!') && (avail < 4)) {

5073

goto done;

5074

} else if ((cur == '<') && (next == '/')) {

5075

ctxt->instate = XML_PARSER_END_TAG;

5076

ctxt->checkIndex = 0;

5077

#ifdef DEBUG_PUSH

5078

xmlGenericError(xmlGenericErrorContext,

5079

"HPP: entering END_TAG\n");

5080

#endif

5081

break;

5082

} else if (cur == '<') {

5083

ctxt->instate = XML_PARSER_START_TAG;

5084

ctxt->checkIndex = 0;

5085

#ifdef DEBUG_PUSH

5086

xmlGenericError(xmlGenericErrorContext,

5087

"HPP: entering START_TAG\n");

5088

#endif

5089

break;

5090

} else if (cur == '&') {

5091

if ((!terminate) &&

5092

(htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))

5093

goto done;

5094

#ifdef DEBUG_PUSH

5095

xmlGenericError(xmlGenericErrorContext,

5096

"HPP: Parsing Reference\n");

5097

#endif

5098

/* TODO: check generation of subtrees if noent !!! */

5099

htmlParseReference(ctxt);

5100

} else {

5101

5102

* check that the text sequence is complete

5103

* before handing out the data to the parser

5104

* to avoid problems with erroneous end of

5105

* data detection.

5106

5107

if ((!terminate) &&

5108

(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))

5109

goto done;

5110

ctxt->checkIndex = 0;

5111

#ifdef DEBUG_PUSH

5112

xmlGenericError(xmlGenericErrorContext,

5113

"HPP: Parsing char data\n");

5114

#endif

5115

htmlParseCharData(ctxt);

5116

}

5117

}

5118

if (cons == ctxt->nbChars) {

5119

if (ctxt->node != NULL) {

5120

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5121

"detected an error in element content\n",

5122

NULL, NULL);

5123

}

5124

NEXT;

5125

break;

5126

}

5127

5128

break;

5129

}

5130

case XML_PARSER_END_TAG:

5131

if (avail < 2)

5132

goto done;

5133

if ((!terminate) &&

5134

(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

5135

goto done;

5136

htmlParseEndTag(ctxt);

5137

if (ctxt->nameNr == 0) {

5138

ctxt->instate = XML_PARSER_EPILOG;

5139

} else {

5140

ctxt->instate = XML_PARSER_CONTENT;

5141

}

5142

ctxt->checkIndex = 0;

5143

#ifdef DEBUG_PUSH

5144

xmlGenericError(xmlGenericErrorContext,

5145

"HPP: entering CONTENT\n");

5146

#endif

5147

break;

5148

case XML_PARSER_CDATA_SECTION:

5149

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5150

"HPP: internal error, state == CDATA\n",

5151

NULL, NULL);

5152

ctxt->instate = XML_PARSER_CONTENT;

5153

ctxt->checkIndex = 0;

5154

#ifdef DEBUG_PUSH

5155

xmlGenericError(xmlGenericErrorContext,

5156

"HPP: entering CONTENT\n");

5157

#endif

5158

break;

5159

case XML_PARSER_DTD:

5160

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5161

"HPP: internal error, state == DTD\n",

5162

NULL, NULL);

5163

ctxt->instate = XML_PARSER_CONTENT;

5164

ctxt->checkIndex = 0;

5165

#ifdef DEBUG_PUSH

5166

xmlGenericError(xmlGenericErrorContext,

5167

"HPP: entering CONTENT\n");

5168

#endif

5169

break;

5170

case XML_PARSER_COMMENT:

5171

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5172

"HPP: internal error, state == COMMENT\n",

5173

NULL, NULL);

5174

ctxt->instate = XML_PARSER_CONTENT;

5175

ctxt->checkIndex = 0;

5176

#ifdef DEBUG_PUSH

5177

xmlGenericError(xmlGenericErrorContext,

5178

"HPP: entering CONTENT\n");

5179

#endif

5180

break;

5181

case XML_PARSER_PI:

5182

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5183

"HPP: internal error, state == PI\n",

5184

NULL, NULL);

5185

ctxt->instate = XML_PARSER_CONTENT;

5186

ctxt->checkIndex = 0;

5187

#ifdef DEBUG_PUSH

5188

xmlGenericError(xmlGenericErrorContext,

5189

"HPP: entering CONTENT\n");

5190

#endif

5191

break;

5192

case XML_PARSER_ENTITY_DECL:

5193

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5194

"HPP: internal error, state == ENTITY_DECL\n",

5195

NULL, NULL);

5196

ctxt->instate = XML_PARSER_CONTENT;

5197

ctxt->checkIndex = 0;

5198

#ifdef DEBUG_PUSH

5199

xmlGenericError(xmlGenericErrorContext,

5200

"HPP: entering CONTENT\n");

5201

#endif

5202

break;

5203

case XML_PARSER_ENTITY_VALUE:

5204

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5205

"HPP: internal error, state == ENTITY_VALUE\n",

5206

NULL, NULL);

5207

ctxt->instate = XML_PARSER_CONTENT;

5208

ctxt->checkIndex = 0;

5209

#ifdef DEBUG_PUSH

5210

xmlGenericError(xmlGenericErrorContext,

5211

"HPP: entering DTD\n");

5212

#endif

5213

break;

5214

case XML_PARSER_ATTRIBUTE_VALUE:

5215

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5216

"HPP: internal error, state == ATTRIBUTE_VALUE\n",

5217

NULL, NULL);

5218

ctxt->instate = XML_PARSER_START_TAG;

5219

ctxt->checkIndex = 0;

5220

#ifdef DEBUG_PUSH

5221

xmlGenericError(xmlGenericErrorContext,

5222

"HPP: entering START_TAG\n");

5223

#endif

5224

break;

5225

case XML_PARSER_SYSTEM_LITERAL:

5226

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5227

"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",

5228

NULL, NULL);

5229

ctxt->instate = XML_PARSER_CONTENT;

5230

ctxt->checkIndex = 0;

5231

#ifdef DEBUG_PUSH

5232

xmlGenericError(xmlGenericErrorContext,

5233

"HPP: entering CONTENT\n");

5234

#endif

5235

break;

5236

case XML_PARSER_IGNORE:

5237

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5238

"HPP: internal error, state == XML_PARSER_IGNORE\n",

5239

NULL, NULL);

5240

ctxt->instate = XML_PARSER_CONTENT;

5241

ctxt->checkIndex = 0;

5242

#ifdef DEBUG_PUSH

5243

xmlGenericError(xmlGenericErrorContext,

5244

"HPP: entering CONTENT\n");

5245

#endif

5246

break;

5247

case XML_PARSER_PUBLIC_LITERAL:

5248

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5249

"HPP: internal error, state == XML_PARSER_LITERAL\n",

5250

NULL, NULL);

5251

ctxt->instate = XML_PARSER_CONTENT;

5252

ctxt->checkIndex = 0;

5253

#ifdef DEBUG_PUSH

5254

xmlGenericError(xmlGenericErrorContext,

5255

"HPP: entering CONTENT\n");

5256

#endif

5257

break;

5258

5259

}

5260

}

5261

done:

5262

if ((avail == 0) && (terminate)) {

5263

htmlAutoCloseOnEnd(ctxt);

5264

if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {

5265

5266

* SAX: end of the document processing.

5267

5268

ctxt->instate = XML_PARSER_EOF;

5269

if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))

5270

ctxt->sax->endDocument(ctxt->userData);

5271

}

5272

}

5273

if ((ctxt->myDoc != NULL) &&

5274

((terminate) || (ctxt->instate == XML_PARSER_EOF) ||

5275

(ctxt->instate == XML_PARSER_EPILOG))) {

5276

xmlDtdPtr dtd;

5277

dtd = xmlGetIntSubset(ctxt->myDoc);

5278

if (dtd == NULL)

5279

ctxt->myDoc->intSubset =

5280

xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",

5281

BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",

5282

BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");

5283

}

5284

#ifdef DEBUG_PUSH

5285

xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);

5286

#endif

5287

return(ret);

5288

}

5289

5290

/**

5291

* htmlParseChunk:

5292

* @ctxt: an HTML parser context

5293

* @chunk: an char array

5294

* @size: the size in byte of the chunk

5295

* @terminate: last chunk indicator

5296

5297

* Parse a Chunk of memory

5298

5299

* Returns zero if no error, the xmlParserErrors otherwise.

5300

5301

int

5302

htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,

5303

int terminate) {

5304

if ((ctxt == NULL) || (ctxt->input == NULL)) {

5305

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

5306

"htmlParseChunk: context error\n", NULL, NULL);

5307

return(XML_ERR_INTERNAL_ERROR);

5308

}

5309

if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&

5310

(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {

5311

int base = ctxt->input->base - ctxt->input->buf->buffer->content;

5312

int cur = ctxt->input->cur - ctxt->input->base;

5313

int res;

5314

5315

res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

5316

if (res < 0) {

5317

ctxt->errNo = XML_PARSER_EOF;

5318

ctxt->disableSAX = 1;

5319

return (XML_PARSER_EOF);

5320

}

5321

ctxt->input->base = ctxt->input->buf->buffer->content + base;

5322

ctxt->input->cur = ctxt->input->base + cur;

5323

ctxt->input->end =

5324

&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];

5325

#ifdef DEBUG_PUSH

5326

xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);

5327

#endif

5328

5329

#if 0

5330

if ((terminate) || (ctxt->input->buf->buffer->use > 80))

5331

htmlParseTryOrFinish(ctxt, terminate);

5332

#endif

5333

} else if (ctxt->instate != XML_PARSER_EOF) {

5334

if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {

5335

xmlParserInputBufferPtr in = ctxt->input->buf;

5336

if ((in->encoder != NULL) && (in->buffer != NULL) &&

5337

(in->raw != NULL)) {

5338

int nbchars;

5339

5340

nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);

5341

if (nbchars < 0) {

5342

htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,

5343

"encoder error\n", NULL, NULL);

5344

return(XML_ERR_INVALID_ENCODING);

5345

}

5346

}

5347

}

5348

}

5349

htmlParseTryOrFinish(ctxt, terminate);

5350

if (terminate) {

5351

if ((ctxt->instate != XML_PARSER_EOF) &&

5352

(ctxt->instate != XML_PARSER_EPILOG) &&

5353

(ctxt->instate != XML_PARSER_MISC)) {

5354

ctxt->errNo = XML_ERR_DOCUMENT_END;

5355

ctxt->wellFormed = 0;

5356

}

5357

if (ctxt->instate != XML_PARSER_EOF) {

5358

if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))

5359

ctxt->sax->endDocument(ctxt->userData);

5360

}

5361

ctxt->instate = XML_PARSER_EOF;

5362

}

5363

return((xmlParserErrors) ctxt->errNo);

5364

}

5365

5366

/************************************************************************

5367

* *

5368

* User entry points *

5369

* *

5370

************************************************************************/

5371

5372

/**

5373

* htmlCreatePushParserCtxt:

5374

* @sax: a SAX handler

5375

* @user_data: The user data returned on SAX callbacks

5376

* @chunk: a pointer to an array of chars

5377

* @size: number of chars in the array

5378

* @filename: an optional file name or URI

5379

* @enc: an optional encoding

5380

5381

* Create a parser context for using the HTML parser in push mode

5382

* The value of @filename is used for fetching external entities

5383

* and error/warning reports.

5384

5385

* Returns the new parser context or NULL

5386

5387

htmlParserCtxtPtr

5388

htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,

5389

const char *chunk, int size, const char *filename,

5390

xmlCharEncoding enc) {

5391

htmlParserCtxtPtr ctxt;

5392

htmlParserInputPtr inputStream;

5393

xmlParserInputBufferPtr buf;

5394

5395

xmlInitParser();

5396

5397

buf = xmlAllocParserInputBuffer(enc);

5398

if (buf == NULL) return(NULL);

5399

5400

ctxt = htmlNewParserCtxt();

5401

if (ctxt == NULL) {

5402

xmlFreeParserInputBuffer(buf);

5403

return(NULL);

5404

}

5405

if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)

5406

ctxt->charset=XML_CHAR_ENCODING_UTF8;

5407

if (sax != NULL) {

5408

if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)

5409

xmlFree(ctxt->sax);

5410

ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));

5411

if (ctxt->sax == NULL) {

5412

xmlFree(buf);

5413

xmlFree(ctxt);

5414

return(NULL);

5415

}

5416

memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));

5417

if (user_data != NULL)

5418

ctxt->userData = user_data;

5419

}

5420

if (filename == NULL) {

5421

ctxt->directory = NULL;

5422

} else {

5423

ctxt->directory = xmlParserGetDirectory(filename);

5424

}

5425

5426

inputStream = htmlNewInputStream(ctxt);

5427

if (inputStream == NULL) {

5428

xmlFreeParserCtxt(ctxt);

5429

xmlFree(buf);

5430

return(NULL);

5431

}

5432

5433

if (filename == NULL)

5434

inputStream->filename = NULL;

5435

else

5436

inputStream->filename = (char *)

5437

xmlCanonicPath((const xmlChar *) filename);

5438

inputStream->buf = buf;

5439

inputStream->base = inputStream->buf->buffer->content;

5440

inputStream->cur = inputStream->buf->buffer->content;

5441

inputStream->end =

5442

&inputStream->buf->buffer->content[inputStream->buf->buffer->use];

5443

5444

inputPush(ctxt, inputStream);

5445

5446

if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&

5447

(ctxt->input->buf != NULL)) {

5448

int base = ctxt->input->base - ctxt->input->buf->buffer->content;

5449

int cur = ctxt->input->cur - ctxt->input->base;

5450

5451

xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

5452

5453

ctxt->input->base = ctxt->input->buf->buffer->content + base;

5454

ctxt->input->cur = ctxt->input->base + cur;

5455

ctxt->input->end =

5456

&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];

5457

#ifdef DEBUG_PUSH

5458

xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);

5459

#endif

5460

}

5461

ctxt->progressive = 1;

5462

5463

return(ctxt);

5464

}

5465

#endif /* LIBXML_PUSH_ENABLED */

5466

5467

/**

5468

* htmlSAXParseDoc:

5469

* @cur: a pointer to an array of xmlChar

5470

* @encoding: a free form C string describing the HTML document encoding, or NULL

5471

* @sax: the SAX handler block

5472

* @userData: if using SAX, this pointer will be provided on callbacks.

5473

5474

* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks

5475

* to handle parse events. If sax is NULL, fallback to the default DOM

5476

* behavior and return a tree.

5477

5478

* Returns the resulting document tree unless SAX is NULL or the document is

5479

* not well formed.

5480

5481

5482

htmlDocPtr

5483

htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {

5484

htmlDocPtr ret;

5485

htmlParserCtxtPtr ctxt;

5486

5487

xmlInitParser();

5488

5489

if (cur == NULL) return(NULL);

5490

5491

5492

ctxt = htmlCreateDocParserCtxt(cur, encoding);

5493

if (ctxt == NULL) return(NULL);

5494

if (sax != NULL) {

5495

if (ctxt->sax != NULL) xmlFree (ctxt->sax);

5496

ctxt->sax = sax;

5497

ctxt->userData = userData;

5498

}

5499

5500

htmlParseDocument(ctxt);

5501

ret = ctxt->myDoc;

5502

if (sax != NULL) {

5503

ctxt->sax = NULL;

5504

ctxt->userData = NULL;

5505

}

5506

htmlFreeParserCtxt(ctxt);

5507

5508

return(ret);

5509

}

5510

5511

/**

5512

* htmlParseDoc:

5513

* @cur: a pointer to an array of xmlChar

5514

* @encoding: a free form C string describing the HTML document encoding, or NULL

5515

5516

* parse an HTML in-memory document and build a tree.

5517

5518

* Returns the resulting document tree

5519

5520

5521

htmlDocPtr

5522

htmlParseDoc(xmlChar *cur, const char *encoding) {

5523

return(htmlSAXParseDoc(cur, encoding, NULL, NULL));

5524

}

5525

5526

5527

/**

5528

* htmlCreateFileParserCtxt:

5529

* @filename: the filename

5530

* @encoding: a free form C string describing the HTML document encoding, or NULL

5531

5532

* Create a parser context for a file content.

5533

* Automatic support for ZLIB/Compress compressed document is provided

5534

* by default if found at compile-time.

5535

5536

* Returns the new parser context or NULL

5537

5538

htmlParserCtxtPtr

5539

htmlCreateFileParserCtxt(const char *filename, const char *encoding)

5540

{

5541

htmlParserCtxtPtr ctxt;

5542

htmlParserInputPtr inputStream;

5543

char *canonicFilename;

5544

/* htmlCharEncoding enc; */

5545

xmlChar *content, *content_line = (xmlChar *) "charset=";

5546

5547

if (filename == NULL)

5548

return(NULL);

5549

5550

ctxt = htmlNewParserCtxt();

5551

if (ctxt == NULL) {

5552

return(NULL);

5553

}

5554

canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);

5555

if (canonicFilename == NULL) {

5556

#ifdef LIBXML_SAX1_ENABLED

5557

if (xmlDefaultSAXHandler.error != NULL) {

5558

xmlDefaultSAXHandler.error(NULL, "out of memory\n");

5559

}

5560

#endif

5561

xmlFreeParserCtxt(ctxt);

5562

return(NULL);

5563

}

5564

5565

inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);

5566

xmlFree(canonicFilename);

5567

if (inputStream == NULL) {

5568

xmlFreeParserCtxt(ctxt);

5569

return(NULL);

5570

}

5571

5572

inputPush(ctxt, inputStream);

5573

5574

/* set encoding */

5575

if (encoding) {

5576

content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);

5577

if (content) {

5578

strcpy ((char *)content, (char *)content_line);

5579

strcat ((char *)content, (char *)encoding);

5580

htmlCheckEncoding (ctxt, content);

5581

xmlFree (content);

5582

}

5583

}

5584

5585

return(ctxt);

5586

}

5587

5588

/**

5589

* htmlSAXParseFile:

5590

* @filename: the filename

5591

* @encoding: a free form C string describing the HTML document encoding, or NULL

5592

* @sax: the SAX handler block

5593

* @userData: if using SAX, this pointer will be provided on callbacks.

5594

5595

* parse an HTML file and build a tree. Automatic support for ZLIB/Compress

5596

* compressed document is provided by default if found at compile-time.

5597

* It use the given SAX function block to handle the parsing callback.

5598

* If sax is NULL, fallback to the default DOM tree building routines.

5599

5600

* Returns the resulting document tree unless SAX is NULL or the document is

5601

* not well formed.

5602

5603

5604

htmlDocPtr

5605

htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,

5606

void *userData) {

5607

htmlDocPtr ret;

5608

htmlParserCtxtPtr ctxt;

5609

htmlSAXHandlerPtr oldsax = NULL;

5610

5611

xmlInitParser();

5612

5613

ctxt = htmlCreateFileParserCtxt(filename, encoding);

5614

if (ctxt == NULL) return(NULL);

5615

if (sax != NULL) {

5616

oldsax = ctxt->sax;

5617

ctxt->sax = sax;

5618

ctxt->userData = userData;

5619

}

5620

5621

htmlParseDocument(ctxt);

5622

5623

ret = ctxt->myDoc;

5624

if (sax != NULL) {

5625

ctxt->sax = oldsax;

5626

ctxt->userData = NULL;

5627

}

5628

htmlFreeParserCtxt(ctxt);

5629

5630

return(ret);

5631

}

5632

5633

/**

5634

* htmlParseFile:

5635

* @filename: the filename

5636

* @encoding: a free form C string describing the HTML document encoding, or NULL

5637

5638

* parse an HTML file and build a tree. Automatic support for ZLIB/Compress

5639

* compressed document is provided by default if found at compile-time.

5640

5641

* Returns the resulting document tree

5642

5643

5644

htmlDocPtr

5645

htmlParseFile(const char *filename, const char *encoding) {

5646

return(htmlSAXParseFile(filename, encoding, NULL, NULL));

5647

}

5648

5649

/**

5650

* htmlHandleOmittedElem:

5651

* @val: int 0 or 1

5652

5653

* Set and return the previous value for handling HTML omitted tags.

5654

5655

* Returns the last value for 0 for no handling, 1 for auto insertion.

5656

5657

5658

int

5659

htmlHandleOmittedElem(int val) {

5660

int old = htmlOmittedDefaultValue;

5661

5662

htmlOmittedDefaultValue = val;

5663

return(old);

5664

}

5665

5666

/**

5667

* htmlElementAllowedHere:

5668

* @parent: HTML parent element

5669

* @elt: HTML element

5670

5671

* Checks whether an HTML element may be a direct child of a parent element.

5672

* Note - doesn't check for deprecated elements

5673

5674

* Returns 1 if allowed; 0 otherwise.

5675

5676

int

5677

htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {

5678

const char** p ;

5679

5680

if ( ! elt || ! parent || ! parent->subelts )

5681

return 0 ;

5682

5683

for ( p = parent->subelts; *p; ++p )

5684

if ( !xmlStrcmp((const xmlChar *)*p, elt) )

5685

return 1 ;

5686

5687

return 0 ;

5688

}

5689

/**

5690

* htmlElementStatusHere:

5691

* @parent: HTML parent element

5692

* @elt: HTML element

5693

5694

* Checks whether an HTML element may be a direct child of a parent element.

5695

* and if so whether it is valid or deprecated.

5696

5697

* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID

5698

5699

htmlStatus

5700

htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {

5701

if ( ! parent || ! elt )

5702

return HTML_INVALID ;

5703

if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )

5704

return HTML_INVALID ;

5705

5706

return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;

5707

}

5708

/**

5709

* htmlAttrAllowed:

5710

* @elt: HTML element

5711

* @attr: HTML attribute

5712

* @legacy: whether to allow deprecated attributes

5713

5714

* Checks whether an attribute is valid for an element

5715

* Has full knowledge of Required and Deprecated attributes

5716

5717

* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID

5718

5719

htmlStatus

5720

htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {

5721

const char** p ;

5722

5723

if ( !elt || ! attr )

5724

return HTML_INVALID ;

5725

5726

if ( elt->attrs_req )

5727

for ( p = elt->attrs_req; *p; ++p)

5728

if ( !xmlStrcmp((const xmlChar*)*p, attr) )

5729

return HTML_REQUIRED ;

5730

5731

if ( elt->attrs_opt )

5732

for ( p = elt->attrs_opt; *p; ++p)

5733

if ( !xmlStrcmp((const xmlChar*)*p, attr) )

5734

return HTML_VALID ;

5735

5736

if ( legacy && elt->attrs_depr )

5737

for ( p = elt->attrs_depr; *p; ++p)

5738

if ( !xmlStrcmp((const xmlChar*)*p, attr) )

5739

return HTML_DEPRECATED ;

5740

5741

return HTML_INVALID ;

5742

}

5743

/**

5744

* htmlNodeStatus:

5745

* @node: an htmlNodePtr in a tree

5746

* @legacy: whether to allow deprecated elements (YES is faster here

5747

* for Element nodes)

5748

5749

* Checks whether the tree node is valid. Experimental (the author

5750

* only uses the HTML enhancements in a SAX parser)

5751

5752

* Return: for Element nodes, a return from htmlElementAllowedHere (if

5753

* legacy allowed) or htmlElementStatusHere (otherwise).

5754

* for Attribute nodes, a return from htmlAttrAllowed

5755

* for other nodes, HTML_NA (no checks performed)

5756

5757

htmlStatus

5758

htmlNodeStatus(const htmlNodePtr node, int legacy) {

5759

if ( ! node )

5760

return HTML_INVALID ;

5761

5762

switch ( node->type ) {

5763

case XML_ELEMENT_NODE:

5764

return legacy

5765

? ( htmlElementAllowedHere (

5766

htmlTagLookup(node->parent->name) , node->name

5767

) ? HTML_VALID : HTML_INVALID )

5768

: htmlElementStatusHere(

5769

htmlTagLookup(node->parent->name) ,

5770

htmlTagLookup(node->name) )

5771

;

5772

case XML_ATTRIBUTE_NODE:

5773

return htmlAttrAllowed(

5774

htmlTagLookup(node->parent->name) , node->name, legacy) ;

5775

default: return HTML_NA ;

5776

}

5777

}

5778

/************************************************************************

5779

* *

5780

* New set (2.6.0) of simpler and more flexible APIs *

5781

* *

5782

************************************************************************/

5783

/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement:
 * it must be followed by a semicolon and is safe inside if/else.
 */
#define DICT_FREE(str)							\
    do {								\
        if ((str) && ((!dict) ||					\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))		\
            xmlFree((char *)(str));					\
    } while (0)

5794

5795

/**

5796

* htmlCtxtReset:

5797

* @ctxt: an HTML parser context

5798

5799

* Reset a parser context

5800

5801

void

5802

htmlCtxtReset(htmlParserCtxtPtr ctxt)

5803

{

5804

xmlParserInputPtr input;

5805

xmlDictPtr dict;

5806

5807

if (ctxt == NULL)

5808

return;

5809

5810

xmlInitParser();

5811

dict = ctxt->dict;

5812

5813

while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */

5814

xmlFreeInputStream(input);

5815

}

5816

ctxt->inputNr = 0;

5817

ctxt->input = NULL;

5818

5819

ctxt->spaceNr = 0;

5820

if (ctxt->spaceTab != NULL) {

5821

ctxt->spaceTab[0] = -1;

5822

ctxt->space = &ctxt->spaceTab[0];

5823

} else {

5824

ctxt->space = NULL;

5825

}

5826

5827

5828

ctxt->nodeNr = 0;

5829

ctxt->node = NULL;

5830

5831

ctxt->nameNr = 0;

5832

ctxt->name = NULL;

5833

5834

DICT_FREE(ctxt->version);

5835

ctxt->version = NULL;

5836

DICT_FREE(ctxt->encoding);

5837

ctxt->encoding = NULL;

5838

DICT_FREE(ctxt->directory);

5839

ctxt->directory = NULL;

5840

DICT_FREE(ctxt->extSubURI);

5841

ctxt->extSubURI = NULL;

5842

DICT_FREE(ctxt->extSubSystem);

5843

ctxt->extSubSystem = NULL;

5844

if (ctxt->myDoc != NULL)

5845

xmlFreeDoc(ctxt->myDoc);

5846

ctxt->myDoc = NULL;

5847

5848

ctxt->standalone = -1;

5849

ctxt->hasExternalSubset = 0;

5850

ctxt->hasPErefs = 0;

5851

ctxt->html = 1;

5852

ctxt->external = 0;

5853

ctxt->instate = XML_PARSER_START;

5854

ctxt->token = 0;

5855

5856

ctxt->wellFormed = 1;

5857

ctxt->nsWellFormed = 1;

5858

ctxt->valid = 1;

5859

ctxt->vctxt.userData = ctxt;

5860

ctxt->vctxt.error = xmlParserValidityError;

5861

ctxt->vctxt.warning = xmlParserValidityWarning;

5862

ctxt->record_info = 0;

5863

ctxt->nbChars = 0;

5864

ctxt->checkIndex = 0;

5865

ctxt->inSubset = 0;

5866

ctxt->errNo = XML_ERR_OK;

5867

ctxt->depth = 0;

5868

ctxt->charset = XML_CHAR_ENCODING_NONE;

5869

ctxt->catalogs = NULL;

5870

xmlInitNodeInfoSeq(&ctxt->node_seq);

5871

5872

if (ctxt->attsDefault != NULL) {

5873

xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);

5874

ctxt->attsDefault = NULL;

5875

}

5876

if (ctxt->attsSpecial != NULL) {

5877

xmlHashFree(ctxt->attsSpecial, NULL);

5878

ctxt->attsSpecial = NULL;

5879

}

5880

}

5881

5882

/**

5883

* htmlCtxtUseOptions:

5884

* @ctxt: an HTML parser context

5885

* @options: a combination of htmlParserOption(s)

5886

5887

* Applies the options to the parser context

5888

5889

* Returns 0 in case of success, the set of unknown or unimplemented options

5890

* in case of error.

5891

5892

int

5893

htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)

5894

{

5895

if (ctxt == NULL)

5896

return(-1);

5897

5898

if (options & HTML_PARSE_NOWARNING) {

5899

ctxt->sax->warning = NULL;

5900

ctxt->vctxt.warning = NULL;

5901

options -= XML_PARSE_NOWARNING;

5902

ctxt->options |= XML_PARSE_NOWARNING;

5903

}

5904

if (options & HTML_PARSE_NOERROR) {

5905

ctxt->sax->error = NULL;

5906

ctxt->vctxt.error = NULL;

5907

ctxt->sax->fatalError = NULL;

5908

options -= XML_PARSE_NOERROR;

5909

ctxt->options |= XML_PARSE_NOERROR;

5910

}

5911

if (options & HTML_PARSE_PEDANTIC) {

5912

ctxt->pedantic = 1;

5913

options -= XML_PARSE_PEDANTIC;

5914

ctxt->options |= XML_PARSE_PEDANTIC;

5915

} else

5916

ctxt->pedantic = 0;

5917

if (options & XML_PARSE_NOBLANKS) {

5918

ctxt->keepBlanks = 0;

5919

ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;

5920

options -= XML_PARSE_NOBLANKS;

5921

ctxt->options |= XML_PARSE_NOBLANKS;

5922

} else

5923

ctxt->keepBlanks = 1;

5924

if (options & HTML_PARSE_RECOVER) {

5925

ctxt->recovery = 1;

5926

options -= HTML_PARSE_RECOVER;

5927

} else

5928

ctxt->recovery = 0;

5929

if (options & HTML_PARSE_COMPACT) {

5930

ctxt->options |= HTML_PARSE_COMPACT;

5931

options -= HTML_PARSE_COMPACT;

5932

}

5933

ctxt->dictNames = 0;

5934

return (options);

5935

}

5936

5937

/**

5938

* htmlDoRead:

5939

* @ctxt: an HTML parser context

5940

* @URL: the base URL to use for the document

5941

* @encoding: the document encoding, or NULL

5942

* @options: a combination of htmlParserOption(s)

5943

* @reuse: keep the context for reuse

5944

5945

* Common front-end for the htmlRead functions

5946

5947

* Returns the resulting document tree or NULL

5948

5949

static htmlDocPtr

5950

htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,

5951

int options, int reuse)

5952

{

5953

htmlDocPtr ret;

5954

5955

htmlCtxtUseOptions(ctxt, options);

5956

ctxt->html = 1;

5957

if (encoding != NULL) {

5958

xmlCharEncodingHandlerPtr hdlr;

5959

5960

hdlr = xmlFindCharEncodingHandler(encoding);

5961

if (hdlr != NULL)

5962

xmlSwitchToEncoding(ctxt, hdlr);

5963

}

5964

if ((URL != NULL) && (ctxt->input != NULL) &&

5965

(ctxt->input->filename == NULL))

5966

ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);

5967

htmlParseDocument(ctxt);

5968

ret = ctxt->myDoc;

5969

ctxt->myDoc = NULL;

5970

if (!reuse) {

5971

if ((ctxt->dictNames) &&

5972

(ret != NULL) &&

5973

(ret->dict == ctxt->dict))

5974

ctxt->dict = NULL;

5975

xmlFreeParserCtxt(ctxt);

5976

}

5977

return (ret);

5978

}

5979

5980

/**

5981

* htmlReadDoc:

5982

* @cur: a pointer to a zero terminated string

5983

* @URL: the base URL to use for the document

5984

* @encoding: the document encoding, or NULL

5985

* @options: a combination of htmlParserOption(s)

5986

5987

* parse an XML in-memory document and build a tree.

5988

5989

* Returns the resulting document tree

5990

5991

htmlDocPtr

5992

htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)

5993

{

5994

htmlParserCtxtPtr ctxt;

5995

5996

if (cur == NULL)

5997

return (NULL);

5998

5999

xmlInitParser();

6000

ctxt = htmlCreateDocParserCtxt(cur, NULL);

6001

if (ctxt == NULL)

6002

return (NULL);

6003

return (htmlDoRead(ctxt, URL, encoding, options, 0));

6004

}

6005

6006

/**

6007

* htmlReadFile:

6008

* @filename: a file or URL

6009

* @encoding: the document encoding, or NULL

6010

* @options: a combination of htmlParserOption(s)

6011

6012

* parse an XML file from the filesystem or the network.

6013

6014

* Returns the resulting document tree

6015

6016

htmlDocPtr

6017

htmlReadFile(const char *filename, const char *encoding, int options)

6018

{

6019

htmlParserCtxtPtr ctxt;

6020

6021

xmlInitParser();

6022

ctxt = htmlCreateFileParserCtxt(filename, encoding);

6023

if (ctxt == NULL)

6024

return (NULL);

6025

return (htmlDoRead(ctxt, NULL, NULL, options, 0));

6026

}

6027

6028

/**

6029

* htmlReadMemory:

6030

* @buffer: a pointer to a char array

6031

* @size: the size of the array

6032

* @URL: the base URL to use for the document

6033

* @encoding: the document encoding, or NULL

6034

* @options: a combination of htmlParserOption(s)

6035

6036

* parse an XML in-memory document and build a tree.

6037

6038

* Returns the resulting document tree

6039

6040

htmlDocPtr

6041

htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)

6042

{

6043

htmlParserCtxtPtr ctxt;

6044

6045

xmlInitParser();

6046

ctxt = xmlCreateMemoryParserCtxt(buffer, size);

6047

if (ctxt == NULL)

6048

return (NULL);

6049

htmlDefaultSAXHandlerInit();

6050

if (ctxt->sax != NULL)

6051

memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));

6052

return (htmlDoRead(ctxt, URL, encoding, options, 0));

6053

}

6054

6055

/**

6056

* htmlReadFd:

6057

* @fd: an open file descriptor

6058

* @URL: the base URL to use for the document

6059

* @encoding: the document encoding, or NULL

6060

* @options: a combination of htmlParserOption(s)

6061

6062

* parse an XML from a file descriptor and build a tree.

6063

6064

* Returns the resulting document tree

6065

6066

htmlDocPtr

6067

htmlReadFd(int fd, const char *URL, const char *encoding, int options)

6068

{

6069

htmlParserCtxtPtr ctxt;

6070

xmlParserInputBufferPtr input;

6071

xmlParserInputPtr stream;

6072

6073

if (fd < 0)

6074

return (NULL);

6075

6076

xmlInitParser();

6077

input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);

6078

if (input == NULL)

6079

return (NULL);

6080

ctxt = xmlNewParserCtxt();

6081

if (ctxt == NULL) {

6082

xmlFreeParserInputBuffer(input);

6083

return (NULL);

6084

}

6085

stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

6086

if (stream == NULL) {

6087

xmlFreeParserInputBuffer(input);

6088

xmlFreeParserCtxt(ctxt);

6089

return (NULL);

6090

}

6091

inputPush(ctxt, stream);

6092

return (htmlDoRead(ctxt, URL, encoding, options, 0));

6093

}

6094

6095

/**

6096

* htmlReadIO:

6097

* @ioread: an I/O read function

6098

* @ioclose: an I/O close function

6099

* @ioctx: an I/O handler

6100

* @URL: the base URL to use for the document

6101

* @encoding: the document encoding, or NULL

6102

* @options: a combination of htmlParserOption(s)

6103

6104

* parse an HTML document from I/O functions and source and build a tree.

6105

6106

* Returns the resulting document tree

6107

6108

htmlDocPtr

6109

htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,

6110

void *ioctx, const char *URL, const char *encoding, int options)

6111

{

6112

htmlParserCtxtPtr ctxt;

6113

xmlParserInputBufferPtr input;

6114

xmlParserInputPtr stream;

6115

6116

if (ioread == NULL)

6117

return (NULL);

6118

xmlInitParser();

6119

6120

input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,

6121

XML_CHAR_ENCODING_NONE);

6122

if (input == NULL)

6123

return (NULL);

6124

ctxt = htmlNewParserCtxt();

6125

if (ctxt == NULL) {

6126

xmlFreeParserInputBuffer(input);

6127

return (NULL);

6128

}

6129

stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

6130

if (stream == NULL) {

6131

xmlFreeParserInputBuffer(input);

6132

xmlFreeParserCtxt(ctxt);

6133

return (NULL);

6134

}

6135

inputPush(ctxt, stream);

6136

return (htmlDoRead(ctxt, URL, encoding, options, 0));

6137

}

6138

6139

/**

6140

* htmlCtxtReadDoc:

6141

* @ctxt: an HTML parser context

6142

* @cur: a pointer to a zero terminated string

6143

* @URL: the base URL to use for the document

6144

* @encoding: the document encoding, or NULL

6145

* @options: a combination of htmlParserOption(s)

6146

6147

* parse an XML in-memory document and build a tree.

6148

* This reuses the existing @ctxt parser context

6149

6150

* Returns the resulting document tree

6151

6152

htmlDocPtr

6153

htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,

6154

const char *URL, const char *encoding, int options)

6155

{

6156

xmlParserInputPtr stream;

6157

6158

if (cur == NULL)

6159

return (NULL);

6160

if (ctxt == NULL)

6161

return (NULL);

6162

6163

htmlCtxtReset(ctxt);

6164

6165

stream = xmlNewStringInputStream(ctxt, cur);

6166

if (stream == NULL) {

6167

return (NULL);

6168

}

6169

inputPush(ctxt, stream);

6170

return (htmlDoRead(ctxt, URL, encoding, options, 1));

6171

}

6172

6173

/**

6174

* htmlCtxtReadFile:

6175

* @ctxt: an HTML parser context

6176

* @filename: a file or URL

6177

* @encoding: the document encoding, or NULL

6178

* @options: a combination of htmlParserOption(s)

6179

6180

* parse an XML file from the filesystem or the network.

6181

* This reuses the existing @ctxt parser context

6182

6183

* Returns the resulting document tree

6184

6185

htmlDocPtr

6186

htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,

6187

const char *encoding, int options)

6188

{

6189

xmlParserInputPtr stream;

6190

6191

if (filename == NULL)

6192

return (NULL);

6193

if (ctxt == NULL)

6194

return (NULL);

6195

6196

htmlCtxtReset(ctxt);

6197

6198

stream = xmlLoadExternalEntity(filename, NULL, ctxt);

6199

if (stream == NULL) {

6200

return (NULL);

6201

}

6202

inputPush(ctxt, stream);

6203

return (htmlDoRead(ctxt, NULL, encoding, options, 1));

6204

}

6205

6206

/**

6207

* htmlCtxtReadMemory:

6208

* @ctxt: an HTML parser context

6209

* @buffer: a pointer to a char array

6210

* @size: the size of the array

6211

* @URL: the base URL to use for the document

6212

* @encoding: the document encoding, or NULL

6213

* @options: a combination of htmlParserOption(s)

6214

6215

* parse an XML in-memory document and build a tree.

6216

* This reuses the existing @ctxt parser context

6217

6218

* Returns the resulting document tree

6219

6220

htmlDocPtr

6221

htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,

6222

const char *URL, const char *encoding, int options)

6223

{

6224

xmlParserInputBufferPtr input;

6225

xmlParserInputPtr stream;

6226

6227

if (ctxt == NULL)

6228

return (NULL);

6229

if (buffer == NULL)

6230

return (NULL);

6231

6232

htmlCtxtReset(ctxt);

6233

6234

input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);

6235

if (input == NULL) {

6236

return(NULL);

6237

}

6238

6239

stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

6240

if (stream == NULL) {

6241

xmlFreeParserInputBuffer(input);

6242

return(NULL);

6243

}

6244

6245

inputPush(ctxt, stream);

6246

return (htmlDoRead(ctxt, URL, encoding, options, 1));

6247

}

6248

6249

/**

6250

* htmlCtxtReadFd:

6251

* @ctxt: an HTML parser context

6252

* @fd: an open file descriptor

6253

* @URL: the base URL to use for the document

6254

* @encoding: the document encoding, or NULL

6255

* @options: a combination of htmlParserOption(s)

6256

6257

* parse an XML from a file descriptor and build a tree.

6258

* This reuses the existing @ctxt parser context

6259

6260

* Returns the resulting document tree

6261

6262

htmlDocPtr

6263

htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,

6264

const char *URL, const char *encoding, int options)

6265

{

6266

xmlParserInputBufferPtr input;

6267

xmlParserInputPtr stream;

6268

6269

if (fd < 0)

6270

return (NULL);

6271

if (ctxt == NULL)

6272

return (NULL);

6273

6274

htmlCtxtReset(ctxt);

6275

6276

6277

input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);

6278

if (input == NULL)

6279

return (NULL);

6280

stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

6281

if (stream == NULL) {

6282

xmlFreeParserInputBuffer(input);

6283

return (NULL);

6284

}

6285

inputPush(ctxt, stream);

6286

return (htmlDoRead(ctxt, URL, encoding, options, 1));

6287

}

6288

6289

/**

6290

* htmlCtxtReadIO:

6291

* @ctxt: an HTML parser context

6292

* @ioread: an I/O read function

6293

* @ioclose: an I/O close function

6294

* @ioctx: an I/O handler

6295

* @URL: the base URL to use for the document

6296

* @encoding: the document encoding, or NULL

6297

* @options: a combination of htmlParserOption(s)

6298

6299

* parse an HTML document from I/O functions and source and build a tree.

6300

* This reuses the existing @ctxt parser context

6301

6302

* Returns the resulting document tree

6303

6304

htmlDocPtr

6305

htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,

6306

xmlInputCloseCallback ioclose, void *ioctx,

6307

const char *URL,

6308

const char *encoding, int options)

6309

{

6310

xmlParserInputBufferPtr input;

6311

xmlParserInputPtr stream;

6312

6313

if (ioread == NULL)

6314

return (NULL);

6315

if (ctxt == NULL)

6316

return (NULL);

6317

6318

htmlCtxtReset(ctxt);

6319

6320

input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,

6321

XML_CHAR_ENCODING_NONE);

6322

if (input == NULL)

6323

return (NULL);

6324

stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

6325

if (stream == NULL) {

6326

xmlFreeParserInputBuffer(input);

6327

return (NULL);

6328

}

6329

inputPush(ctxt, stream);

6330

return (htmlDoRead(ctxt, URL, encoding, options, 1));

6331

}

6332

6333

#define bottom_HTMLparser

6334

#include "elfgcchack.h"

6335

#endif /* LIBXML_HTML_ENABLED */

Older »