1
# $Id: PurePerl.pm,v 1.15 2002/02/05 17:33:52 matt Exp $
3
package XML::SAX::PurePerl;
10
use XML::SAX::PurePerl::Productions qw($Any $CharMinusDash $SingleChar);
11
use XML::SAX::PurePerl::Reader;
12
use XML::SAX::PurePerl::EncodingDetect ();
13
use XML::SAX::Exception;
14
use XML::SAX::PurePerl::DocType ();
15
use XML::SAX::PurePerl::DTDDecls ();
16
use XML::SAX::PurePerl::XMLDecl ();
17
use XML::SAX::DocumentLocator ();
18
use XML::SAX::Base ();
19
use XML::SAX qw(Namespaces);
20
use XML::NamespaceSupport ();
24
require XML::SAX::PurePerl::NoUnicodeExt;
27
require XML::SAX::PurePerl::UnicodeExt;
31
@ISA = ('XML::SAX::Base');
41
my $xmlns_ns = "http://www.w3.org/2000/xmlns/";
42
my $xml_ns = "http://www.w3.org/XML/1998/namespace";
45
sub _parse_characterstream {
48
confess("CharacterStream is not yet correctly implemented");
49
my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
50
return $self->_parse($reader);
53
sub _parse_bytestream {
56
my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
57
return $self->_parse($reader);
63
my $reader = XML::SAX::PurePerl::Reader::String->new($str);
64
return $self->_parse($reader);
70
my $reader = XML::SAX::PurePerl::Reader::URI->new($uri);
71
return $self->_parse($reader);
75
my ($self, $reader) = @_;
77
$reader->public_id($self->{ParseOptions}{Source}{PublicId});
78
$reader->system_id($self->{ParseOptions}{Source}{SystemId});
81
$self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1});
83
$self->set_document_locator(
84
XML::SAX::DocumentLocator->new(
85
sub { $reader->public_id },
86
sub { $reader->system_id },
87
sub { $reader->line },
88
sub { $reader->column },
92
$self->start_document({});
94
if (defined $self->{ParseOptions}{Source}{Encoding}) {
95
$reader->set_encoding($self->{ParseOptions}{Source}{Encoding});
98
$self->encoding_detect($reader);
102
$self->document($reader);
104
return $self->end_document({});
109
my ($error, $reader) = @_;
111
# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n");
112
my $exception = XML::SAX::Exception::Parse->new(
114
ColumnNumber => $reader->column,
115
LineNumber => $reader->line,
116
PublicId => $reader->public_id,
117
SystemId => $reader->system_id,
120
$self->fatal_error($exception);
125
my ($self, $reader) = @_;
127
# document ::= prolog element Misc*
129
$self->prolog($reader);
130
$self->element($reader) ||
131
$self->parser_error("Document requires an element", $reader);
133
while(!$reader->eof) {
134
$self->Misc($reader) ||
135
$self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader);
140
my ($self, $reader) = @_;
142
$self->XMLDecl($reader);
144
# consume all misc bits
145
1 while($self->Misc($reader));
147
if ($self->doctypedecl($reader)) {
148
while (!$reader->eof) {
149
$self->Misc($reader) || last;
155
my ($self, $reader) = @_;
157
if ($reader->match_char('<')) {
158
my $name = $self->Name($reader) ||
159
$self->parser_error("Invalid element name", $reader);
163
while( my ($k, $v) = $self->Attribute($reader) ) {
167
$self->skip_whitespace($reader);
170
unless ($reader->match_sequence('/', '>')) {
171
$reader->match_char('>') ||
172
$self->parser_error("No close element tag", $reader);
174
# only push onto _el_stack if not an empty element
175
push @{$self->{_el_stack}}, $name;
179
# Namespace processing
180
$self->{NSHelper}->push_context;
182
# my %attrs = @attribs;
183
# while (my ($k,$v) = each %attrs) {
184
if ($self->get_feature(Namespaces)) {
185
while ( my ($k, $v) = each %attribs ) {
186
if ($k =~ m/^xmlns(:(.*))?$/) {
187
my $prefix = $2 || '';
188
$self->{NSHelper}->declare_prefix($prefix, $v);
195
$self->SUPER::start_prefix_mapping($ns);
200
# Create element object and fire event
202
while (my ($name, $value) = each %attribs ) {
203
# TODO normalise value here
204
my ($ns, $prefix, $lname);
205
if ($self->get_feature(Namespaces)) {
206
($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name);
208
$ns ||= ''; $prefix ||= ''; $lname ||= '';
209
$attrib_hash{"{$ns}$lname"} = {
218
%attribs = (); # lose the memory since we recurse deep
220
my ($ns, $prefix, $lname);
221
if ($self->get_feature(Namespaces)) {
222
($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name);
224
$ns ||= ''; $prefix ||= ''; $lname ||= '';
232
Attributes => \%attrib_hash,
234
$self->start_element($el);
239
$self->content($reader);
241
$reader->match_sequence('<', '/') || $self->parser_error("No close tag marker", $reader);
242
my $end_name = $self->Name($reader);
243
$end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader);
244
$self->skip_whitespace($reader);
245
$reader->match_char('>') || $self->parser_error("No close '>' on end tag", $reader);
249
delete $end_el{Attributes};
250
$self->end_element(\%end_el);
252
for my $ns (@new_ns) {
253
$self->end_prefix_mapping($ns);
255
$self->{NSHelper}->pop_context;
264
my ($self, $reader) = @_;
266
$self->CharData($reader);
269
if ($reader->match_sequence('<', '/')) {
270
$reader->buffer('</');
273
elsif ( $self->Reference($reader) ||
274
$self->CDSect($reader) ||
275
$self->PI($reader) ||
276
$self->Comment($reader) ||
277
$self->element($reader)
280
$self->CharData($reader);
292
my ($self, $reader) = @_;
294
if ($reader->match_sequence('<', '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
295
$self->start_cdata({});
299
$self->parser_error("EOF looking for CDATA section end", $reader);
301
$reader->consume_not(']');
302
$chars .= $reader->consumed;
303
if ($reader->match_char(']')) {
304
if ($reader->match_sequence(']', '>')) {
305
# end of CDATA section
307
$self->characters({Data => $chars});
313
$self->end_cdata({});
321
my ($self, $reader) = @_;
325
$reader->consume_not('<', '&', ']');
326
$chars .= $reader->consumed;
327
if ($reader->match_char(']')) {
328
if ($reader->match_sequence(']', '>')) {
329
$self->parser_error("String ']]>' not allowed in character data", $reader);
339
$self->characters({ Data => $chars }) if length($chars);
343
my ($self, $reader) = @_;
344
if ($self->Comment($reader)) {
347
elsif ($self->PI($reader)) {
350
elsif ($self->skip_whitespace($reader)) {
358
my ($self, $reader) = @_;
360
if (!$reader->match_char('&')) {
364
if ($reader->match_char('#')) {
368
if ($reader->match_char('x')) {
369
$reader->consume(qr/[0-9a-fA-F]/) ||
370
$self->parser_error("Hex character reference contains illegal characters", $reader);
371
$ref = $reader->consumed;
372
$char = chr_ref(hex($ref));
376
$reader->consume(qr/[0-9]/) ||
377
$self->parser_error("Decimal character reference contains illegal characters", $reader);
378
$ref = $reader->consumed;
379
$char = chr_ref($ref);
381
$reader->match_char(';') ||
382
$self->parser_error("No semi-colon found after character reference", $reader);
383
if ($char !~ $SingleChar) { # match a single character
384
$self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader);
386
$self->characters({ Data => $char });
391
my $name = $self->Name($reader);
392
$reader->match_char(';') ||
393
$self->parser_error("No semi-colon found after entity name", $reader);
396
if ($self->_is_entity($name)) {
398
if ($self->_is_external($name)) {
399
my $value = $self->_get_entity($name);
400
my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value);
401
$self->encoding_detect($ent_reader);
402
$self->extParsedEnt($ent_reader);
405
my $value = $self->_stringify_entity($name);
406
my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value);
407
$self->content($ent_reader);
411
elsif (_is_internal($name)) {
412
$self->characters({ Data => $int_ents{$name} });
416
$self->parser_error("Undeclared entity", $reader);
422
# a reference in an attribute value.
423
my ($self, $reader) = @_;
425
if ($reader->match_char('#')) {
429
if ($reader->match_char('x')) {
430
$reader->consume(qr/[0-9a-fA-F]/) ||
431
$self->parser_error("Hex character reference contains illegal characters", $reader);
432
$ref = $reader->consumed;
433
$char = chr_ref(hex($ref));
437
$reader->consume(qr/[0-9]/) ||
438
$self->parser_error("Decimal character reference contains illegal characters", $reader);
439
$ref = $reader->consumed;
440
$char = chr_ref($ref);
442
$reader->match_char(';') ||
443
$self->parser_error("No semi-colon found after character reference", $reader);
444
if ($char !~ $SingleChar) { # match a single character
445
$self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader);
451
my $name = $self->Name($reader);
452
$reader->match_char(';') ||
453
$self->parser_error("No semi-colon found after entity name", $reader);
456
if ($self->_is_entity($name)) {
457
if ($self->_is_external($name)) {
458
$self->parser_error("No external entity references allowed in attribute values", $reader);
461
my $value = $self->_stringify_entity($name);
465
elsif (_is_internal($name)) {
466
return $int_ents{$name};
469
$self->parser_error("Undeclared entity '$name'", $reader);
476
my ($self, $reader) = @_;
478
$self->TextDecl($reader);
479
$self->content($reader);
484
return 1 if $e eq 'amp' || $e eq 'lt' || $e eq 'gt' || $e eq 'quot' || $e eq 'apos';
489
my ($self, $name) = @_;
490
# TODO: Fix this to use $reader to store the entities perhaps.
491
if ($self->{ParseOptions}{external_entities}{$name}) {
498
my ($self, $name) = @_;
500
if (exists $self->{ParseOptions}{entities}{$name}) {
506
sub _stringify_entity {
507
my ($self, $name) = @_;
509
if (exists $self->{ParseOptions}{expanded_entity}{$name}) {
510
return $self->{ParseOptions}{expanded_entity}{$name};
513
my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name});
514
$reader->consume(qr/./);
515
return $self->{ParseOptions}{expanded_entity}{$name} = $reader->consumed;
519
my ($self, $name) = @_;
521
return $self->{ParseOptions}{entities}{$name};
524
sub skip_whitespace {
525
my ($self, $reader) = @_;
529
if ($reader->match_char("\x20") ||
530
$reader->match_char("\x0A") ||
531
$reader->match_char("\x0D") ||
532
$reader->match_char("\x09"))
544
my ($self, $reader) = @_;
546
$self->skip_whitespace($reader) || return;
547
if ($reader->match_sequence('/', '>')) {
548
$reader->buffer("/>");
551
if ($reader->match_char(">")) {
552
$reader->buffer(">");
555
if (my $name = $self->Name($reader)) {
556
$self->skip_whitespace($reader);
557
$reader->match_char('=') ||
558
$self->parser_error("No '=' in Attribute", $reader);
559
$self->skip_whitespace($reader);
560
my $value = $self->AttValue($reader);
562
if (!$self->cdata_attrib($name)) {
563
$value =~ s/^\x20*//; # discard leading spaces
564
$value =~ s/\x20*$//; # discard trailing spaces
565
$value =~ s/ {1,}/ /g; # all >1 space to single space
568
return $name, $value;
575
# TODO implement this!
580
my ($self, $reader) = @_;
583
if (!$reader->match_char($quote)) {
585
$reader->match_char($quote) ||
586
$self->parser_error("Not a quote character", $reader);
592
if ($reader->consume_not('<', '&', $quote)) {
593
my $to_append = $reader->consumed;
594
$to_append =~ s/[\x09\x0A\x0D]/\x20/g; # Attrib value normalize
595
$value .= $to_append;
597
elsif ($reader->match_char('&')) {
598
$value .= $self->AttReference($reader);
600
elsif ($reader->match_char($quote)) {
605
$self->parser_error("Invalid character in attribute value", $reader);
613
my ($self, $reader) = @_;
615
if ($reader->match_sequence('<', '!', '-', '-')) {
616
my $comment_str = '';
618
if ($reader->match_char('-')) {
619
if ($reader->match_char('-')) {
620
$reader->match_char('>') ||
621
$self->parser_error("Invalid string in comment field", $reader);
625
$reader->consume($CharMinusDash) ||
626
$self->parser_error("Invalid string in comment field", $reader);
627
$comment_str .= $reader->consumed;
629
elsif ($reader->consume($CharMinusDash)) {
630
$comment_str .= $reader->consumed;
633
$self->parser_error("Invalid string in comment field", $reader);
637
$self->comment({ Data => $comment_str });
645
my ($self, $reader) = @_;
646
if ($reader->match_sequence('<', '?')) {
648
$target = $self->Name($reader) ||
649
$self->parser_error("PI has no target", $reader);
650
if ($self->skip_whitespace($reader)) {
652
if ($reader->match_sequence('?', '>')) {
655
elsif ($reader->match_re($Any)) {
656
$data .= $reader->matched;
664
$reader->match_sequence('?', '>') ||
665
$self->parser_error("PI closing sequence not found", $reader);
667
$self->processing_instruction({ Target => $target, Data => $data });
675
my ($self, $reader) = @_;
677
return $reader->consume_name();
681
my ($self, $reader) = @_;
684
if (!$reader->match_char($quote)) {
686
$reader->match_char($quote) ||
687
$self->parser_error("Invalid quote token", $reader);
697
XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface
701
use XML::Handler::Foo;
702
use XML::SAX::PurePerl;
703
my $handler = XML::Handler::Foo->new();
704
my $parser = XML::SAX::PurePerl->new(Handler => $handler);
705
$parser->parse_uri("myfile.xml");
709
This module implements an XML parser in pure perl. It is written around the
710
upcoming perl 5.8's unicode support and support for multiple document
711
encodings (using the PerlIO layer), however it has been ported to work with
712
ASCII/UTF8 documents under lower perl versions.
714
The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in
715
the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a
716
better location soon.
718
Please refer to the SAX2 documentation for how to use this module - it is merely a
719
front end to SAX2, and implements nothing that is not in that spec (or at least tries
720
not to - please email me if you find errors in this implementation).
724
XML::SAX::PurePerl is B<slow>. Very slow. I suggest you use something else
725
in fact. However it is great as a fallback parser for XML::SAX, where the
726
user might not be able to install an XS based parser or C library.
728
Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations,
729
though the code is in place to start doing this. Also parsing parameter entity
730
references is causing me much confusion, since it's not exactly what I would call
731
trivial, or well documented in the XML grammar. XML documents with internal subsets
734
I am however trying to work towards full conformance using the Oasis test suite.
738
Matt Sergeant, matt@sergeant.org. Copyright 2001.
740
Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com.
744
This is free software. You may use it or redistribute it under the same terms as