6
use vars qw( @ISA @EXPORT @EXPORT_OK $VERSION );
10
@ISA = qw(Exporter DynaLoader);
12
use vars qw($VERSION *AUTOLOAD);
14
*AUTOLOAD = \&XML::Bare::AUTOLOAD;
15
bootstrap XML::Bare $VERSION;
18
@EXPORT_OK = qw( xget merge clean add_node del_node find_node del_node forcearray del_by_perl xmlin xval );
22
XML::Bare - Minimal XML parser implemented via a C state engine
35
if( $self->{ 'text' } ) {
36
if( $self->{'unsafe'} ) {
37
$self->{'parser'} = XML::Bare::c_parse_unsafely( $self->{'text'} );
40
$self->{'parser'} = XML::Bare::c_parse( $self->{'text'} );
44
my $res = open( my $XML, $self->{ 'file' } );
51
$self->{'text'} = <$XML>;
54
$self->{'parser'} = XML::Bare::c_parse( $self->{'text'} );
56
bless $self, "XML::Bare::Object";
57
return $self if( !wantarray );
58
return ( $self, ( $self->{'simple'} ? $self->simple() : $self->parse() ) );
62
return new( @_, simple => 1 );
65
package XML::Bare::Object;
70
# Stubs ( to allow these functions to be used via an object as well, not just via import or namespace )
71
sub find_by_perl { shift; return XML::Bare::find_by_perl( @_ ); }
72
sub find_node { shift; return XML::Bare::find_node( @_ ); }
77
#print Dumper( $self );
78
undef $self->{'text'};
81
undef $self->{'parser'};
87
my $i = $self->{'i'}++;
89
$self->{"text$i"} = $p{'text'};
90
XML::Bare::c_parse_more( $self->{"text$i"}, $self->{'parser'} );
97
my $res = XML::Bare::xml2obj( $self->{'parser'} );
99
if( defined( $self->{'scheme'} ) ) {
100
$self->{'xbs'} = new XML::Bare( %{ $self->{'scheme'} } );
102
if( defined( $self->{'xbs'} ) ) {
103
my $xbs = $self->{'xbs'};
104
my $ob = $xbs->parse();
105
$self->{'xbso'} = $ob;
109
if( !ref( $res ) && $res < 0 ) { croak "Error at ".$self->lineinfo( -$res ); }
110
$self->{ 'xml' } = $res;
112
if( defined( $self->{'xbso'} ) ) {
113
my $ob = $self->{'xbso'};
114
my $cres = $self->check( $res, $ob );
115
croak( $cres ) if( $cres );
118
return $self->{ 'xml' };
123
my ( $self, $node, $scheme, $parent ) = @_;
126
if( ref( $scheme ) eq 'ARRAY' ) {
127
for my $one ( @$scheme ) {
128
my $res = $self->checkone( $node, $one, $parent );
129
return 0 if( !$res );
133
else { return $self->checkone( $node, $scheme, $parent ); }
138
my ( $self, $node, $scheme, $parent ) = @_;
140
for my $key ( keys %$node ) {
141
next if( substr( $key, 0, 1 ) eq '_' || $key eq '_att' || $key eq 'comment' );
142
if( $key eq 'value' ) {
143
my $val = $node->{ 'value' };
144
my $regexp = $scheme->{'value'};
146
if( $val !~ m/^($regexp)$/ ) {
147
my $linfo = $self->lineinfo( $node->{'_i'} );
148
return "Value of '$parent' node ($val) does not match /$regexp/ [$linfo]";
153
my $sub = $node->{ $key };
154
my $ssub = $scheme->{ $key };
155
if( !$ssub ) { #&& ref( $schemesub ) ne 'HASH'
156
my $linfo = $self->lineinfo( $sub->{'_i'} );
157
return "Invalid node '$key' in xml [$linfo]";
159
if( ref( $sub ) eq 'HASH' ) {
160
my $res = $self->check( $sub, $ssub, $key );
161
return $res if( $res );
163
if( ref( $sub ) eq 'ARRAY' ) {
165
if( ref( $asub ) eq 'ARRAY' ) {
168
if( $asub->{'_t'} ) {
169
my $max = $asub->{'_max'} || 0;
170
if( $#$sub >= $max ) {
171
my $linfo = $self->lineinfo( $sub->[0]->{'_i'} );
172
return "Too many nodes of type '$key'; max $max; [$linfo]"
174
my $min = $asub->{'_min'} || 0;
175
if( ($#$sub+1)<$min ) {
176
my $linfo = $self->lineinfo( $sub->[0]->{'_i'} );
177
return "Not enough nodes of type '$key'; min $min [$linfo]"
181
my $res = $self->check( $_, $ssub, $key );
182
return $res if( $res );
186
if( my $dem = $scheme->{'_demand'} ) {
187
for my $req ( @{$scheme->{'_demand'}} ) {
188
my $ck = $node->{ $req };
190
my $linfo = $self->lineinfo( $node->{'_i'} );
191
return "Required node '$req' does not exist [$linfo]"
193
if( ref( $ck ) eq 'ARRAY' ) {
194
my $linfo = $self->lineinfo( $node->{'_i'} );
195
return "Required node '$req' is empty array [$linfo]" if( $#$ck == -1 );
205
my $res = XML::Bare::xml2obj_simple( $self->{'parser'} );#$self->xml2obj();
207
if( !ref( $res ) && $res < 0 ) { croak "Error at ".$self->lineinfo( -$res ); }
208
$self->{ 'xml' } = $res;
214
my ( $self, $node, $name ) = @_;
217
$node->{ 'multi_'.$name } = \%blank if( ! $node->{ 'multi_'.$name } );
218
$node->{ $name } = \@newar if( ! $node->{ $name } );
219
my $newnode = new_node( 0, splice( @_, 3 ) );
220
push( @{ $node->{ $name } }, $newnode );
225
my ( $self, $node, $prev, $name ) = @_;
228
$node->{ 'multi_'.$name } = \%blank if( ! $node->{ 'multi_'.$name } );
229
$node->{ $name } = \@newar if( ! $node->{ $name } );
230
my $newnode = $self->new_node( splice( @_, 4 ) );
233
for my $anode ( @{ $node->{ $name } } ) {
234
$anode->{'_pos'} = $cur if( !$anode->{'_pos'} );
237
my $opos = $prev->{'_pos'};
238
for my $anode ( @{ $node->{ $name } } ) {
239
$anode->{'_pos'}++ if( $anode->{'_pos'} > $opos );
241
$newnode->{'_pos'} = $opos + 1;
243
push( @{ $node->{ $name } }, $newnode );
253
$node = $node->{ $name };
255
for( my $i = 0; $i <= $#$node; $i++ ) {
256
my $one = $node->[ $i ];
257
foreach my $key ( keys %match ) {
258
my $val = $match{ $key };
259
if( $one->{ $key }->{'value'} eq $val ) {
260
delete $node->[ $i ];
266
# Created a node of XML hash with the passed in variables already set
272
foreach( keys %parts ) {
273
my $val = $parts{$_};
274
if( m/^_/ || ref( $val ) eq 'HASH' ) {
275
$newnode{ $_ } = $val;
278
$newnode{ $_ } = { value => $val };
286
my $node = CORE::shift;
287
my $ref = ref( $node );
288
if( $ref eq 'ARRAY' ) {
290
for my $sub ( @$node ) {
291
CORE::push( @ret, simplify( $sub ) );
295
if( $ref eq 'HASH' ) {
298
for my $key ( keys %$node ) {
299
next if( $key eq 'comment' || $key eq 'value' || $key =~ m/^_/ );
301
$ret{ $key } = simplify( $node->{ $key } );
304
return $node->{'value'};
312
my ( $node, $name ) = @_;
313
my $ref = ref( $node );
314
return '' if( $name && $name =~ m/^\_/ );
315
my $txt = $name ? "<$name>" : '';
316
if( $ref eq 'ARRAY' ) {
318
for my $sub ( @$node ) {
319
$txt .= hash2xml( $sub, $name );
323
elsif( $ref eq 'HASH' ) {
324
for my $key ( keys %$node ) {
325
$txt .= hash2xml( $node->{ $key }, $key );
330
if( $node =~ /[<]/ ) { $txt .= '<![CDATA[' . $node . ']]>'; }
331
else { $txt .= $node; }
340
# Save an XML hash tree into a file
343
return if( ! $self->{ 'xml' } );
345
my $xml = $self->xml( $self->{'xml'} );
350
$len = length( $xml );
354
# This is intentionally just :utf8 and not :encoding(UTF-8)
355
# :encoding(UTF-8) checks the data for actually being valid UTF-8, and doing so would slow down the file write
356
# See http://perldoc.perl.org/functions/binmode.html
361
# Note on the following conditional OS check... WTF? This is total bullshit.
362
if( $os eq 'MSWin32' ) {
363
open( $F, '>:utf8', $self->{ 'file' } );
367
open( $F, '>', $self->{ 'file' } );
373
my $cursize = tell( $F );
374
if( $cursize != $len ) { # concurrency; we are writing a smaller file
375
warn "Truncating File $self->{'file'}";
376
`cp $self->{'file'} $self->{'file'}.bad`;
380
$cursize = tell( $F );
381
if( $cursize != $len ) { # still not the right size even after truncate??
382
die "Write problem; $cursize != $len";
388
my ( $self, $obj, $name ) = @_;
392
return XML::Bare::obj2xml( \%hash, '', 0 );
396
return XML::Bare::obj2xml( \%hash, '', 0 );
400
my ( $self, $obj, $name ) = @_;
402
if( $self->{'style'} ) {
403
$pre = "<style type='text/css'>\@import '$self->{'style'}';</style>";
408
return $pre.obj2html( \%hash, '', 0 );
412
return $pre.obj2html( \%hash, '', 0 );
420
for( my $i=0;$i<$res;$i++ ) {
421
my $let = substr( $self->{'text'}, $i, 1 );
422
if( ord($let) == 10 ) {
427
my $part = substr( $self->{'text'}, $res, 10 );
430
if( $self->{'offset'} ) {
431
my $off = $self->{'offset'};
433
return "$off line $line char $res \"$part\"";
435
return "line $line char $res \"$part\"";
438
sub free_tree { my $self = shift; XML::Bare::free_tree_c( $self->{'parser'} ); }
446
return 0 if( ! defined $node );
447
$node = $node->{ $name } or return 0;
448
$node = [ $node ] if( ref( $node ) eq 'HASH' );
449
if( ref( $node ) eq 'ARRAY' ) {
450
for( my $i = 0; $i <= $#$node; $i++ ) {
451
my $one = $node->[ $i ];
452
for my $key ( keys %match ) {
453
my $val = $match{ $key };
454
croak('undefined value in find') unless defined $val;
455
if( $one->{ $key }{'value'} eq $val ) {
456
return $node->[ $i ];
466
return map $_->{'value'}, @{$hash}{@_};
471
return [] if( !$ref );
472
return $ref if( ref( $ref ) eq 'ARRAY' );
477
# shift in the two array references as well as the field to merge on
478
my ( $a, $b, $id ) = @_;
479
my %hash = map { $_->{ $id } ? ( $_->{ $id }->{ 'value' } => $_ ) : ( 0 => 0 ) } @$a;
480
for my $one ( @$b ) {
481
next if( !$one->{ $id } );
482
my $short = $hash{ $one->{ $id }->{ 'value' } };
484
foreach my $key ( keys %$one ) {
485
next if( $key eq '_pos' || $key eq 'id' );
486
my $cur = $short->{ $key };
487
my $add = $one->{ $key };
488
if( !$cur ) { $short->{ $key } = $add; }
490
my $type = ref( $cur );
491
if( $type eq 'HASH' ) {
493
$short->{ $key } = \@arr;
496
if( ref( $add ) eq 'HASH' ) {
497
push( @{$short->{ $key }}, $add );
499
else { # we are merging an array
500
push( @{$short->{ $key }}, @$add );
503
# we need to deal with the case where this node
504
# is already there, either alone or as an array
511
my $ob = new XML::Bare( @_ );
512
my $root = $ob->parse();
513
if( $ob->{'save'} ) {
514
$ob->{'file'} = $ob->{'save'} if( "$ob->{'save'}" ne "1" );
518
return $ob->xml( $root );
524
my $ob = new XML::Bare( text => $text );
525
my $simple = $ob->simple();
526
if( !$ops{'keeproot'} ) {
527
my @keys = keys %$simple;
528
my $first = $keys[0];
529
$simple = $simple->{ $first } if( $first );
536
my $ob = new XML::Bare( %ops );
537
return $ob->html( $ob->parse(), $ops{'root'} || 'xml' );
540
sub readxbs { # xbs = xml bare schema
543
for my $key ( keys %$node ) {
544
next if( substr( $key, 0, 1 ) eq '_' || $key eq '_att' || $key eq 'comment' );
545
if( $key eq 'value' ) {
546
my $val = $node->{'value'};
547
delete $node->{'value'} if( $val =~ m/^\W*$/ );
550
my $sub = $node->{ $key };
552
if( $key =~ m/([a-z_]+)([^a-z_]+)/ ) {
570
$name = 'multi_'.$name;
574
elsif( $t =~ m/\{([0-9]+),([0-9]+)\}/ ) {
580
if( ref( $sub ) eq 'HASH' ) {
581
my $res = readxbs( $sub );
583
$sub->{'_min'} = $min;
584
$sub->{'_max'} = $max;
586
if( ref( $sub ) eq 'ARRAY' ) {
587
for my $item ( @$sub ) {
588
my $res = readxbs( $item );
590
$item->{'_min'} = $min;
591
$item->{'_max'} = $max;
595
push( @demand, $name ) if( $min );
596
$node->{$name} = $node->{$key};
597
delete $node->{$key};
600
if( ref( $sub ) eq 'HASH' ) {
606
if( ref( $sub ) eq 'ARRAY' ) {
607
for my $item ( @$sub ) {
615
push( @demand, $key );
618
if( @demand ) { $node->{'_demand'} = \@demand; }
626
if( ref( $arr ) eq 'ARRAY' ) {
627
$cond =~ s/-([a-z_]+)/\$ob->\{'$1'\}->\{'value'\}/gi;
628
foreach my $ob ( @$arr ) { push( @res, $ob ) if( eval( $cond ) ); }
631
$cond =~ s/-([a-z_]+)/\$arr->\{'$1'\}->\{'value'\}/gi;
632
push( @res, $arr ) if( eval( $cond ) );
640
$cond =~ s/-value/\$ob->\{'value'\}/g;
641
$cond =~ s/-([a-z]+)/\$ob->\{'$1'\}->\{'value'\}/g;
643
for( my $i = 0; $i <= $#$arr; $i++ ) {
644
my $ob = $arr->[ $i ];
645
delete $arr->[ $i ] if( eval( $cond ) );
650
sub newhash { shift; return { value => shift }; }
653
return $_[0] ? $_[0]->{'value'} : ( $_[1] || '' );
657
my ( $objs, $name, $pad, $level, $pdex ) = @_;
658
$level = 0 if( !$level );
659
$pad = '' if( $level <= 2 );
663
return '' if( !$objs );
664
#return $objs->{'_raw'} if( $objs->{'_raw'} );
666
my $oba = $objs->{ $a };
667
my $obb = $objs->{ $b };
670
$oba = $oba->[0] if( ref( $oba ) eq 'ARRAY' );
671
$obb = $obb->[0] if( ref( $obb ) eq 'ARRAY' );
672
if( ref( $oba ) eq 'HASH' ) { $posa = $oba->{'_pos'} || 0; }
673
if( ref( $obb ) eq 'HASH' ) { $posb = $obb->{'_pos'} || 0; }
674
return $posa <=> $posb;
677
my $obj = $objs->{ $i } || '';
678
my $type = ref( $obj );
679
if( $type eq 'ARRAY' ) {
683
if( !$a ) { return 0; }
684
if( !$b ) { return 0; }
685
if( ref( $a ) eq 'HASH' && ref( $b ) eq 'HASH' ) {
686
my $posa = $a->{'_pos'};
687
my $posb = $b->{'_pos'};
688
if( !$posa ) { $posa = 0; }
689
if( !$posb ) { $posb = 0; }
690
return $posa <=> $posb;
695
for my $j ( @dex2 ) {
696
$xml .= obj2xml( $j, $i, $pad.' ', $level+1, $#dex );
699
elsif( $type eq 'HASH' && $i !~ /^_/ ) {
700
if( $obj->{ '_att' } ) {
701
$att .= ' ' . $i . '="' . $obj->{ 'value' } . '"' if( $i !~ /^_/ );;
705
$xml .= obj2xml( $obj , $i, $pad.' ', $level+1, $#dex );
709
if( $i eq 'comment' ) { $xml .= '<!--' . $obj . '-->' . "\n"; }
710
elsif( $i eq 'value' ) {
711
if( $level > 1 ) { # $#dex < 4 &&
712
if( $obj && $obj =~ /[<>&;]/ ) { $xml .= '<![CDATA[' . $obj . ']]>'; }
713
else { $xml .= $obj if( $obj =~ /\S/ ); }
716
elsif( $i =~ /^_/ ) {}
717
else { $xml .= '<' . $i . '>' . $obj . '</' . $i . '>'; }
720
my $pad2 = $imm ? '' : $pad;
721
my $cr = $imm ? '' : "\n";
722
if( substr( $name, 0, 1 ) ne '_' ) {
725
$xml = $pad . '<' . $name . $att . '>' . $cr . $xml . $pad2 . '</' . $name . '>';
728
$xml = $pad . '<' . $name . $att . ' />';
731
return $xml."\n" if( $level > 1 );
738
my ( $objs, $name, $pad, $level, $pdex ) = @_;
740
my $less = "<span class='ang'><</span>";
741
my $more = "<span class='ang'>></span>";
742
my $tn0 = "<span class='tname'>";
744
my $eq0 = "<span class='eq'>";
746
my $qo0 = "<span class='qo'>";
748
my $sp0 = "<span class='sp'>";
753
$level = 0 if( !$level );
754
$pad = '' if( $level == 1 );
758
return '' if( !$objs );
760
my $oba = $objs->{ $a };
761
my $obb = $objs->{ $b };
764
$oba = $oba->[0] if( ref( $oba ) eq 'ARRAY' );
765
$obb = $obb->[0] if( ref( $obb ) eq 'ARRAY' );
766
if( ref( $oba ) eq 'HASH' ) { $posa = $oba->{'_pos'} || 0; }
767
if( ref( $obb ) eq 'HASH' ) { $posb = $obb->{'_pos'} || 0; }
768
return $posa <=> $posb;
771
if( $objs->{'_cdata'} ) {
772
my $val = $objs->{'value'};
773
$val =~ s/^(\s*\n)+//;
777
$objs->{'value'} = $val;
778
#$xml = "$less![CDATA[<div class='node'><div class='cdata'>$val</div></div>]]$more";
779
$cd0 = "$less![CDATA[<div class='node'><div class='cdata'>";
780
$cd1 = "</div></div>]]$more";
783
my $obj = $objs->{ $i } || '';
784
my $type = ref( $obj );
785
if( $type eq 'ARRAY' ) {
789
if( !$a ) { return 0; }
790
if( !$b ) { return 0; }
791
if( ref( $a ) eq 'HASH' && ref( $b ) eq 'HASH' ) {
792
my $posa = $a->{'_pos'};
793
my $posb = $b->{'_pos'};
794
if( !$posa ) { $posa = 0; }
795
if( !$posb ) { $posb = 0; }
796
return $posa <=> $posb;
801
for my $j ( @dex2 ) { $xml .= obj2html( $j, $i, $pad.' ', $level+1, $#dex ); }
803
elsif( $type eq 'HASH' && $i !~ /^_/ ) {
804
if( $obj->{ '_att' } ) {
805
my $val = $obj->{ 'value' };
808
$att .= " <span class='aname'>$i</span>" if( $i !~ /^_/ );
811
$att .= " <span class='aname'>$i</span>$eq0=$eq1$qo0\"$qo1$val$qo0\"$qo1" if( $i !~ /^_/ );
816
$xml .= obj2html( $obj , $i, $pad.' ', $level+1, $#dex );
820
if( $i eq 'comment' ) { $xml .= "$less!--" . $obj . "--$more" . "<br>\n"; }
821
elsif( $i eq 'value' ) {
823
if( $obj && $obj =~ /[<>&;]/ && ! $objs->{'_cdata'} ) { $xml .= "$less![CDATA[$obj]]$more"; }
824
else { $xml .= $obj if( $obj =~ /\S/ ); }
827
elsif( $i =~ /^_/ ) {}
828
else { $xml .= "$less$tn0$i$tn1$more$obj$less/$tn0$i$tn1$more"; }
831
my $pad2 = $imm ? '' : $pad;
832
if( substr( $name, 0, 1 ) ne '_' ) {
836
$xml = "$sp0$pad$sp1$less$tn0$name$tn1$att$more$cd0$xml$cd1$less/$tn0$name$tn1$more";
839
$xml = "$sp0$pad$sp1$less$tn0$name$tn1$att/$more";
844
$xml = "$sp0$pad$sp1$less$tn0$name$tn1$att$more<div class='node'>$xml</div>$sp0$pad$sp1$less/$tn0$name$tn1$more";
846
else { $xml = "$sp0$pad$sp1$less$tn0$name$tn1$att/$more"; }
849
$xml .= "<br>" if( $objs->{'_br'} );
850
if( $objs->{'_note'} ) {
852
my $note = $objs->{'_note'}{'value'};
853
my @notes = split( /\|/, $note );
855
$xml .= "<div class='note'>$sp0$pad$sp1<span class='com'><!--</span> $_ <span class='com'>--></span></div>";
858
return $xml."<br>\n" if( $level );
872
my $ob = new XML::Bare( text => '<xml><name>Bob</name></xml>' );
874
# Parse the xml into a hash tree
875
my $root = $ob->parse();
877
# Print the content of the name node
878
print $root->{xml}->{name}->{value};
882
# Load xml from a file ( assume same contents as first example )
883
my $ob2 = new XML::Bare( file => 'test.xml' );
885
my $root2 = $ob2->parse();
887
$root2->{xml}->{name}->{value} = 'Tim';
889
# Save the changes back to the file
894
# Load xml and verify against XBS ( XML Bare Schema )
895
my $xml_text = '<xml><item name=bob/></xml>''
896
my $schema_text = '<xml><item* name=[a-z]+></item*></xml>'
897
my $ob = new XML::Bare( text => $xml_text, schema => { text => $schema_text } );
898
$ob->parse(); # this will error out if schema is invalid
902
This module is a 'Bare' XML parser. It is implemented in C. The parser
903
itself is a simple state engine that is less than 500 lines of C. The
904
parser builds a C struct tree from input text. That C struct tree is
905
converted to a Perl hash by a Perl function that makes basic calls back
906
to the C to go through the nodes sequentially.
908
The parser itself will only cease parsing if it encounters tags that
909
are not closed properly. All other inputs will parse, even invalid
910
inputs. To allowing checking for validity, a schema checker is included
911
in the module as well.
913
The schema format is custom and is meant to be as simple as possible.
914
It is based loosely around the way multiplicity is handled in Perl
919
To demonstrate what sort of XML is supported, consider the following
920
examples. Each of the PERL statements evaluates to true.
924
=item * Node containing just text
927
PERL: $root->{xml}->{value} eq "blah";
931
XML: <xml><name>Bob</name></xml>
932
PERL: $root->{xml}->{name}->{value} eq "Bob";
934
=item * Attributes unquoted
936
XML: <xml><a href=index.htm>Link</a></xml>
937
PERL: $root->{xml}->{a}->{href}->{value} eq "index.htm";
939
=item * Attributes quoted
941
XML: <xml><a href="index.htm">Link</a></xml>
942
PERL: $root->{xml}->{a}->{href}->{value} eq "index.htm";
946
XML: <xml><raw><![CDATA[some raw $~<!bad xml<>]]></raw></xml>
947
PERL: $root->{xml}->{raw}->{value} eq "some raw \$~<!bad xml<>";
949
=item * Multiple nodes; form array
951
XML: <xml><item>1</item><item>2</item></xml>
952
PERL: $root->{xml}->{item}->[0]->{value} eq "1";
954
=item * Forcing array creation
956
XML: <xml><multi_item/><item>1</item></xml>
957
PERL: $root->{xml}->{item}->[0]->{value} eq "1";
959
=item * One comment supported per node
961
XML: <xml><!--test--></xml>
962
PERL: $root->{xml}->{comment} eq 'test';
966
=head2 Schema Checking
968
Schema checking is done by providing the module with an XBS (XML::Bare Schema) to check
969
the XML against. If the XML checks as valid against the schema, parsing will continue as
970
normal. If the XML is invalid, the parse function will die, providing information about
973
The following information is provided in the error message:
977
=item * The type of error
979
=item * Where the error occured ( line and char )
981
=item * A short snippet of the XML at the point of failure
985
=head2 XBS ( XML::Bare Schema ) Format
989
=item * Required nodes
994
=item * Optional nodes - allow one
998
or XBS: <xml><item?/></xml>
1000
=item * Optional nodes - allow 0 or more
1002
XML: <xml><item/></xml>
1005
=item * Required nodes - allow 1 or more
1007
XML: <xml><item/><item/></xml>
1010
=item * Nodes - specified minimum and maximum number
1012
XML: <xml><item/><item/></xml>
1013
XBS: <xml item{1,2}/>
1014
or XBS: <xml><item{1,2}/></xml>
1015
or XBS: <xml><item{1,2}></item{1,2}></xml>
1017
=item * Multiple acceptable node formats
1019
XML: <xml><item type=box volume=20/><item type=line length=10/></xml>
1020
XBS: <xml><item type=box volume/><item type=line length/></xml>
1022
=item * Regular expressions checking for values
1024
XML: <xml name=Bob dir=up num=10/>
1025
XBS: <xml name=[A-Za-z]+ dir=up|down num=[0-9]+/>
1027
=item * Require multi_ tags
1029
XML: <xml><multi_item/></xml>
1034
=head2 Parsed Hash Structure
1036
The hash structure returned from XML parsing is created in a specific format.
1037
Besides as described above, the structure contains some additional nodes in
1038
order to preserve information that will allow that structure to be correctly
1039
converted back to XML.
1041
Nodes may contain the following 3 additional subnodes:
1047
The character offset within the original parsed XML of where the node
1048
begins. This is used to provide line information for errors when XML
1049
fails a schema check.
1053
This is a number indicating the ordering of nodes. It is used to allow
1054
items in a perl hash to be sorted when writing back to xml. Note that
1055
items are not sorted after parsing in order to save time if all you
1056
are doing is reading and you do not care about the order.
1058
In future versions of this module an option will be added to allow
1059
you to sort your nodes so that you can read them in order.
1060
( note that multiple nodes of the same name are stored in order )
1064
This is a boolean value that exists and is 1 iff the node is an
1069
=head2 Parsing Limitations / Features
1073
=item * CDATA parsed correctly, but stripped if unneeded
1075
Currently the contents of a node that are CDATA are read and
1076
put into the value hash, but the hash structure does not have
1077
a value indicating the node contains CDATA.
1079
When converting back to XML, the contents of the value hash
1080
are parsed to check for xml incompatible data using a regular
1081
expression. If 'CDATA like' stuff is encountered, the node
1084
=item * Node position stored, but hash remains unsorted
1086
The ordering of nodes is noted using the '_pos' value, but
1087
the hash itself is not ordered after parsing. Currently
1088
items will be out of order when looking at them in the
1091
Note that when converted back to XML, the nodes are then
1092
sorted and output in the correct order to XML. Note that
1093
nodes of the same name with the same parent will be
1094
grouped together; the position of the first item to
1095
appear will determine the output position of the group.
1097
=item * Comments are parsed but only one is stored per node.
1099
For each node, there can be a comment within it, and that
1100
comment will be saved and output back when dumping to XML.
1102
=item * Comments override output of immediate value
1104
If a node contains only a comment node and a text value,
1105
only the comment node will be displayed. This is in line
1106
with treating a comment node as a node and only displaying
1107
immediate values when a node contains no subnodes.
1109
=item * PI sections are parsed, but discarded
1111
=item * Unknown C<< <! >> sections are parsed, but discarded
1113
=item * Attributes may use no quotes, single quotes, quotes, or backtics
1115
=item * Quoted attributes cannot contain escaped quotes
1117
No escape character is recognized within quotes. As a result,
1118
regular quotes cannot be stored to XML, or the written XML
1119
will not be correct, due to all attributes always being written
1122
=item * Attributes are always written back to XML with quotes
1124
=item * Nodes cannot contain subnodes as well as an immediate value
1126
Actually nodes can in fact contain a value as well, but that
1127
value will be discarded if you write back to XML. That value is
1128
equal to the first continuous string of text besides a subnode.
1130
<node>text<subnode/>text2</node>
1131
( the value of node is text )
1133
<node><subnode/>text</node>
1134
( the value of node is text )
1139
( the value of node is "\n " )
1141
=item * Entities are not parsed
1143
No entity parsing is done. This is intentional. Future versions of the module
1144
may include a feature to automatically parse entities, but by default any such
1145
feature will be disabled in order to keep from slowing down the parser.
1147
Also, this is done so that round trip ( read and then write back out ) behavior
1150
=item * Nodes named value
1152
Previously iterations of this module had problems with nodes named 'value',
1153
due to the fact that node contents are stored under the 'value' key already.
1154
The current version should parse such files without any problem, although it
1155
may be confusing to see a parsed tree with 'value' pointing to another hash
1156
containing 'value' as well.
1158
In a future version of the module it will be possible to alter the name that
1159
values are stored under.
1161
Note that node values are stored under the key 'content' when the "simple"
1162
parsing mode is used, so as to be consistent with XML::Simple.
1166
=head2 Module Functions
1170
=item * C<< $ob = XML::Bare->new( text => "[some xml]" ) >>
1172
Create a new XML object, with the given text as the xml source.
1174
=item * C<< $object = XML::Bare->new( file => "[filename]" ) >>
1176
Create a new XML object, with the given filename/path as the xml source
1178
=item * C<< $object = XML::Bare->new( text => "[some xml]", file => "[filename]" ) >>
1180
Create a new XML object, with the given text as the xml input, and the given
1181
filename/path as the potential output ( used by save() )
1183
=item * C<< $object = XML::Bare->new( file => "data.xml", scheme => { file => "scheme.xbs" } ) >>
1185
Create a new XML object and check to ensure it is valid xml by way of the XBS scheme.
1187
=item * C<< $tree = $object->parse() >>
1189
Parse the xml of the object and return a tree reference
1191
=item * C<< $tree = $object->simple() >>
1193
Alternate to the parse function which generates a tree similar to that
1194
generated by XML::Simple. Note that the sets of nodes are turned into
1195
arrays always, regardless of whether they have a 'name' attribute, unlike
1198
Note that currently the generated tree cannot be used with any of the
1199
functions in this module that operate upon trees. The function is provided
1200
purely as a quick and dirty way to read simple XML files.
1202
=item * C<< $tree = xmlin( $xmlext, keeproot => 1 ) >>
1204
The xmlin function is a shortcut to creating an XML::Bare object and
1205
parsing it using the simple function. It behaves similarly to the
1206
XML::Simple function by the same name. The keeproot option is optional
1207
and if left out the root node will be discarded, same as the function
1210
=item * C<< $text = $object->xml( [root] ) >>
1212
Take the hash tree in [root] and turn it into cleanly indented ( 2 spaces )
1215
=item * C<< $text = $object->html( [root], [root node name] ) >>
1217
Take the hash tree in [root] and turn it into nicely colorized and styled
1218
html. [root node name] is optional.
1220
=item * C<< $object->save() >>
1222
The the current tree in the object, cleanly indent it, and save it
1223
to the file parameter specified when creating the object.
1225
=item * C<< $value = xval $node, $default >>
1227
Returns the value of $node or $default if the node does not exist.
1228
If default is not passed to the function, then '' is returned as
1229
a default value when the node does not exist.
1231
=item * C<< ( $name, $age ) = xget( $personnode, qw/name age/ ) >>
1233
Shortcut function to grab a number of values from a node all at the
1234
same time. Note that this function assumes that all of the subnodes
1235
exist; it will fail if they do not.
1237
=item * C<< $text = XML::Bare::clean( text => "[some xml]" ) >>
1239
Shortcut to creating an xml object and immediately turning it into clean xml text.
1241
=item * C<< $text = XML::Bare::clean( file => "[filename]" ) >>
1243
Similar to previous.
1245
=item * C<< XML::Bare::clean( file => "[filename]", save => 1 ) >>
1247
Clean up the xml in the file, saving the results back to the file
1249
=item * C<< XML::Bare::clean( text => "[some xml]", save => "[filename]" ) >>
1251
Clean up the xml provided, and save it into the specified file.
1253
=item * C<< XML::Bare::clean( file => "[filename1]", save => "[filename2]" ) >>
1255
Clean up the xml in filename1 and save the results to filename2.
1257
=item * C<< $html = XML::Bare::tohtml( text => "[some xml]", root => 'xml' ) >>
1259
Shortcut to creating an xml object and immediately turning it into html.
1260
Root is optional, and specifies the name of the root node for the xml
1261
( which defaults to 'xml' )
1263
=item * C<< $object->add_node( [node], [nodeset name], name => value, name2 => value2, ... ) >>
1266
$object->add_node( $root->{xml}, 'item', name => 'Bob' );
1275
=item * C<< $object->add_node_after( [node], [subnode within node to add after], [nodeset name], ... ) >>
1277
=item * C<< $object->del_node( [node], [nodeset name], name => value ) >>
1291
$xml->del_node( $root->{xml}, 'a', b=>'1' );
1300
=item * C<< $object->find_node( [node], [nodeset name], name => value ) >>
1316
$object->find_node( $root->{xml}, 'ob', key => '1' )->{val}->{value} = 'test';
1330
=item * C<< $object->find_by_perl( [nodeset], "[perl code]" ) >>
1332
find_by_perl evaluates some perl code for each node in a set of nodes, and
1333
returns the nodes where the perl code evaluates as true. In order to
1334
easily reference node values, node values can be directly referred
1335
to from within the perl code by the name of the node with a dash(-) in
1336
front of the name. See the example below.
1338
Note that this function returns an array reference as opposed to a single
1339
node unlike the find_node function.
1355
$object->find_by_perl( $root->{xml}->{ob}, "-key eq '1'" )->[0]->{val}->{value} = 'test';
1369
=item * C<< XML::Bare::merge( [nodeset1], [nodeset2], [id node name] ) >>
1371
Merges the nodes from nodeset2 into nodeset1, matching the contents of
1372
each node based up the content in the id node.
1377
my $ob1 = new XML::Bare( text => "
1386
my $ob2 = new XML::Bare( text => "
1396
my $root1 = $ob1->parse();
1397
my $root2 = $ob2->parse();
1398
merge( $root1->{'xml'}->{'a'}, $root2->{'xml'}->{'a'}, 'id' );
1399
print $ob1->xml( $root1 );
1413
=item * C<< XML::Bare::del_by_perl( ... ) >>
1415
Works exactly like find_by_perl, but deletes whatever matches.
1417
=item * C<< XML::Bare::forcearray( [noderef] ) >>
1419
Turns the node reference into an array reference, whether that
1420
node is just a single node, or is already an array reference.
1422
=item * C<< XML::Bare::new_node( ... ) >>
1424
Creates a new node...
1426
=item * C<< XML::Bare::newhash( ... ) >>
1428
Creates a new hash with the specified value.
1430
=item * C<< XML::Bare::simplify( [noderef] ) >>
1432
Take a node with children that have immediate values and
1433
creates a hashref to reference those values by the name of
1436
=item * C<< XML::Bare::hash2xml( [hashref] ) >>
1438
Take a recursive hash tree ( perhaps generated by the simplify function ) and turn it
1439
into a raw XML string. Note that this function does not indent nicely. You will need
1440
to feed this string back into the parser and output it again if you want it to look
1441
nice. ( or you could use the 'clean' function to do it in one go )
1443
=item * C<< XML::Bare->new( text => "[xml]", unsafe => 1 ) >>
1445
An extra speedy way to parse XML. It is unsafe; may harm pets and children. Don't
1446
say you weren't warned. 30% speed boost compared to the normal parsing. You -must-
1447
use $ob->simple() in combination with this for it to work properly.
1449
The speed boost is gained by skipping checks for the end of the string when in the
1450
middle of properly formatted XML. The only time the check is done is within "values"
1451
( which includes the space after the final closing </xml> )
1453
Also, in the unsafe mode, tags, complete with their attributes, must be on one line.
1454
Node contents of course, can still have carriage returns...
1456
=item * C<< $object->read_more( text => "[xml fragment]" ) >>
1458
Add more XML text to be handled. Note that this function must be called before
1459
calling the parse function.
1464
my $ob = XML::Bare->new( text => "
1467
$ob->read_more( text => "<node>b</node>" );
1468
$ob->read_more( text => "</xml>" );
1469
my $root = $ob->parse();
1470
print $ob->xml( $root );
1478
Warning! Reading in additional XML fragments only works properly at proper "division points".
1479
Currently the parser will -not- work properly if you split in the middle of a node value, or
1480
in the middle of a node name. A future version of the module will be properly updated to handle
1483
Currently there is little to no benefit to parsing this way, rather than simple concatenating
1484
the two strings together and then reading all the XML in at once.
1488
=head2 Functions Used Internally
1492
=item * C<< check() checkone() readxbs() free_tree_c() >>
1494
=item * C<< lineinfo() c_parse() c_parse_unsafely() c_parse_more() c_parsefile() free_tree() xml2obj() >>
1496
=item * C<< obj2xml() get_root() obj2html() xml2obj_simple() >>
1502
Since the creation of this module there has been a fair amount of controvesy surrounding
1503
it. A number of authors of other XML parsers have gone so far as to publicly attack this
1504
module and claim that it 'does not parse XML', and 'it is not XML compliant'. Some of the
1505
same people seem to be angered by the inclusion of a benchmark, claiming that it is an
1506
unfair comparison, and that if the proper options and setup are used, that other XML
1509
The module should parse any XML document that conforms to the standardized
1510
XML specifications, there is no need for alarm and fear that the module will corrupt
1511
your XML documents on reading.
1513
To be blunt about how the parser works, very little has been done to make the parser
1514
follow the specification known as 'XML'. The parser is meant to be flexibile and somewhat
1515
resilient, and will parse XML like garbage that would cause other parsers to error out.
1517
As far as I am concerned, as the author of the module, the 'XML' in 'XML::Bare' should
1518
be thought of to mean 'eXtremely Mad Language', because the module was written from
1519
scratch without referring to the specification known as 'XML'.
1521
In regard to the complaints about the unfairness of the included benchmarks, please
1522
make your own intelligent decision as to what module you like by trying multiple
1523
modules and/or running the performance tests yourself. If you like some other module,
1524
use that module. If you like XML::Bare and think it is the fastest thing on the planet,
1527
If you hate XML::Bare and want to go around on the internet trashing it and telling
1528
people to use something else, I think perhaps you may want to seek counseling.
1532
In comparison to other available perl xml parsers that create trees, XML::Bare
1533
is extremely fast. In order to measure the performance of loading and parsing
1534
compared to the alternatives, a templated speed comparison mechanism has been
1535
created and included with XML::Bare.
1537
The include makebench.pl file runs when you make the module and creates perl
1538
files within the bench directory corresponding to the .tmpl contained there.
1540
Currently there are three types of modules that can be tested against,
1541
executable parsers ( exe.tmpl ), tree parsers ( tree.tmpl ), and parsers
1542
that do not generated trees ( notree.tmpl ).
1544
A full list of modules currently tested against is as follows:
1548
XML::Descent (notree)
1554
XML::LibXML (notree)
1556
XML::Parser (notree)
1557
XML::Parser::EasyTree
1558
XML::Parser::Expat (notree)
1560
XML::Simple using XML::Parser
1561
XML::Simple using XML::SAX::PurePerl
1562
XML::Simple using XML::LibXML::SAX::Parser
1563
XML::Simple using XML::Bare::SAX::Parser
1568
XML::XPath::XMLParser
1570
To run the comparisons, run the appropriate perl file within the
1571
bench directory. ( exe.pl, tree.pl, or notree.pl )
1573
The script measures the milliseconds of loading and parsing, and
1574
compares the time against the time of XML::Bare. So a 7 means
1575
it takes 7 times as long as XML::Bare.
1577
Here is a combined table of the script run against each alternative
1578
using the included test.xml:
1580
-Module- load parse total
1582
XML::TreePP 2.3063 33.1776 6.1598
1583
XML::Parser::EasyTree 4.9405 25.7278 7.4571
1584
XML::Handler::Trees 7.2303 26.5688 9.6447
1585
XML::Trivial 5.0636 12.4715 7.3046
1586
XML::Smart 6.8138 78.7939 15.8296
1587
XML::Simple (XML::Parser) 2.3346 50.4772 10.7455
1588
XML::Simple (PurePerl) 2.361 261.4571 33.6524
1589
XML::Simple (LibXML) 2.3187 163.7501 23.1816
1590
XML::Simple (XML::Bare) 2.3252 59.1254 10.9163
1591
XML::SAX::Simple 8.7792 170.7313 28.3634
1592
XML::Twig 27.8266 56.4476 31.3594
1593
XML::Grove::Builder 7.1267 26.1672 9.4064
1594
XML::XPath::XMLParser 9.7783 35.5486 13.0002
1595
XML::LibXML (notree) 11.0038 4.5758 10.6881
1596
XML::Parser (notree) 4.4698 17.6448 5.8609
1597
XML::Parser::Expat(notree) 3.7681 50.0382 6.0069
1598
XML::Descent (notree) 6.0525 37.0265 11.0322
1599
Tiny XML (exe) 1.0095
1603
Here is a combined table of the script run against each alternative
1604
using the included feed2.xml:
1606
-Module- load parse total
1608
XML::Bare (simple) 1 0.7238 ?
1609
XML::Bare (unsafe simple) 1 ~0.5538 ?
1610
XML::Fast 1.516 0.9733 1.4783
1611
XML::TreePP 0.6393 30.5951 2.6874
1612
XML::MyXML 1.8266 14.2571 2.7113
1613
XML::Parser::EasyTree 1.5208 22.8283 2.9748
1614
XML::Trivial 2.007 25.742 3.615
1615
XML::Tiny 0.1665 61.4918 4.3234
1616
XML::XPath::XMLParser 2.5762 33.2567 4.6742
1617
XML::Smart 1.702 59.4907 5.7566
1618
XML::Simple (XML::Parser) 0.5838 64.7243 5.0006
1619
XML::DOM::Lite 4.5207 17.4617 5.4033
1620
XML::Simple (LibXML) 0.5904 161.7544 11.5731
1621
XML::Twig 8.553 56.9034 11.8805
1622
XML::Grove::Builder 7.2021 30.7926 12.9334
1623
XML::Handler::Trees 6.8545 33.1007 13.0575
1624
XML::LibXML::Simple 14.0204 11.8482 13.8707
1625
XML::Simple (PurePerl) 0.6176 321.3422 23.0465
1626
XML::Simple 2.7168 90.7203 26.7525
1627
XML::SAX::Simple 8.7386 94.8276 29.2166
1628
XML::LibXML (notree) 11.0023 5.022 10.5214
1629
XML::Parser (notree) 4.3748 25.0213 5.9803
1630
XML::Parser::Expat(notree) 3.6555 51.6426 7.4316
1631
XML::Descent (notree) 5.9206 155.0289 18.7767
1632
Tiny XML (exe) 1.2212
1636
These results show that XML::Bare is, at least on the
1637
test machine, running all tests within cygwin, faster
1638
at loading and parsing than everything being tested
1641
The following things are shown as well:
1642
- XML::Bare can parse XML and create a hash tree
1643
in less time than it takes LibXML just to parse.
1644
- XML::Bare can parse XML and create a tree
1645
in less time than all three binary parsers take
1647
- XML::Fast is theoretically faster at parsing than
1648
the default 'full' mode of XML::Bare. Despite that,
1649
the 'simple' mode of XML::Bare is even faster.
1651
Note that the executable parsers are not perl modules
1652
and are timed using dummy programs that just uses the
1653
library to load and parse the example files. The
1654
executables are not included with this program. Any
1655
source modifications used to generate the shown test
1656
results can be found in the bench/src directory of
1661
Copyright (C) 2008 David Helkowski
1663
This program is free software; you can redistribute it and/or
1664
modify it under the terms of the GNU General Public License as
1665
published by the Free Software Foundation; either version 2 of the
1666
License, or (at your option) any later version. You may also can
1667
redistribute it and/or modify it under the terms of the Perl
1670
This program is distributed in the hope that it will be useful,
1671
but WITHOUT ANY WARRANTY; without even the implied warranty of
1672
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1673
GNU General Public License for more details.