1
# $Id: Twig.pm.slow,v 1.191 2005/03/16 14:03:52 mrodrigu Exp $
3
# Copyright (c) 1999-2004 Michel Rodriguez
6
# This program is free software; you can redistribute it and/or
7
# modify it under the same terms as Perl itself.
10
# This is created in the caller's space
12
{ sub ::PCDATA { '#PCDATA' }
13
sub ::CDATA { '#CDATA' }
17
######################################################################
19
######################################################################
23
use UNIVERSAL qw( isa ) ;
25
use vars qw($VERSION @ISA %valid_option);
28
#start-extract twig_global
30
# constants: element types
31
use constant (PCDATA => '#PCDATA');
32
use constant (CDATA => '#CDATA');
33
use constant (PI => '#PI');
34
use constant (COMMENT => '#COMMENT');
35
use constant (ENT => '#ENT');
38
use constant (ELT => '#ELT');
39
use constant (TEXT => '#TEXT');
42
use constant (ASIS => '#ASIS');
43
use constant (EMPTY => '#EMPTY');
45
#end-extract twig_global
47
# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
48
use constant (BUFSIZE => 32768);
51
# used to store the gi's
52
my %gi2index; # gi => index
53
my @index2gi; # list of gi's
54
my $SPECIAL_GI; # first non-special gi;
55
my %base_ent; # base entity character => replacement
57
# flag, set to true if the weaken sub is available
58
use vars qw( $weakrefs);
60
#start-extract twig_global
61
my $REG_NAME = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*)}; # xml name
62
my $REG_NAME_W = q{(?:(?:[^\W\d_]|[:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # name or wildcard (* or '')
63
my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp
64
my $REG_REGEXP_EXP = q{(?:(?:[^\\/]|\\.)*)}; # content of a regexp
65
my $REG_REGEXP_MOD = q{(?:[eimso]*)}; # regexp modifiers
66
my $REG_MATCH = q{[!=]~}; # match (or not)
67
my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted)
68
my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number
69
my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value
70
my $REG_OP = q{=|==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge}; # op
72
#end-extract twig_global
75
my( $FB_HTMLCREF, $FB_XMLCREF);
83
my $needVersion = '2.23';
84
$parser_version= $XML::Parser::VERSION;
85
croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
88
{ eval "use Encode qw( :all)";
89
$FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF;
90
$FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
93
# test whether we can use weak references
94
# set local empty signal handler to trap error messages
95
{ local $SIG{__DIE__};
96
if( eval( 'require Scalar::Util') && defined( &Scalar::Util::weaken) )
97
{ import Scalar::Util( 'weaken'); $weakrefs= 1; }
98
elsif( eval( 'require WeakRef'))
99
{ import WeakRef; $weakrefs= 1; }
104
import XML::Twig::Elt;
105
import XML::Twig::Entity;
106
import XML::Twig::Entity_list;
108
# used to store the gi's
109
# should be set for each twig really, at least when there are several
110
# the init ensures that special gi's are always the same
113
# do NOT use => or the constants become quoted!
114
%XML::Twig::gi2index=( PCDATA, 0, CDATA, 1, PI, 2, COMMENT, 3, ENT, 4);
116
@XML::Twig::index2gi=( PCDATA, CDATA, PI, COMMENT, ENT);
118
# gi's under this value are special
119
$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
121
%XML::Twig::base_ent= ( '>' => '>', '<' => '<', '&' => '&', "'" => ''', '"' => '"',);
123
# now set some aliases
124
*find_nodes = *get_xpath; # same as XML::XPath
125
*findnodes = *get_xpath; # same as XML::LibXML
126
*getElementsByTagName = *descendants;
127
*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt
128
*find_by_tag_name = *descendants;
129
*getElementById = *elt_id;
130
*getEltById = *elt_id;
134
@ISA = qw(XML::Parser);
136
# fake gi's used in twig_handlers and start_tag_handlers
137
my $ALL = '_all_'; # the associated function is always called
138
my $DEFAULT= '_default_'; # the function is called if no other handler has been
141
my $COMMENTS_DEFAULT= 'keep';
142
my $PI_DEFAULT = 'keep';
145
# handlers used in regular mode
146
my %twig_handlers=( Start => \&_twig_start,
148
Char => \&_twig_char,
149
Entity => \&_twig_entity,
150
XMLDecl => \&_twig_xmldecl,
151
Doctype => \&_twig_doctype,
152
Element => \&_twig_element,
153
Attlist => \&_twig_attlist,
154
CdataStart => \&_twig_cdatastart,
155
CdataEnd => \&_twig_cdataend,
157
Comment => \&_twig_comment,
158
Default => \&_twig_default,
161
# handlers used when twig_roots is used and we are outside of the roots
162
my %twig_handlers_roots=
163
( Start => \&_twig_start_check_roots,
164
End => \&_twig_end_check_roots,
165
Doctype => \&_twig_doctype,
166
Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl,
167
Element => undef, Attlist => undef, CdataStart => undef,
168
CdataEnd => undef, Proc => undef, Comment => undef,
169
Proc => \&_twig_pi_check_roots,
170
Default => sub {}, # hack needed for XML::Parser 2.27
173
# handlers used when twig_roots and print_outside_roots are used and we are
174
# outside of the roots
175
my %twig_handlers_roots_print_2_30=
176
( Start => \&_twig_start_check_roots,
177
End => \&_twig_end_check_roots,
178
Char => \&_twig_print,
179
# I have no idea why I should not be using this handler!
180
Entity => \&_twig_print_entity,
181
XMLDecl => \&_twig_print,
182
Doctype => \&_twig_print_doctype, # because recognized_string is broken here
183
# Element => \&_twig_print, Attlist => \&_twig_print,
184
CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
185
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
186
Default => \&_twig_print_check_doctype,
189
# handlers used when twig_roots, print_outside_roots and keep_encoding are used
190
# and we are outside of the roots
191
my %twig_handlers_roots_print_original_2_30=
192
( Start => \&_twig_start_check_roots,
193
End => \&_twig_end_check_roots,
194
Char => \&_twig_print_original,
195
# I have no idea why I should not be using this handler!
196
#Entity => \&_twig_print_original,
197
ExternEnt => \&_twig_print_entity,
198
XMLDecl => \&_twig_print_original,
199
Doctype => \&_twig_print_original_doctype, # because original_string is broken here
200
Element => \&_twig_print_original, Attlist => \&_twig_print_original,
201
CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
202
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
203
Default => \&_twig_print_original_check_doctype,
206
# handlers used when twig_roots and print_outside_roots are used and we are
207
# outside of the roots
208
my %twig_handlers_roots_print_2_27=
209
( Start => \&_twig_start_check_roots,
210
End => \&_twig_end_check_roots,
211
Char => \&_twig_print,
212
# I have no idea why I should not be using this handler!
213
#Entity => \&_twig_print,
214
XMLDecl => \&_twig_print, Doctype => \&_twig_print,
215
CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
216
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
217
Default => \&_twig_print,
220
# handlers used when twig_roots, print_outside_roots and keep_encoding are used
221
# and we are outside of the roots
222
my %twig_handlers_roots_print_original_2_27=
223
( Start => \&_twig_start_check_roots,
224
End => \&_twig_end_check_roots,
225
Char => \&_twig_print_original,
226
# for some reason original_string is wrong here
227
# this can be a problem if the doctype includes non ascii characters
228
XMLDecl => \&_twig_print, Doctype => \&_twig_print,
229
# I have no idea why I should not be using this handler!
230
Entity => \&_twig_print,
231
#Element => undef, Attlist => undef,
232
CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
233
Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
234
Default => \&_twig_print, # _twig_print_original does not work
238
my %twig_handlers_roots_print= $parser_version > 2.27
239
? %twig_handlers_roots_print_2_30
240
: %twig_handlers_roots_print_2_27;
241
my %twig_handlers_roots_print_original= $parser_version > 2.27
242
? %twig_handlers_roots_print_original_2_30
243
: %twig_handlers_roots_print_original_2_27;
246
# handlers used when the finish_print method has been called
247
my %twig_handlers_finish_print=
248
( Start => \&_twig_print,
249
End => \&_twig_print, Char => \&_twig_print,
250
Entity => \&_twig_print, XMLDecl => \&_twig_print,
251
Doctype => \&_twig_print, Element => \&_twig_print,
252
Attlist => \&_twig_print, CdataStart => \&_twig_print,
253
CdataEnd => \&_twig_print, Proc => \&_twig_print,
254
Comment => \&_twig_print, Default => \&_twig_print,
257
# handlers used when the finish_print method has been called and the keep_encoding
259
my %twig_handlers_finish_print_original=
260
( Start => \&_twig_print_original, End => \&_twig_print_end_original,
261
Char => \&_twig_print_original, Entity => \&_twig_print_original,
262
XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original,
263
Element => \&_twig_print_original, Attlist => \&_twig_print_original,
264
CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
265
Proc => \&_twig_print_original, Comment => \&_twig_print_original,
266
Default => \&_twig_print_original,
269
# handlers used whithin ignored elements
270
my %twig_handlers_ignore=
271
( Start => \&_twig_ignore_start,
272
End => \&_twig_ignore_end,
273
Char => undef, Entity => undef, XMLDecl => undef,
274
Doctype => undef, Element => undef, Attlist => undef,
275
CdataStart => undef, CdataEnd => undef, Proc => undef,
276
Comment => undef, Default => undef,
280
# those handlers are only used if the entities are NOT to be expanded
281
my %twig_noexpand_handlers= ( Default => \&_twig_default );
283
my @saved_default_handler;
285
my $ID= 'id'; # default value, set by the Id argument
287
# all allowed options
289
( # XML::Twig options
290
TwigHandlers => 1, Id => 1,
291
TwigRoots => 1, TwigPrintOutsideRoots => 1,
292
StartTagHandlers => 1, EndTagHandlers => 1,
293
ForceEndTagHandlersUsage => 1,
294
DoNotChainHandlers => 1,
298
KeepEncoding => 1, DoNotEscapeAmpInAtts => 1,
299
ParseStartTag => 1, KeepAttsOrder => 1,
300
LoadDTD => 1, DTDHandler => 1,
301
DoNotOutputDTD => 1, NoProlog => 1,
302
ExpandExternalEnts => 1,
303
DiscardSpaces => 1, KeepSpaces => 1,
304
DiscardSpacesIn => 1, KeepSpacesIn => 1,
305
PrettyPrint => 1, EmptyTags => 1,
306
Comments => 1, Pi => 1,
307
OutputFilter => 1, InputFilter => 1,
308
OutputTextFilter => 1,
312
MapXmlns => 1, KeepOriginalPrefix => 1,
313
# XML::Parser options
314
ErrorContext => 1, ProtocolEncoding => 1,
315
Namespaces => 1, NoExpand => 1,
316
Stream_Delimiter => 1, ParseParamEnt => 1,
317
NoLWP => 1, Non_Expat_Options => 1,
321
# predefined input and output filters
322
use vars qw( %filter);
323
%filter= ( html => \&html_encode,
324
safe => \&safe_encode,
325
safe_hex => \&safe_encode_hex,
329
{ my ($class, %args) = @_;
332
# change all nice_perlish_names into nicePerlishNames
333
%args= _normalize_args( %args);
336
unless( $args{MoreOptions})
337
{ foreach my $arg (keys %args)
338
{ carp "invalid option $arg" unless $valid_option{$arg}; }
341
# a twig is really an XML::Parser
342
# my $self= XML::Parser->new(%args);
344
$self= XML::Parser->new(%args);
348
if( exists $args{TwigHandlers})
349
{ $handlers= $args{TwigHandlers};
350
$self->setTwigHandlers( $handlers);
351
delete $args{TwigHandlers};
354
# take care of twig-specific arguments
355
if( exists $args{StartTagHandlers})
356
{ $self->setStartTagHandlers( $args{StartTagHandlers});
357
delete $args{StartTagHandlers};
360
if( exists $args{DoNotChainHandlers})
361
{ $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; }
363
if( exists $args{IgnoreElts})
364
{ $self->setIgnoreEltsHandlers( $args{IgnoreElts});
365
delete $args{IgnoreElts};
368
if( exists $args{Index})
369
{ my $index= $args{Index};
370
# we really want a hash name => path, we turn an array into a hash if necessary
371
if( ref( $index) eq 'ARRAY')
372
{ my %index= map { $_ => $_ } @$index;
375
while( my( $name, $exp)= each %$index)
376
{ $self->{_twig_index}->{$name} ||= [];
377
$self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; });
381
$self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
382
if( exists( $args{EltClass})) { delete $args{EltClass}; }
384
if( exists( $args{MapXmlns}))
385
{ $self->{twig_map_xmlns}= $args{MapXmlns};
386
$self->{Namespaces}=1;
387
delete $args{MapXmlns};
390
if( exists( $args{KeepOriginalPrefix}))
391
{ $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
392
delete $args{KeepOriginalPrefix};
395
$self->{twig_dtd_handler}= $args{DTDHandler};
396
delete $args{DTDHandler};
398
if( $args{CharHandler})
399
{ $self->setCharHandler( $args{CharHandler});
400
delete $args{CharHandler};
404
{ $self->{twig_read_external_dtd}= 1;
405
delete $args{LoadDTD};
408
if( $args{ExpandExternalEnts})
409
{ $self->set_expand_external_entities( 1);
410
$self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
411
delete $args{LoadDTD};
412
delete $args{ExpandExternalEnts};
415
if( $args{DoNotEscapeAmpInAtts})
416
{ $self->set_do_not_escape_amp_in_atts( 1);
417
$self->{twig_do_not_escape_amp_in_atts}=1;
420
{ $self->set_do_not_escape_amp_in_atts( 0);
421
$self->{twig_do_not_escape_amp_in_atts}=0;
424
# deal with TwigRoots argument, a hash of elements for which
425
# subtrees will be built (and associated handlers)
427
if( $args{TwigRoots})
428
{ $self->setTwigRoots( $args{TwigRoots});
429
delete $args{TwigRoots};
432
if( $args{EndTagHandlers})
433
{ unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
434
{ croak "you should not use EndTagHandlers without TwigRoots\n",
435
"if you want to use it anyway, normally because you have ",
436
"a start_tag_handlers that calls 'ignore' and you want to ",
437
"call an ent_tag_handlers at the end of the element, then ",
438
"pass 'force_end_tag_handlers_usage => 1' as an argument ",
442
$self->setEndTagHandlers( $args{EndTagHandlers});
443
delete $args{EndTagHandlers};
446
if( $args{TwigPrintOutsideRoots})
447
{ croak "cannot use TwigPrintOutsideRoots without TwigRoots"
448
unless( $self->{twig_roots});
449
# if the arg is a filehandle then store it
450
if( _is_fh( $args{TwigPrintOutsideRoots}) )
451
{ $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
452
$self->{twig_default_print}= $args{TwigPrintOutsideRoots};
455
if( $args{PrettyPrint})
456
{ $self->set_pretty_print( $args{PrettyPrint}); }
458
if( $args{EmptyTags})
459
{ $self->set_empty_tag_style( $args{EmptyTags}); }
462
if( $args{KeepSpaces})
463
{ croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces});
464
croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
465
$self->{twig_keep_spaces}=1;
466
delete $args{KeepSpaces};
468
if( $args{DiscardSpaces})
469
{ croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
470
$self->{twig_discard_spaces}=1;
471
delete $args{DiscardSpaces};
473
if( $args{KeepSpacesIn})
474
{ croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn});
475
$self->{twig_discard_spaces}=1;
476
$self->{twig_keep_spaces_in}={};
477
my @tags= @{$args{KeepSpacesIn}};
478
foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
479
delete $args{KeepSpacesIn};
481
if( $args{DiscardSpacesIn})
482
{ $self->{twig_keep_spaces}=1;
483
$self->{twig_discard_spaces_in}={};
484
my @tags= @{$args{DiscardSpacesIn}};
485
foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
486
delete $args{DiscardSpacesIn};
488
# discard spaces by default
489
$self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces});
491
$args{Comments}||= $COMMENTS_DEFAULT;
492
if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; }
493
elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; }
494
elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
495
else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
496
delete $args{Comments};
498
$args{Pi}||= $PI_DEFAULT;
499
if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; }
500
elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; }
501
elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
502
else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
505
if( $args{KeepEncoding})
506
{ $self->{twig_keep_encoding}= $args{KeepEncoding};
507
# set it in XML::Twig::Elt so print functions know what to do
508
$self->set_keep_encoding( 1);
509
$self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
510
delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
511
delete $args{KeepEncoding};
512
$self->{NoExpand}= 1;
515
{ $self->set_keep_encoding( 0);
516
$self->{parse_start_tag}= $args{ParseStartTag} if( $args{ParseStartTag});
519
if( $args{OutputFilter})
520
{ $self->set_output_filter( $args{OutputFilter});
521
delete $args{OutputFilter};
524
{ $self->set_output_filter( 0); }
526
if( $args{RemoveCdata})
527
{ $self->set_remove_cdata( $args{RemoveCdata});
528
delete $args{RemoveCdata};
531
{ $self->set_remove_cdata( 0); }
533
if( $args{OutputTextFilter})
534
{ $self->set_output_text_filter( $args{OutputTextFilter});
535
delete $args{OutputTextFilter};
538
{ $self->set_output_text_filter( 0); }
541
if( exists $args{KeepAttsOrder})
542
{ $self->{keep_atts_order}= $args{KeepAttsOrder};
543
if( eval 'require Tie::IxHash')
544
{ import Tie::IxHash;
545
$self->set_keep_atts_order( $self->{keep_atts_order});
548
{ carp "Tie::IxHash not available, option keep_atts_order not allowed"; }
551
{ $self->set_keep_atts_order( 0); }
553
if( my $output_encoding= $args{OutputEncoding})
554
{ $self->set_output_encoding( $output_encoding);
555
delete $args{OutputFilter};
558
if( $args{InputFilter})
559
{ $self->set_input_filter( $args{InputFilter});
560
delete $args{InputFilter};
563
if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; }
566
{ $self->setHandlers( %twig_noexpand_handlers);
567
$self->{twig_no_expand}=1;
571
{ $self->{no_prolog}= 1;
572
delete $args{NoProlog};
575
if( $args{DoNotOutputDTD})
576
{ $self->{no_dtd_output}= 1;
577
delete $args{DoNotOutputDTD};
581
if( $self->{twig_roots})
582
{ if( $self->{twig_default_print})
583
{ if( $self->{twig_keep_encoding})
584
{ $self->setHandlers( %twig_handlers_roots_print_original); }
586
{ $self->setHandlers( %twig_handlers_roots_print); }
589
{ $self->setHandlers( %twig_handlers_roots); }
592
{ $self->setHandlers( %twig_handlers); }
594
# XML::Parser::Expat does not like these handler to be set. So in order to
595
# use the various sets of handlers on XML::Parser or XML::Parser::Expat
596
# objects when needed, these ones have to be set only once, here, at
598
$self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
600
$self->{twig_entity_list}= XML::Twig::Entity_list->new;
602
$self->{twig_id}= $ID;
603
$self->{twig_stored_spaces}='';
605
$self->{twig}= $self;
606
weaken( $self->{twig}) if( $weakrefs);
614
return $t->_parseurl( 0, @_);
619
return $t->_parseurl( 1, @_);
622
# I should really add extra options to allow better configuration of the
623
# LWP::UserAgent object
625
# - the child gets the data and copies it to the pipe,
626
# - the parent reads the stream and sends it to XML::Parser
627
# the data is cut it chunks the size of the XML::Parser::Expat buffer
628
# the method returns the twig and the status
630
{ my( $t, $safe, $url, $agent)= @_;
631
pipe( README, WRITEME) or croak "cannot create connected pipes: $!";
633
{ # parent code: parse the incoming file
634
close WRITEME; # no need to write
635
my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README);
641
close README; # no need to read
642
require LWP; # so we can get LWP::UserAgent and HTTP::Request
644
$agent ||= LWP::UserAgent->new;
645
my $request = HTTP::Request->new( GET => $url);
646
# _pass_url_content is called with chunks of data the same size as
647
# the XML::Parser buffer
648
my $response = $agent->request( $request,
649
sub { _pass_url_content( \*WRITEME, @_); }, BUFSIZE);
650
$response->is_success or croak "$url ", $response->message;
652
CORE::exit(); # CORE is there for mod_perl (which redefines exit)
656
# get the (hopefully!) XML data from the URL and
657
sub _pass_url_content
658
{ my( $fh, $data, $response, $protocol)= @_;
663
{ my %args= map { $_, 1 } @_;
664
%args= _normalize_args( %args);
665
foreach (keys %args) { $valid_option{$_}++; }
668
sub _twig_store_internal_dtd
670
my( $p, $string)= @_;
672
$string= $p->original_string() if( $t->{twig_keep_encoding});
673
$t->{twig_doctype}->{internal} .= $string;
676
sub _twig_stop_storing_internal_dtd
678
if( @saved_default_handler && defined $saved_default_handler[1])
679
{ $p->setHandlers( @saved_default_handler); }
682
$p->setHandlers( Default => undef);
688
{ my %normalized_args;
689
while( my $key= shift )
690
{ $key= join '', map { ucfirst } split /_/, $key;
691
#$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig');
692
$normalized_args{$key}= shift ;
694
return %normalized_args;
697
sub _is_fh { return unless $_[0]; return $_[0] if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')); }
700
{ my( $handlers, $path, $handler)= @_;
702
$handlers ||= {}; # create the handlers struct if necessary
704
my $prev_handler= $handlers->{handlers}->{$path} || undef;
706
_set_gi_handler ( $handlers, $path, $handler, $prev_handler)
707
|| _set_path_handler ( $handlers, $path, $handler, $prev_handler)
708
|| _set_subpath_handler ( $handlers, $path, $handler, $prev_handler)
709
|| _set_attribute_handler ( $handlers, $path, $handler, $prev_handler)
710
|| _set_star_att_handler ( $handlers, $path, $handler, $prev_handler)
711
|| _set_star_att_regexp_handler ( $handlers, $path, $handler, $prev_handler)
712
|| _set_string_handler ( $handlers, $path, $handler, $prev_handler)
713
|| _set_attribute_regexp_handler( $handlers, $path, $handler, $prev_handler)
714
|| _set_string_regexp_handler ( $handlers, $path, $handler, $prev_handler)
715
|| _set_pi_handler ( $handlers, $path, $handler, $prev_handler)
716
|| _set_level_handler ( $handlers, $path, $handler, $prev_handler)
717
|| croak "unrecognized expression in handler: '$path'";
720
# this both takes care of the simple (gi) handlers and store
721
# the handler code reference for other handlers
722
$handlers->{handlers}->{$path}= $handler;
724
return $prev_handler;
729
{ my( $handlers, $path, $handler, $prev_handler)= @_;
730
if( $path =~ m{^\s*($REG_NAME)\s*$}o )
732
$handlers->{handlers}->{gi}->{$gi}= $handler;
739
sub _set_path_handler
740
{ my( $handlers, $path, $handler, $prev_handler)= @_;
741
if( $path=~ m{^\s*(?:/$REG_NAME)*/($REG_NAME)\s*$}o)
742
{ # a full path has been defined
743
# update the path_handlers count, knowing that
744
# either the previous or the new handler can be undef
745
$handlers->{path_handlers}->{gi}->{$1}-- if( $prev_handler);
747
{ $handlers->{path_handlers}->{gi}->{$1}++;
748
$handlers->{path_handlers}->{path}->{$path}= $handler;
756
sub _set_subpath_handler
757
{ my( $handlers, $path, $handler, $prev_handler)= @_;
758
if( $path=~ m{^\s*(?:$REG_NAME/)+($REG_NAME)\s*$}o)
759
{ # a partial path has been defined
760
# $1 is the "final" gi
761
$handlers->{subpath_handlers}->{gi}->{$1}-- if( $prev_handler);
763
{ $handlers->{subpath_handlers}->{gi}->{$1}++;
764
$handlers->{subpath_handlers}->{path}->{$path}= $handler;
772
sub _set_attribute_handler
773
{ my( $handlers, $path, $handler, $prev_handler)= @_;
774
# check for attribute conditions
775
if( $path=~ m{^\s*($REG_NAME) # elt
778
(?:=\s*($REG_STRING)\s*)? # = value (optional)
780
{ my( $gi, $att, $val)= ($1, $2, $3);
781
$val= substr( $val, 1, -1) if( defined $val); # remove the quotes
783
{ # replace or remove the previous handler
784
my $i=0; # so we can splice the array if need be
785
foreach my $exp ( @{$handlers->{attcond_handlers_exp}->{$gi}})
786
{ if( ($exp->{att} eq $att) && ( _eq( $exp->{val}, $val)) )
787
{ if( $handler) # just replace the handler
788
{ $exp->{handler}= $handler; }
789
else # remove the handler
790
{ $handlers->{attcond_handlers}->{$gi}--;
791
splice( @{$handlers->{attcond_handlers_exp}->{$gi}}, $i, 1);
800
$handlers->{attcond_handlers}->{$gi}++;
801
my $exp={att => $att, val => $val, handler => $handler};
802
$handlers->{attcond_handlers_exp}->{$gi} ||= [];
803
push @{$handlers->{attcond_handlers_exp}->{$gi}}, $exp;
812
sub _set_attribute_regexp_handler
813
{ my( $handlers, $path, $handler, $prev_handler)= @_;
814
# check for attribute regexp conditions
815
if( $path=~ m{^\s*($REG_NAME) # elt
819
/($REG_REGEXP_EXP)/ # /regexp/
820
($REG_REGEXP_MOD) # mods
822
{ my( $gi, $att, $regexp, $mods)= ($1, $2, $3, $4);
823
$regexp= qr/(?$mods:$regexp)/;
825
{ # replace or remove the previous handler
826
my $i=0; # so we can splice the array if need be
827
foreach my $exp ( @{$handlers->{attregexp_handlers_exp}->{$gi}})
828
{ if( ($exp->{att} eq $att) && ($exp->{regexp} eq $regexp) )
829
{ if( $handler) # just replace the handler
830
{ $exp->{handler}= $handler; }
831
else # remove the handler
832
{ $handlers->{attregexp_handlers}->{$gi}--;
833
splice( @{$handlers->{attregexp_handlers_exp}->{$gi}}, $i, 1);
842
$handlers->{attregexp_handlers}->{$gi}++;
843
my $exp={att => $att, regexp => $regexp, handler => $handler};
844
$handlers->{attregexp_handlers_exp}->{$gi} ||= [];
845
push @{$handlers->{attregexp_handlers_exp}->{$gi}}, $exp;
853
sub _set_string_handler
854
{ my( $handlers, $path, $handler, $prev_handler)= @_;
855
# check for string conditions
856
if( $path=~/^\s*($REG_NAME) # elt
857
\s*\[\s*string # [string
858
\s*\(\s*($REG_NAME)?\s*\) # (sub_elt)
860
($REG_STRING) # "text" (or 'text')
862
{ my( $gi, $sub_elt, $text)= ($1, $2, $3);
863
$text= substr( $text, 1, -1) if( defined $text); # remove the quotes
865
{ # replace or remove the previous handler
866
my $i=0; # so we can splice the array if need be
867
foreach my $exp ( @{$handlers->{text_handlers_exp}->{$gi}})
868
{ if( ($exp->{text} eq $text) &&
869
( !$exp->{sub_elt} || ($exp->{sub_elt} eq $sub_elt) )
871
{ if( $handler) # just replace the handler
872
{ $exp->{handler}= $handler; }
873
else # remove the handler
874
{ $handlers->{text_handlers}->{$gi}--;
875
splice( @{$handlers->{text_handlers_exp}->{$gi}}, $i, 1);
884
$handlers->{text_handlers}->{$gi}++;
885
my $exp={sub_elt => $sub_elt, text => $text, handler => $handler};
886
$handlers->{text_handlers_exp}->{$gi} ||= [];
887
push @{$handlers->{text_handlers_exp}->{$gi}}, $exp;
897
sub _set_string_regexp_handler
898
{ my( $handlers, $path, $handler, $prev_handler)= @_;
899
# check for string regexp conditions
900
if( $path=~m{^\s*($REG_NAME) # (elt)
901
\s*\[\s*string # [string
902
\s*\(\s*($REG_NAME?)\) # (sub_elt)
904
/($REG_REGEXP_EXP)/ # /(regexp)/
905
\s*($REG_REGEXP_MOD)? # (mods)
906
\s*\]\s*$}ox) # ] (or ')
907
{ my( $gi, $sub_elt, $regexp, $mods)= ($1, $2, $3, $4);
909
$regexp= qr/(?$mods:$regexp)/;
911
{ # replace or remove the previous handler
912
my $i=0; # so we can splice the array if need be
913
foreach my $exp ( @{$handlers->{regexp_handlers_exp}->{$gi}})
914
{ if( ($exp->{regexp} eq $regexp) &&
915
( !$exp->{sub_elt} || ($exp->{sub_elt} eq $sub_elt) )
917
{ if( $handler) # just replace the handler
918
{ $exp->{handler}= $handler;
920
else # remove the handler
921
{ $handlers->{regexp_handlers}->{$gi}--;
922
splice( @{$handlers->{regexp_handlers_exp}->{$gi}}, $i, 1);
931
$handlers->{regexp_handlers}->{$gi}++;
932
my $exp= {sub_elt => $sub_elt, regexp => $regexp, handler => $handler};
933
$handlers->{regexp_handlers_exp}->{$gi} ||= [];
934
push @{$handlers->{regexp_handlers_exp}->{$gi}}, $exp;
944
sub _set_star_att_handler
945
{ my( $handlers, $path, $handler, $prev_handler)= @_;
946
# check for *[@att="val"] or *[@att] conditions
947
if( $path=~/^(?:\s*\*)? # * (optional)
951
($REG_STRING))? # string
953
{ my( $att, $val)= ($1, $2);
954
$val= substr( $val, 1, -1) if( defined $val); # remove the quotes from the string
956
{ # replace or remove the previous handler
957
my $i=0; # so we can splice the array if need be
958
foreach my $exp ( @{$handlers->{att_handlers_exp}->{$att}})
959
{ if( ($exp->{att} eq $att) && ( !defined( $val) || _eq( $exp->{val}, $val) ) )
960
{ if( $handler) # just replace the handler
961
{ $exp->{handler}= $handler; }
962
else # remove the handler
963
{ splice( @{$handlers->{att_handlers_exp}->{$att}}, $i, 1);
964
$handlers->{att_handlers}->{$att}--;
973
$handlers->{att_handlers}->{$att}++;
974
my $exp={att => $att, val => $val, handler => $handler};
975
$handlers->{att_handlers_exp}->{$att} ||= [];
976
push @{$handlers->{att_handlers_exp}->{$att}}, $exp;
985
sub _set_star_att_regexp_handler
986
{ my( $handlers, $path, $handler, $prev_handler)= @_;
987
# check for *[@att=~ /regexp/] conditions
988
if( $path=~ m{^(?:\s*\*)? # * (optional)
992
/($REG_REGEXP_EXP)/ # /(regexp)/
993
\s*($REG_REGEXP_MOD)? # (mods)
995
{ my( $att, $regexp, $mods)= ($1, $2, $3);
997
$regexp= qr/(?$mods:$regexp)/;
999
{ # replace or remove the previous handler
1000
my $i=0; # so we can splice the array if need be
1001
foreach my $exp ( @{$handlers->{att_regexp_handlers_exp}->{$att}})
1002
{ if( $exp->{regexp} eq $regexp)
1003
{ if( $handler) # just replace the handler
1004
{ $exp->{handler}= $handler;
1006
else # remove the handler
1007
{ splice( @{$handlers->{att_regexp_handlers_exp}->{$att}}, $i, 1);
1008
$handlers->{att_regexp_handlers}--;
1016
{ # new handler only
1017
my $exp= { regexp => $regexp, handler => $handler};
1018
$handlers->{regexp_handlers_exp}->{$att} ||= [];
1019
push @{$handlers->{att_regexp_handlers_exp}->{$att}}, $exp;
1020
$handlers->{att_regexp_handlers}++;
1031
{ my( $handlers, $path, $handler, $prev_handler)= @_;
1032
# PI conditions ( '?target' => \&handler or '?' => \&handler
1033
# or '#PItarget' => \&handler or '#PI' => \&handler)
1034
if( $path=~ /^\s*(?:\?|#PI)\s*(?:([^\s]*)\s*)$/)
1035
{ my $target= $1 || '';
1036
# update the path_handlers count, knowing that
1037
# either the previous or the new handler can be undef
1038
$handlers->{pi_handlers}->{$1}= $handler;
1046
sub _set_level_handler
1047
{ my( $handlers, $path, $handler, $prev_handler)= @_;
1048
if( $path =~ m{^ \s* level \s* \( \s* ([0-9]+) \s* \) \s* $}ox )
1050
$handlers->{handlers}->{level}->{$level}= $handler;
1058
# just like eq except that undef values do not trigger warnings
1060
{ my( $val1, $val2)= @_;
1061
if( !defined $val1) { return !defined $val2 }
1062
if( !defined $val2) { return 0; }
1063
return $val1 eq $val2;
1067
{ my( $t, $handler)= @_;
1068
$t->{twig_char_handler}= $handler;
1073
{ my $handlers= shift;
1074
delete $handlers->{handlers};
1075
delete $handlers->{path_handlers};
1076
delete $handlers->{subpath_handlers};
1077
$handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers});
1078
delete $handlers->{attcond_handlers};
1082
{ my $handlers= shift || return;
1083
my $set_handlers= {};
1084
foreach my $path (keys %{$handlers})
1085
{ _set_handler( $set_handlers, $path, $handlers->{$path}); }
1086
return $set_handlers;
1091
{ my( $t, $path, $handler)= @_;
1092
$t->{twig_handlers} ||={};
1093
return _set_handler( $t->{twig_handlers}, $path, $handler);
1097
{ my( $t, $handlers)= @_;
1098
my $previous_handlers= $t->{twig_handlers} || undef;
1099
_reset_handlers( $t->{twig_handlers});
1100
$t->{twig_handlers}= _set_handlers( $handlers);
1101
return $previous_handlers;
1104
sub setStartTagHandler
1105
{ my( $t, $path, $handler)= @_;
1106
$t->{twig_starttag_handlers}||={};
1107
return _set_handler( $t->{twig_starttag_handlers}, $path, $handler);
1110
sub setStartTagHandlers
1111
{ my( $t, $handlers)= @_;
1112
my $previous_handlers= $t->{twig_starttag_handlers} || undef;
1113
_reset_handlers( $t->{twig_starttag_handlers});
1114
$t->{twig_starttag_handlers}= _set_handlers( $handlers);
1115
return $previous_handlers;
1118
sub setIgnoreEltsHandler
1119
{ my( $t, $path, $action)= @_;
1120
$t->{twig_ignore_elts_handlers}||={};
1121
return _set_handler( $t->{twig_ignore_elts_handlers}, $path, $action );
1124
sub setIgnoreEltsHandlers
1125
{ my( $t, $handlers)= @_;
1126
my $previous_handlers= $t->{twig_ignore_elts_handlers} || undef;
1127
_reset_handlers( $t->{twig_ignore_elts_handlers});
1128
$t->{twig_ignore_elts_handlers}= _set_handlers( $handlers);
1129
return $previous_handlers;
1132
sub setEndTagHandler
1133
{ my( $t, $path, $handler)= @_;
1134
$t->{twig_endtag_handlers}||={};
1135
return _set_handler( $t->{twig_endtag_handlers}, $path,$handler);
1138
sub setEndTagHandlers
1139
{ my( $t, $handlers)= @_;
1140
my $previous_handlers= $t->{twig_endtag_handlers} || undef;
1141
_reset_handlers( $t->{twig_endtag_handlers});
1142
$t->{twig_endtag_handlers}= _set_handlers( $handlers);
1143
return $previous_handlers;
1146
# a little more complex: set the twig_handlers only if a code ref is given
1148
{ my( $t, $handlers)= @_;
1149
my $previous_roots= $t->{twig_roots} || undef;
1150
_reset_handlers($t->{twig_roots});
1151
$t->{twig_roots}= _set_handlers( $handlers);
1152
foreach my $path (keys %{$handlers})
1153
{ $t->{twig_handlers}||= {};
1154
_set_handler( $t->{twig_handlers}, $path, $handlers->{$path})
1155
if( isa( $handlers->{$path}, 'CODE'));
1157
return $previous_roots;
1160
# just store the reference to the expat object in the twig
1165
$t->{twig_parser}= $p;
1166
weaken( $t->{twig_parser}) if( $weakrefs);
1167
$t->{twig_parsing}=1;
1168
# in case they had been created by a previous parse
1169
delete $t->{twig_dtd};
1170
delete $t->{twig_doctype};
1171
delete $t->{twig_xmldecl};
1172
# if needed set the output filehandle
1173
$t->_set_fh_to_twig_output_fh();
1176
# uses eval to catch the parser's death
1178
{ my( $t, $str)= @_;
1179
eval { $t->parse( $str); } ;
1184
{ my( $t, $file)= @_;
1185
eval { $t->parsefile( $file); } ;
1190
sub _add_or_discard_stored_spaces
1194
if( $t->{twig_stored_spaces} || $option{force} || $t->{twig_preserve_space})
1195
{ if( $t->{twig_current}->is_pcdata)
1196
{ $t->{twig_current}->append_pcdata($t->{twig_stored_spaces}); }
1198
{ my $current_gi= $t->{twig_current}->gi;
1199
unless( defined( $t->{twig_space_policy}->{$current_gi}))
1200
{ $t->{twig_space_policy}->{$current_gi}= _space_policy( $t, $current_gi); }
1202
if( $t->{twig_space_policy}->{$current_gi} || ($t->{twig_stored_spaces}!~ m{\n}) || $option{force} || $t->{twig_preserve_space})
1203
{ _insert_pcdata( $t, $t->{twig_stored_spaces} ); }
1204
$t->{twig_stored_spaces}='';
1210
# the default twig handlers, which build the tree
1213
my ($p, $gi, @att)= @_;
1216
# empty the stored pcdata (space stored in case they are really part of
1217
# a pcdata element) or stored it if the space policy dictades so
1218
# create a pcdata element with the spaces if need be
1219
_add_or_discard_stored_spaces( $t);
1220
my $parent= $t->{twig_current};
1222
# if we were parsing PCDATA then we exit the pcdata
1223
if( $t->{twig_in_pcdata})
1224
{ $t->{twig_in_pcdata}= 0;
1225
$parent->del_twig_current;
1226
$parent= $parent->_parent;
1229
# if we choose to keep the encoding then we need to parse the tag
1230
if( my $func = $t->{parse_start_tag})
1231
{ ($gi, @att)= &$func($p->original_string); }
1232
elsif( $t->{twig_entities_in_attribute})
1234
($gi,@att)= _parse_start_tag( $p->recognized_string);
1235
$t->{twig_entities_in_attribute}=0;
1238
# if we are using an external DTD, we need to fill the default attributes
1239
if( $t->{twig_read_external_dtd}) { _fill_default_atts( $t, $gi, \@att); }
1241
# filter the input data if need be
1242
if( my $filter= $t->{twig_input_filter})
1243
{ $gi= $filter->( $gi);
1244
@att= map { $filter->($_) } @att;
1248
_replace_ns( $t, \$gi, \@att) if( $t->{twig_map_xmlns});
1250
my $elt= $t->{twig_elt_class}->new( $gi);
1251
$elt->set_atts( @att);
1253
$parent->del_twig_current if( $parent);
1254
$t->{twig_current}= $elt;
1255
$elt->set_twig_current;
1258
{ my $prev_sibling= $parent->_last_child;
1260
{ $prev_sibling->set_next_sibling( $elt);
1261
$elt->set_prev_sibling( $prev_sibling);
1264
$elt->set_parent( $parent);
1265
$parent->set_first_child( $elt) unless( $parent->_first_child);
1266
$parent->set_last_child( $elt);
1270
$t->set_root( $elt);
1271
# call dtd handlerif need be
1272
$t->{twig_dtd_handler}->($t, $t->{twig_dtd})
1273
if( defined $t->{twig_dtd_handler});
1275
# set this so we can catch external entities
1276
# (the handler was modified during DTD processing)
1277
if( $t->{twig_default_print})
1278
{ $p->setHandlers( Default => \&_twig_print); }
1279
elsif( $t->{twig_roots})
1280
{ $p->setHandlers( Default => sub { return }); }
1282
{ $p->setHandlers( Default => \&_twig_default); }
1285
$elt->set_empty( $p->recognized_string=~ m{/\s*>$}s ? 1 : 0);
1287
$elt->{extra_data}= $t->{extra_data} if( $t->{extra_data});
1288
$t->{extra_data}='';
1290
# if the element is ID-ed then store that info
1292
if( $id) { $t->{twig_id_list}->{$id}= $elt;
1293
weaken( $t->{twig_id_list}->{$id}) if( $weakrefs);
1297
# call user handler if need be
1298
if( $t->{twig_starttag_handlers})
1299
{ # call all appropriate handlers
1300
my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi, $elt);
1304
foreach my $handler ( @handlers)
1305
{ $handler->($t, $elt) || last; }
1306
# call _all_ handler if needed
1307
if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL})
1308
{ $all->($t, $elt); }
1311
# check if the tag is in the list of tags to be ignored
1312
if( $t->{twig_ignore_elts_handlers})
1313
{ my @handlers= _handler( $t, $t->{twig_ignore_elts_handlers}, $gi, $elt);
1314
# only the first handler counts, it contains the action (discard/print/string)
1315
if( @handlers) { my $action= shift @handlers; $t->ignore( $action); }
1318
if( $elt->att( 'xml:space') && ( $elt->att( 'xml:space') eq 'preserve')) { $t->{twig_preserve_space}++; }
1323
{ my( $t, $gi, $atts)= @_;
1324
foreach my $new_prefix ( $t->parser->new_ns_prefixes)
1325
{ my $uri= $t->parser->expand_ns_prefix( $new_prefix);
1326
# replace the prefix if it is mapped
1327
if( !$t->{twig_keep_original_prefix} && (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri}))
1328
{ $new_prefix= $mapped_prefix; }
1329
# now put the namespace declaration back in the element
1330
if( $new_prefix eq '#default')
1331
{ push @$atts, "xmlns" => $uri; }
1333
{ push @$atts, "xmlns:$new_prefix" => $uri; }
1336
if( $t->{twig_keep_original_prefix})
1337
{ # things become more complex: we need to find the original prefix
1338
# and store both prefixes
1339
my $ns_info= $t->_ns_info( $$gi);
1341
if( $ns_info->{mapped_prefix})
1342
{ $$gi= "$ns_info->{mapped_prefix}:$$gi";
1343
$map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
1349
my $ns_info= $t->_ns_info( $_);
1350
if( $ns_info->{mapped_prefix})
1351
{ $_= "$ns_info->{mapped_prefix}:$_";
1352
$map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
1359
push @$atts, '#original_gi', $map_att if( $map_att);
1362
{ $$gi= $t->_replace_prefix( $$gi);
1365
{ if( $att_name) { $_= $t->_replace_prefix( $_); $att_name=0; }
1366
else { $att_name=1; }
1372
# extract prefix, local_name, uri, mapped_prefix from a name
1373
# will only work if called from a start or end tag handler
1375
{ my( $t, $name)= @_;
1378
$ns_info->{uri}= $p->namespace( $name);
1379
return $ns_info unless( $ns_info->{uri});
1381
$ns_info->{prefix}= _a_proper_ns_prefix( $p, $ns_info->{uri});
1382
$ns_info->{mapped_prefix}= $t->{twig_map_xmlns}->{$ns_info->{uri}} || $ns_info->{prefix};
1387
sub _a_proper_ns_prefix
1388
{ my( $p, $uri)= @_;
1389
foreach my $prefix ($p->current_ns_prefixes)
1390
{ if( $p->expand_ns_prefix( $prefix) eq $uri)
1395
sub _fill_default_atts
1396
{ my( $t, $gi, $atts)= @_;
1397
my $dtd= $t->{twig_dtd};
1398
my $attlist= $dtd->{att}->{$gi};
1400
foreach my $att (keys %$attlist)
1401
{ if( !exists( $value{$att})
1402
&& exists( $attlist->{$att}->{default})
1403
&& ( $attlist->{$att}->{default} ne '#IMPLIED')
1405
{ # the quotes are included in the default, so we need to remove them
1406
my $default_value= substr( $attlist->{$att}->{default}, 1, -1);
1407
push @$atts, $att, $default_value;
1413
# the default function to parse a start tag (in keep_encoding mode)
1414
# can be overridden with the parse_start_tag method
1415
# only works for 1-byte character sets
1416
sub _parse_start_tag
1417
{ my $string= shift;
1420
# get the gi (between < and the first space, / or > character)
1421
#if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s)
1422
if( $string=~ s{^<\s*($REG_NAME)\s*[\s>/]}{}s)
1425
{ croak "error parsing tag '$string'"; }
1426
while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s)
1427
{ push @atts, $1, $3; }
1432
{ my( $t, $elt)= @_;
1433
$t->{twig_root}= $elt;
1435
weaken( $elt->{twig}) if( $weakrefs);
1443
if( $t->{twig_map_xmlns}) { $gi= $t->_replace_prefix( $gi); }
1445
_add_or_discard_stored_spaces( $t);
1447
# the new twig_current is the parent
1448
my $elt= $t->{twig_current};
1449
$elt->del_twig_current;
1451
# if we were parsing PCDATA then we exit the pcdata too
1452
if( $t->{twig_in_pcdata})
1453
{ $t->{twig_in_pcdata}= 0;
1454
$elt= $elt->_parent if($elt->_parent);
1455
$elt->del_twig_current;
1458
# parent is the new current element
1459
my $parent= $elt->_parent;
1460
$parent->set_twig_current if( $parent);
1461
$t->{twig_current}= $parent;
1463
$elt->{extra_data_before_end_tag}= $t->{extra_data} if( $t->{extra_data});
1464
$t->{extra_data}='';
1466
if( $t->{twig_handlers})
1467
{ # look for handlers
1468
my @handlers= _handler( $t, $t->{twig_handlers}, $gi, $elt);
1470
local $_= $elt; # so we can use $_ in the handlers
1472
foreach my $handler ( @handlers)
1473
{ $handler->($t, $elt) || last; }
1474
# call _all_ handler if needed
1475
if( my $all= $t->{twig_handlers}->{handlers}->{$ALL})
1476
{ $all->($t, $elt); }
1479
# if twig_roots is set for the element then set appropriate handler
1480
if( $t->{twig_root_depth} and ($p->depth == $t->{twig_root_depth}) )
1481
{ if( $t->{twig_default_print})
1482
{ # select the proper fh (and store the currently selected one)
1483
$t->_set_fh_to_twig_output_fh();
1484
if( $t->{twig_keep_encoding})
1485
{ $p->setHandlers( %twig_handlers_roots_print_original); }
1487
{ $p->setHandlers( %twig_handlers_roots_print); }
1490
{ $p->setHandlers( %twig_handlers_roots); }
1493
if( $elt->att( 'xml:space') && ( $elt->att( 'xml:space') eq 'preserve')) { $t->{twig_preserve_space}--; }
1496
# return the list of handler that can be activated for an element
1497
# (either of CODE ref's or 1's for twig_roots)
1500
{ my( $t, $handlers, $gi, $elt)= @_;
1502
my @found_handlers=();
1505
# warning: $elt can be either
1506
# - a regular element
1507
# - a ref to the attribute hash (when called for an element
1508
# for which the XML::Twig::Elt has not been built, outside
1509
# of the twig_roots)
1510
# - a string (case of an entity in keep_encoding mode)
1512
# check for an attribute expression with no gi
1513
if( $handlers->{att_handlers})
1514
{ my %att_handlers= %{$handlers->{att_handlers_exp}};
1515
foreach my $att ( keys %att_handlers)
1517
# get the attribute value
1518
if( ref $elt eq 'HASH')
1519
{ $att_val= $elt->{$att}; } # $elt is the atts hash
1520
elsif( isa( $elt,'XML::Twig::Elt'))
1521
{ $att_val= $elt->att($att); } # $elt is an element
1522
if( defined $att_val)
1523
{ my @cond= @{$handlers->{att_handlers_exp}->{$att}};
1524
foreach my $cond (@cond)
1525
{ # 2 cases: either there is a val and the att value should be equal to it
1526
# or there is no val (condition was gi[@att]), just for the att to be defined
1527
if( !defined $cond->{val} || ($att_val eq $cond->{val}) )
1528
{ push @found_handlers, $cond->{handler};}
1534
# check for an attribute regexp expression with no gi
1535
if( $handlers->{att_regexp_handlers})
1536
{ my %att_handlers= %{$handlers->{att_regexp_handlers_exp}};
1537
foreach my $att ( keys %att_handlers)
1539
# get the attribute value
1540
if( ref $elt eq 'HASH')
1541
{ $att_val= $elt->{$att}; } # $elt is the atts hash
1542
elsif( isa( $elt,'XML::Twig::Elt'))
1543
{ $att_val= $elt->att($att); } # $elt is an element
1545
if( defined $att_val)
1546
{ my @cond= @{$handlers->{att_regexp_handlers_exp}->{$att}};
1547
foreach my $cond (@cond)
1548
{ if( $att_val=~ $cond->{regexp})
1549
{ push @found_handlers, $cond->{handler};}
1555
# check for a text expression
1556
if( $handlers->{text_handlers}->{$gi})
1557
{ my @text_handlers= @{$handlers->{text_handlers_exp}->{$gi}};
1558
foreach my $exp ( @text_handlers)
1559
{ if (!$exp->{sub_elt})
1560
{ push @found_handlers, $exp->{handler}
1561
if( $elt->text eq $exp->{text});
1564
{ foreach my $child ($elt->children($exp->{sub_elt}))
1565
{ if( $child->text eq $exp->{text})
1566
{ push @found_handlers, $exp->{handler};
1574
# check for a text regexp expression
1575
if( $handlers->{regexp_handlers}->{$gi})
1576
{ my @regexp_handlers= @{$handlers->{regexp_handlers_exp}->{$gi}};
1577
foreach my $exp ( @regexp_handlers)
1578
{ if( !$exp->{sub_elt})
1579
{ push @found_handlers, $exp->{handler}
1580
if $elt->text =~ $exp->{regexp};
1583
{ foreach my $child ($elt->children($exp->{sub_elt}))
1584
{ if( $child->text =~ $exp->{regexp})
1585
{ push @found_handlers, $exp->{handler};
1593
# check for an attribute expression
1594
if( $handlers->{attcond_handlers}->{$gi})
1595
{ my @attcond_handlers= @{$handlers->{attcond_handlers_exp}->{$gi}};
1596
foreach my $exp ( @attcond_handlers)
1598
# get the attribute value
1599
if( ref $elt eq 'HASH')
1600
{ $att_val= $elt->{$exp->{att}}; } # $elt is the atts hash
1602
{ $att_val= $elt->att( $exp->{att}); }# $elt is an element
1604
# 2 cases: either there is a val and the att value should be equal to it
1605
# or there is no val (condition was gi[@att]), just for the att to be defined
1606
if( defined $att_val && ( !defined $exp->{val} || ($att_val eq $exp->{val}) ) )
1607
{ push @found_handlers, $exp->{handler}; }
1611
# check for an attribute regexp
1612
if( $handlers->{attregexp_handlers}->{$gi})
1613
{ my @attregexp_handlers= @{$handlers->{attregexp_handlers_exp}->{$gi}};
1614
foreach my $exp ( @attregexp_handlers)
1616
# get the attribute value
1617
if( ref $elt eq 'HASH')
1618
{ $att_val= $elt->{$exp->{att}}; } # $elt is the atts hash
1620
{ $att_val= $elt->att( $exp->{att}); }# $elt is an element
1622
if( defined $att_val && ( ($att_val=~ $exp->{regexp}) ) )
1623
{ push @found_handlers, $exp->{handler}; }
1627
# check for a full path
1628
if( defined $handlers->{path_handlers}->{gi}->{$gi})
1629
{ my $path= $t->path( $gi);
1630
if( defined( $found_handler= $handlers->{path_handlers}->{path}->{$path}) )
1631
{ push @found_handlers, $found_handler; }
1634
# check for a partial path
1635
if( $handlers->{subpath_handlers}->{gi}->{$gi})
1636
{ my $path= $t->path( $gi);
1638
{ # test each sub path
1639
if( defined( $found_handler= $handlers->{subpath_handlers}->{path}->{$path}) )
1640
{ push @found_handlers, $found_handler; }
1641
$path=~ s{^[^/]*/?}{}; # remove initial gi and /
1645
# check for a gi (simple gi's are stored directly in the handlers field)
1646
if( defined $handlers->{handlers}->{gi}->{$gi})
1647
{ push @found_handlers, $handlers->{handlers}->{gi}->{$gi}; }
1649
if( defined $handlers->{handlers}->{level}->{$t->depth})
1650
{ push @found_handlers, $handlers->{handlers}->{level}->{$t->depth}; }
1652
# if no handler found call default handler if defined
1653
if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT})
1654
{ push @found_handlers, $handlers->{handlers}->{$DEFAULT}; }
1656
if( @found_handlers and $t->{twig_do_not_chain_handlers})
1657
{ @found_handlers= ($found_handlers[0]); }
1659
return @found_handlers; # empty if no handler found
1665
{ my( $t, $name)= @_;
1667
my $uri= $p->namespace( $name);
1668
# try to get the namespace from default if none is found (for attributes)
1669
# this should probably be an option
1670
if( !$uri and( $name!~/^xml/)) { $uri= $p->expand_ns_prefix( '#default'); }
1672
{ if (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri})
1673
{ return "$mapped_prefix:$name"; }
1675
{ my $prefix= _a_proper_ns_prefix( $p, $uri);
1676
return $prefix ? "$prefix:$name" : $name;
1685
my ($p, $string)= @_;
1688
# if keep_encoding was set then use the original string instead of
1689
# the parsed (UTF-8 converted) one
1690
if( $t->{twig_keep_encoding})
1691
{ $string= $p->original_string(); }
1693
if( $t->{twig_input_filter})
1694
{ $string= $t->{twig_input_filter}->( $string); }
1696
if( $t->{twig_char_handler})
1697
{ $string= $t->{twig_char_handler}->( $string); }
1699
my $elt= $t->{twig_current};
1701
if( $t->{twig_in_cdata})
1702
{ # text is the continuation of a previously created pcdata
1703
$elt->append_cdata( $t->{twig_stored_spaces}.$string); }
1704
elsif( $t->{twig_in_pcdata})
1705
{ # text is the continuation of a previously created cdata
1706
if( $t->{extra_data})
1707
{ $elt->{extra_data_in_pcdata} ||=[];
1708
push @{$elt->{extra_data_in_pcdata}}, { text => $t->{extra_data}, offset => length( $elt->{pcdata}) };
1709
$t->{extra_data}='';
1711
$elt->append_pcdata( $string);
1714
{ # text is just space, which might be discarded later
1715
if( $string=~/\A\s*\Z/s)
1716
{ if( $t->{extra_data})
1717
{ # we got extra data (comment, pi), lets add the spaces to it
1718
$t->{extra_data} .= $string;
1721
{ # no extra data, just store the spaces
1722
$t->{twig_stored_spaces}.= $string;
1726
{ my $new_elt= _insert_pcdata( $t, $t->{twig_stored_spaces}.$string);
1727
$elt->del_twig_current;
1728
$new_elt->set_twig_current;
1729
$t->{twig_current}= $new_elt;
1730
$t->{twig_in_pcdata}=1;
1731
if( $t->{extra_data})
1732
{ $new_elt->{extra_data_in_pcdata}=[];
1733
push @{$new_elt->{extra_data_in_pcdata}}, { text => $t->{extra_data}, offset => 0 };
1734
$t->{extra_data}='';
1740
sub _twig_cdatastart
1745
$t->{twig_in_cdata}=1;
1746
my $cdata= $t->{twig_elt_class}->new( '#CDATA');
1747
my $twig_current= $t->{twig_current};
1749
if( $t->{twig_in_pcdata})
1750
{ # create the node as a sibling of the #PCDATA
1751
$cdata->set_prev_sibling( $twig_current);
1752
$twig_current->set_next_sibling( $cdata);
1753
my $parent= $twig_current->_parent;
1754
$cdata->set_parent( $parent);
1755
$parent->set_last_child( $cdata);
1756
$t->{twig_in_pcdata}=0;
1759
{ # we have to create a PCDATA element if we need to store spaces
1760
if( $t->_space_policy($twig_current->gi) && $t->{twig_stored_spaces})
1761
{ _insert_pcdata( $t, $t->{twig_stored_spaces}); }
1762
$t->{twig_stored_spaces}='';
1764
# create the node as a child of the current element
1765
$cdata->set_parent( $twig_current);
1766
if( my $prev_sibling= $twig_current->_last_child)
1767
{ $cdata->set_prev_sibling( $prev_sibling);
1768
$prev_sibling->set_next_sibling( $cdata);
1771
{ $twig_current->set_first_child( $cdata); }
1772
$twig_current->set_last_child( $cdata);
1776
$twig_current->del_twig_current;
1777
$t->{twig_current}= $cdata;
1778
$cdata->set_twig_current;
1779
if( $t->{extra_data}) { $cdata->set_extra_data( $t->{extra_data}); $t->{extra_data}='' };
1787
$t->{twig_in_cdata}=0;
1789
my $elt= $t->{twig_current};
1790
$elt->del_twig_current;
1791
my $cdata= $elt->cdata;
1792
$elt->set_cdata( $cdata);
1794
if( $t->{twig_handlers})
1795
{ # look for handlers
1796
my @handlers= _handler( $t, $t->{twig_handlers}, CDATA, $elt);
1797
local $_= $elt; # so we can use $_ in the handlers
1798
foreach my $handler ( @handlers) { $handler->($t, $elt) || last; }
1801
$elt= $elt->_parent;
1802
$t->{twig_current}= $elt;
1803
$elt->set_twig_current;
1808
my( $p, $target, $data)= @_;
1810
return unless( $t->{twig_process_pi} || $t->{twig_keep_pi});
1812
if( $t->{twig_input_filter})
1813
{ $target = $t->{twig_input_filter}->( $target) ;
1814
$data = $t->{twig_input_filter}->( $data) ;
1817
my $twig_current= $t->{twig_current}; # always defined
1819
# if pi's are to be kept then we piggiback them to the current element
1820
if( $t->{twig_keep_pi})
1822
if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target})
1823
{ $t->{extra_data}.= $handler->( $t, $target, $data); }
1824
elsif( $handler= $t->{twig_handlers}->{pi_handlers}->{''})
1825
{ $t->{extra_data}.= $handler->( $t, $target, $data); }
1827
{ if( $t->{twig_stored_spaces})
1828
{ $t->{extra_data}.= $t->{twig_stored_spaces};
1829
$t->{twig_stored_spaces}= '';
1831
# then the recognized/original string (input filtered if needed)
1832
my $extra_data= $XML::Twig::Elt::keep_encoding ? $p->recognized_string() : $p->original_string();
1833
$extra_data= $t->{twig_input_filter}->( $extra_data) if( $t->{twig_input_filter});
1834
$t->{extra_data}.= $extra_data;
1840
{ # pi's are processed
1841
my $pi= $t->{twig_elt_class}->new( PI);
1842
$pi->set_pi( $target, $data);
1845
{ _pi_handlers( $t, $pi, $target);
1846
_add_prolog_data( $t, $pi);
1850
if( $t->{twig_in_pcdata})
1851
{ # create the node as a sibling of the #PCDATA
1852
$pi->paste_after( $twig_current);
1853
$t->{twig_in_pcdata}=0;
1856
{ # we have to create a PCDATA element if we need to store spaces
1857
if( $t->_space_policy($twig_current->gi) && $t->{twig_stored_spaces})
1858
{ _insert_pcdata( $t, $t->{twig_stored_spaces}); }
1859
$t->{twig_stored_spaces}='';
1860
# create the node as a child of the current element
1861
$pi->paste_last_child( $twig_current);
1864
$twig_current->del_twig_current;
1865
my $parent= $pi->_parent;
1866
$t->{twig_current}= $parent;
1867
$parent->set_twig_current;
1869
_pi_handlers( $t, $pi, $target);
1875
{ my( $t, $pi, $target)= @_;
1876
if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target})
1877
{ local $_= $pi; $handler->( $t, $pi); }
1878
elsif( $handler= $t->{twig_handlers}->{pi_handlers}->{''})
1879
{ local $_= $pi; $handler->( $t, $pi); }
1885
my( $p, $comment_text)= @_;
1887
return unless( $t->{twig_process_comments} || $t->{twig_keep_comments});
1889
my $twig_current= $t->{twig_current}; # always defined
1891
# if comments are to be kept then we piggiback them to the current element
1892
if( $t->{twig_keep_comments})
1893
{ # first add spaces
1894
if( $t->{twig_stored_spaces})
1895
{ $t->{extra_data}.= $t->{twig_stored_spaces};
1896
$t->{twig_stored_spaces}= '';
1898
# then the recognized/original string (input filtered if needed)
1899
my $extra_data= $XML::Twig::Elt::keep_encoding ? $p->recognized_string() : $p->original_string();
1900
$extra_data= $t->{twig_input_filter}->( $extra_data) if( $t->{twig_input_filter});
1901
$t->{extra_data}.= $extra_data;
1905
$comment_text= $t->{twig_input_filter}->( $comment_text) if( $t->{twig_input_filter});
1907
my $comment= $t->{twig_elt_class}->new( COMMENT);
1908
$comment->set_comment( $comment_text);
1911
{ _add_prolog_data( $t, $comment);
1912
_comment_handler( $t, $comment);
1916
if( $t->{twig_in_pcdata})
1917
{ # create the node as a sibling of the #PCDATA
1918
$comment->paste_after( $twig_current);
1919
$t->{twig_in_pcdata}=0;
1922
{ # we have to create a PCDATA element if we need to store spaces
1923
if( $t->_space_policy($twig_current->gi) && $t->{twig_stored_spaces})
1924
{ _insert_pcdata( $t, $t->{twig_stored_spaces}); }
1925
$t->{twig_stored_spaces}='';
1926
# create the node as a child of the current element
1927
$comment->paste_last_child( $twig_current);
1930
_comment_handler( $t, $comment);
1932
$twig_current->del_twig_current;
1934
my $parent= $comment->_parent;
1935
$t->{twig_current}= $parent;
1936
$parent->set_twig_current;
1940
sub _comment_handler
1941
{ my( $t, $comment)= @_;
1942
if( $t->{twig_handlers}->{handlers}->{gi}->{'#COMMENT'})
1943
{ # look for handlers
1945
my @handlers= _handler( $t, $t->{twig_handlers}, '#COMMENT', $comment);
1946
foreach my $handler ( @handlers)
1947
{ $handler->($t, $comment) || last; }
1952
sub _add_prolog_data
1953
{ my($t, $prolog_data)= @_;
1954
# comment before the first element
1955
$t->{prolog_data} ||= $t->{twig_elt_class}->new( '#PROLOG_DATA');
1956
# create the node as a child of the current element
1957
$prolog_data->paste_last_child( $t->{prolog_data});
1965
# restore the selected filehandle if needed
1966
$t->_set_fh_to_selected_fh();
1968
select $t->{twig_original_selected_fh} if($t->{twig_original_selected_fh}); # probably dodgy
1970
# tries to clean-up (probably not very well at the moment)
1972
undef $t->{twig_parser};
1974
undef $t->{twig_parsing};
1980
{ my( $t, $string)= @_;
1981
# create a new #PCDATA element
1982
my $parent= $t->{twig_current}; # always defined
1983
my $elt= $t->{twig_elt_class}->new( PCDATA);
1984
$elt->set_pcdata( $string);
1985
my $prev_sibling= $parent->_last_child;
1987
{ $prev_sibling->set_next_sibling( $elt);
1988
$elt->set_prev_sibling( $prev_sibling);
1991
{ $parent->set_first_child( $elt); }
1993
$elt->set_parent( $parent);
1994
$parent->set_last_child( $elt);
1995
$t->{twig_stored_spaces}='';
2002
$policy=0 if( $t->{twig_discard_spaces});
2003
$policy=1 if( $t->{twig_keep_spaces});
2004
$policy=1 if( $t->{twig_keep_spaces_in}
2005
&& $t->{twig_keep_spaces_in}->{$gi});
2006
$policy=0 if( $t->{twig_discard_spaces_in}
2007
&& $t->{twig_discard_spaces_in}->{$gi});
2012
sub _twig_entity($$$$$$)
2014
my( $p, $name, $val, $sysid, $pubid, $ndata)= @_;
2016
my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata);
2017
$t->{twig_entity_list}->add( $ent);
2018
if( $parser_version > 2.27)
2019
{ # this is really ugly, but with some versions of XML::Parser the value
2020
# of the entity is not properly returned by the default handler
2021
my $ent_decl= $ent->text;
2022
if( $t->{twig_keep_encoding})
2023
{ if( defined $ent->{val} && ($ent_decl !~ /["']/))
2024
{ my $val= $ent->{val};
2025
$ent_decl .= $val =~ /"/ ? qq{'$val' } : qq{"$val" };
2027
# for my solaris box (perl 5.6.1, XML::Parser 2.31, expat?)
2028
$t->{twig_doctype}->{internal}=~ s{<!ENTITY\s+$name\s+$}{substr( $ent_decl, 0, -1)}e;
2030
$t->{twig_doctype}->{internal} .= $ent_decl
2031
unless( $t->{twig_doctype}->{internal}=~ m{<!ENTITY\s+$name\s+});
2039
$t->{twig_xmldecl}||={}; # could have been set by set_output_encoding
2040
$t->{twig_xmldecl}->{version}= shift;
2041
$t->{twig_xmldecl}->{encoding}= shift;
2042
$t->{twig_xmldecl}->{standalone}= shift;
2047
my( $p, $name, $sysid, $pub, $internal)= @_;
2049
$t->{twig_doctype}||= {}; # create
2050
$t->{twig_doctype}->{name}= $name; # always there
2051
$t->{twig_doctype}->{sysid}= $sysid; #
2052
$t->{twig_doctype}->{pub}= $pub; #
2054
# now let's try to cope with XML::Parser 2.28 and above
2055
if( $parser_version > 2.27)
2056
{ @saved_default_handler= $p->setHandlers( Default => \&_twig_store_internal_dtd,
2057
Entity => \&_twig_entity,
2059
$p->setHandlers( DoctypeFin => \&_twig_stop_storing_internal_dtd);
2060
$t->{twig_doctype}->{internal}='';
2063
# for XML::Parser before 2.28
2064
{ $t->{twig_doctype}->{internal}=$internal; }
2066
# now check if we want to get the DTD info
2067
if( $t->{twig_read_external_dtd} && $sysid)
2068
{ # let's build a fake document with an internal DTD
2070
my $tmpfile= "twig_tmp$$";
2071
open( TMP, ">$tmpfile")
2072
or croak "cannot create temp file $tmpfile: $!";
2073
print TMP "<!DOCTYPE $name [\n"; # print the doctype
2075
{ open( DTD, "<$sysid")
2076
or croak "cannot open dtd file $sysid: $!";
2080
print TMP $dtd; # add the dtd
2082
print TMP "]>"; # close the dtd
2083
print TMP "<$name/>\n"; # XML::Parser needs an element
2087
$t->save_global_state(); # save the globals (they will be reset by the following new)
2088
my $t_dtd= XML::Twig->new; # create a temp twig
2089
$t->restore_global_state();
2090
$t_dtd->parsefile( $tmpfile); # parse it
2091
$t->{twig_dtd}= $t_dtd->{twig_dtd}; # grab the dtd info
2092
#$t->{twig_dtd_is_external}=1;
2093
$t->{twig_entity_list}= $t_dtd->{twig_entity_list}; # grab the entity info
2102
my( $p, $name, $model)= @_;
2104
$t->{twig_dtd}||= {}; # may create the dtd
2105
$t->{twig_dtd}->{model}||= {}; # may create the model hash
2106
$t->{twig_dtd}->{elt_list}||= []; # ordered list of elements
2107
push @{$t->{twig_dtd}->{elt_list}}, $name; # store the elt
2108
$t->{twig_dtd}->{model}->{$name}= $model; # store the model
2109
if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) )
2110
{ my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string;
2112
{ # this version of XML::Parser does not return the text in the *_string method
2113
# we need to rebuild it
2114
$text= "<!ELEMENT $name $model>";
2116
$t->{twig_doctype}->{internal} .= $text;
2122
my( $p, $gi, $att, $type, $default, $fixed)= @_;
2123
#warn "in attlist: gi: '$gi', att: '$att', type: '$type', default: '$default', fixed: '$fixed'\n";
2125
$t->{twig_dtd}||= {}; # create dtd if need be
2126
$t->{twig_dtd}->{$gi}||= {}; # create elt if need be
2127
#$t->{twig_dtd}->{$gi}->{att}||= {}; # create att if need be
2128
if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) )
2129
{ my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string;
2131
{ # this version of XML::Parser does not return the text in the *_string method
2132
# we need to rebuild it
2133
my $att_decl="$att $type";
2134
$att_decl .= " #FIXED" if( $fixed);
2135
$att_decl .= " $default" if( defined $default);
2136
# 2 cases: there is already an attlist on that element or not
2137
if( $t->{twig_dtd}->{att}->{$gi})
2138
{ # there is already an attlist, add to it
2139
$t->{twig_doctype}->{internal}=~ s{(<!ATTLIST\s*$gi )(.*?)\n?>}
2140
{ "$1$2\n" . ' ' x length( $1) . "$att_decl\n>"}es;
2143
{ # create the attlist
2144
$t->{twig_doctype}->{internal}.= "<!ATTLIST $gi $att_decl>"
2148
$t->{twig_dtd}->{att}->{$gi}->{$att}= {} ;
2149
$t->{twig_dtd}->{att}->{$gi}->{$att}->{type}= $type;
2150
$t->{twig_dtd}->{att}->{$gi}->{$att}->{default}= $default if( defined $default);
2151
$t->{twig_dtd}->{att}->{$gi}->{$att}->{fixed}= $fixed;
2156
my( $p, $string)= @_;
2160
# process only if we have an entity
2161
return unless( $string=~ m{^&([^;]*);$});
2162
# the entity has to be pure pcdata, or we have a problem
2163
if( ($p->original_string=~ m{^<}) && ($p->original_string=~ m{>$}) )
2164
{ # string is a tag, entity is in an attribute
2165
$t->{twig_entities_in_attribute}=1 if( $t->{twig_do_not_escape_amp_in_atts});
2169
if( $t->{twig_keep_encoding})
2170
{ _twig_char( $p, $string);
2171
$ent= substr( $string, 1, -1);
2174
{ $ent= _twig_insert_ent( $t, $string);
2181
sub _twig_insert_ent
2183
my( $t, $string)=@_;
2185
my $twig_current= $t->{twig_current};
2187
my $ent= $t->{twig_elt_class}->new( '#ENT');
2188
$ent->set_ent( $string);
2190
_add_or_discard_stored_spaces( $t, force => 0);
2192
if( $t->{twig_in_pcdata})
2193
{ # create the node as a sibling of the #PCDATA
2195
$ent->set_prev_sibling( $twig_current);
2196
$twig_current->set_next_sibling( $ent);
2197
my $parent= $twig_current->_parent;
2198
$ent->set_parent( $parent);
2199
$parent->set_last_child( $ent);
2200
# the twig_current is now the parent
2201
$twig_current->del_twig_current;
2202
$t->{twig_current}= $parent;
2204
$t->{twig_in_pcdata}=0;
2207
{ # create the node as a child of the current element
2208
$ent->set_parent( $twig_current);
2209
if( my $prev_sibling= $twig_current->_last_child)
2210
{ $ent->set_prev_sibling( $prev_sibling);
2211
$prev_sibling->set_next_sibling( $ent);
2214
{ $twig_current->set_first_child( $ent) if( $twig_current); }
2215
$twig_current->set_last_child( $ent) if( $twig_current);
2218
# meant to trigger entity handler, does not seem to be activated at this time
2219
#if( my $handler= $t->{twig_handlers}->{gi}->{'#ENT'})
2220
# { local $_= $ent; $handler->( $t, $ent); }
2226
{ return $_[0]->{twig_parser}; }
2228
# returns the declaration text (or a default one)
2231
return '' unless( $t->{twig_xmldecl} || $t->{output_encoding});
2233
my $decl= $t->{twig_xmldecl};
2235
{ my $version= $decl->{version};
2236
$decl_string= q{<?xml};
2237
$decl_string .= qq{ version="$version"};
2239
# encoding can either have been set (in $decl->{output_encoding})
2240
# or come from the document (in $decl->{encoding})
2241
if( $t->{output_encoding})
2242
{ my $encoding= $t->{output_encoding};
2243
$decl_string .= qq{ encoding="$encoding"};
2245
elsif( $decl->{encoding})
2246
{ my $encoding= $decl->{encoding};
2247
$decl_string .= qq{ encoding="$encoding"};
2250
if( defined( $decl->{standalone}))
2251
{ $decl_string .= q{ standalone="};
2252
$decl_string .= $decl->{standalone} ? "yes" : "no";
2253
$decl_string .= q{"};
2256
$decl_string .= "?>\n";
2259
{ my $encoding= $t->{output_encoding};
2260
$decl_string= qq{<?xml version="1.0" encoding="$encoding"?>};
2263
my $output_filter= XML::Twig::Elt::output_filter();
2264
return $output_filter ? $output_filter->( $decl_string) : $decl_string;
2267
# returns the doctype text (or none)
2268
# that's the doctype just has it was in the original document
2271
my $doctype= $t->{twig_doctype} or return '';
2272
my $string= "<!DOCTYPE " . $doctype->{name};
2273
$string .= qq{ SYSTEM "$doctype->{sysid}"} if( $doctype->{sysid});
2274
$string .= qq{ PUBLIC "$doctype->{pub}" } if( $doctype->{pub});
2275
if( $doctype->{internal})
2276
{ # !@#$%^&* code to deal with various expat/XML::Parser versions
2277
$string.= " [" unless( $doctype->{internal}=~ m{^\s*\[});
2278
$string.= " " if( $doctype->{internal}=~ m{^\[});
2279
$string .= $doctype->{internal};
2280
$string=~ s{\n?]?>?$}{\n]>};
2286
{ my( $t, $name, $system, $public, $internal)= @_;
2287
$t->{twig_doctype}= {};
2288
my $doctype= $t->{twig_doctype};
2289
$doctype->{name} = $name if( defined $name);
2290
$doctype->{sysid} = $system if( defined $system);
2291
$doctype->{pub} = $public if( defined $public);
2292
$doctype->{internal} = $internal if( defined $internal);
2295
# return the dtd object
2298
return $t->{twig_dtd};
2301
# return an element model, or the list of element models
2305
return $t->dtd->{model}->{$elt} if( $elt);
2306
return sort keys %{$t->dtd->{model}};
2310
# return the entity_list object
2313
return $t->{twig_entity_list};
2316
# return the list of entity names
2319
return sort keys %{$t->{twig_entity_list}} ;
2322
# return the entity object
2325
my $entity_name= shift;
2326
return $t->{twig_entity_list}->{$entity_name};
2332
my $fh= _is_fh($_[0]) ? shift : $t->{twig_output_fh} || select() || \*STDOUT;
2334
print {$fh} $t->prolog( @_);
2339
my %args= _normalize_args( @_);
2342
return $prolog if( $t->{no_prolog});
2344
my $update_dtd = $args{UpdateDTD} || '';
2346
$prolog .= $t->xmldecl;
2347
return $prolog if( defined( $t->{no_dtd_output}));
2351
my $doctype= $t->{twig_doctype};
2355
{ $dtd .= "<!DOCTYPE ".$doctype->{name};
2356
$dtd .= " PUBLIC \"$doctype->{pub}\"" if( $doctype->{pub});
2357
$dtd .= " SYSTEM \"$doctype->{sysid}\"" if( $doctype->{sysid} && !$doctype->{pub});
2359
$dtd .= $t->{twig_entity_list}->text;
2362
elsif( !$t->{'twig_dtd'} && keys %{$t->entity_list})
2363
{ $dtd .= "<!DOCTYPE " . $t->root->gi . " [\n" . $t->entity_list->text . "\n]>"; }
2365
{ my $dtd= $t->{'twig_dtd'};
2366
$dtd .= $t->dtd_text;
2371
$dtd .= "<!DOCTYPE ". $doctype->{name} if( $doctype->{name});
2372
$dtd .= " PUBLIC \"$doctype->{pub}\"" if( $doctype->{pub});
2373
$dtd .= " SYSTEM" if( $doctype->{sysid} && !$doctype->{pub});
2374
$dtd .= ' "' . $doctype->{sysid} . '"' if( $doctype->{sysid});
2375
if( my $internal= $doctype->{internal})
2376
{ # add opening and closing brackets if not already there
2377
# plus some spaces and newlines for a nice formating
2378
# I test it here because I can't remember which version of
2379
# XML::Parser need it or not, so this is actually quite safe
2380
$internal=~ s{^\s*(\[\s*)?}{ [\n};
2381
$internal=~ s{\s*(\]\s*(>\s*)?)?\s*$}{\n]>\n};
2388
# terrible hack, as I can't figure out in which case the darn prolog
2389
# should get an extra >
2390
$dtd=~ s/(>\s*)*$/>\n/ if($dtd);
2394
my $output_filter= XML::Twig::Elt::output_filter();
2395
return $output_filter ? $output_filter->( $prolog) : $prolog;
2401
sub _print_prolog_data
2403
my $fh= _is_fh($_[0]) ? shift : undef;
2404
if( $fh) { print $fh $t->_prolog_data( @_); }
2405
else { print $t->_prolog_data( @_); }
2410
return'' unless( $t->{prolog_data});
2411
my $prolog_data_text='';
2412
foreach ( $t->{prolog_data}->children)
2413
{ $prolog_data_text .= $_->sprint . "\n"; }
2414
return$ prolog_data_text;
2419
my $fh= _is_fh( $_[0]) ? shift : undef;
2420
my %args= _normalize_args( @_);
2423
if( defined $args{PrettyPrint})
2424
{ $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
2425
delete $args{PrettyPrint};
2428
my $old_empty_tag_style;
2429
if( defined $args{EmptyTags})
2430
{ $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
2431
delete $args{EmptyTags};
2435
{ $t->print_prolog( $fh, %args);
2436
$t->_print_prolog_data( $fh, %args);
2439
{ $t->print_prolog( %args);
2440
$t->_print_prolog_data( %args);
2443
$t->{twig_root}->print( $fh) if( $t->{twig_root});
2444
$t->set_pretty_print( $old_pretty) if( defined $old_pretty);
2445
$t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
2451
my $fh= _is_fh( $_[0]) ? shift : undef;
2452
my $old_select= defined $fh ? select $fh : undef;
2453
my $up_to= ref $_[0] ? shift : undef;
2454
my %args= _normalize_args( @_);
2457
if( defined $args{PrettyPrint})
2458
{ $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
2459
delete $args{PrettyPrint};
2462
my $old_empty_tag_style;
2463
if( $args{EmptyTags})
2464
{ $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
2465
delete $args{EmptyTags};
2469
# the "real" last element processed, as _twig_end has closed it
2472
{ $last_elt= $up_to; }
2473
elsif( $t->{twig_current})
2474
{ $last_elt= $t->{twig_current}->_last_child; }
2476
{ $last_elt= $t->{twig_root}; }
2478
# flush the DTD unless it has ready flushed (ie root has been flushed)
2479
my $elt= $t->{twig_root};
2480
$t->print_prolog( %args) unless( $elt->_flushed);
2484
if( $last_elt && $last_elt->in( $elt))
2486
unless( $elt->_flushed)
2487
{ # just output the front tag
2488
print $elt->start_tag();
2491
$next_elt= $elt->_first_child;
2494
{ # an element before the last one or the last one,
2495
$next_elt= $elt->_next_sibling;
2498
last if( $last_elt && ($elt == $last_elt));
2502
select $old_select if( defined $old_select);
2503
$t->set_pretty_print( $old_pretty) if( defined $old_pretty);
2504
$t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
2508
# flushes up to an element
2509
# this method just reorders the arguments and calls flush
2515
$t->flush( $fh, $up_to, @_);
2518
{ $t->flush( $up_to, @_); }
2522
# same as print except the entire document text is returned as a string
2525
my %args= _normalize_args( @_);
2528
if( defined $args{PrettyPrint})
2529
{ $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
2530
delete $args{PrettyPrint};
2533
my $old_empty_tag_style;
2534
if( defined $args{EmptyTags})
2535
{ $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
2536
delete $args{EmptyTags};
2539
my $prolog= $t->prolog( %args) || '';
2540
my $prolog_data= $t->_prolog_data( %args) || '';
2542
my $string= $prolog . $prolog_data . $t->{twig_root}->sprint;
2544
$t->set_pretty_print( $old_pretty) if( defined $old_pretty);
2545
$t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
2551
# this method discards useless elements in a tree
2552
# it does the same thing as a flush except it does not print it
2553
# the second argument is an element, the last purged element
2554
# (this argument is usually set through the purge_up_to method)
2559
# the "real" last element processed, as _twig_end has closed it
2562
{ $last_elt= $up_to; }
2563
elsif( $t->{twig_current})
2564
{ $last_elt= $t->{twig_current}->_last_child; }
2566
{ $last_elt= $t->{twig_root}; }
2568
my $elt= $t->{twig_root};
2572
if( $last_elt && $last_elt->in( $elt))
2573
{ $elt->_set_flushed;
2574
$next_elt= $elt->_first_child;
2577
{ # an element before the last one or the last one,
2578
$next_elt= $elt->_next_sibling;
2580
last if( $last_elt && ($elt == $last_elt) );
2586
# flushes up to an element. This method just calls purge
2594
{ return $_[0]->{twig_root}; }
2596
# create accessor methods on attribute names
2597
sub create_accessors
2598
{ my $twig_or_class= shift;
2599
my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
2603
foreach my $att (@_)
2604
{ croak "attempt to redefine existing method $att using create_accessors"
2605
if( $elt_class->can( $att));
2606
*{"$elt_class\::$att"}=
2609
if( @_) { $elt->{att}->{$att}= $_[0]; }
2610
$elt->{att}->{$att};
2616
#start-extract twig_document (used to generate XML::(DOM|GDOME)::Twig)
2618
{ my( $t, $cond)= @_;
2619
my $root= $t->root || return undef;
2620
return $root if( $root->passes( $cond));
2621
return $root->next_elt( $cond);
2625
{ my( $t, $offset, $cond)= @_;
2626
$offset -- if( $t->root->matches( $cond) );
2627
return $t->root->next_n_elt( $offset, $cond);
2632
if( isa( $_[0], 'ARRAY'))
2633
{ my $elt_array= shift;
2634
return _unique_elts( map { $_->get_xpath( @_) } @$elt_array);
2637
{ return $twig->root->get_xpath( @_); }
2640
# get a list of elts and return a sorted list of unique elts
2642
{ my @sorted= sort { $a ->cmp( $b) } @_;
2644
while( my $current= shift @sorted)
2645
{ push @unique, $current unless( @unique && ($unique[-1] == $current)); }
2651
if( isa( $_[0], 'ARRAY'))
2652
{ my $elt_array= shift;
2653
return join( '', map { $_->findvalue( @_) } @$elt_array);
2656
{ return $twig->root->findvalue( @_); }
2661
XML::Twig::Elt->set_id_seed( @_);
2664
# return an array ref to an index, or undef
2666
{ my( $twig, $name, $index)= @_;
2667
return defined( $index) ? $twig->{_twig_index}->{$name}->[$index] : $twig->{_twig_index}->{$name};
2670
# return a list with just the root
2671
# if a condition is given then return an empty list unless the root matches
2673
{ my( $t, $cond)= @_;
2675
unless( $cond && !($root->passes( $cond)) )
2682
{ return ($_[0]->root); }
2685
{ my( $t, $cond)= @_;
2687
if( $root->passes( $cond) )
2688
{ return ($root, $root->descendants( $cond)); }
2690
{ return ( $root->descendants( $cond)); }
2695
$t->root->simplify( @_);
2700
$t->root->subs_text( @_);
2703
#end-extract twig_document
2705
sub set_keep_encoding
2706
{ return XML::Twig::Elt::set_keep_encoding( @_); }
2708
sub set_expand_external_entities
2709
{ return XML::Twig::Elt::set_expand_external_entities( @_); }
2711
# WARNING: at the moment the id list is not updated reliably
2713
{ return $_[0]->{twig_id_list}->{$_[1]}; }
2715
# change it in ALL twigs at the moment
2717
{ my( $twig, $old_gi, $new_gi)= @_;
2719
return unless($index= $XML::Twig::gi2index{$old_gi});
2720
$XML::Twig::index2gi[$index]= $new_gi;
2721
delete $XML::Twig::gi2index{$old_gi};
2722
$XML::Twig::gi2index{$new_gi}= $index;
2726
# builds the DTD from the stored (possibly updated) data
2729
my $dtd= $t->{twig_dtd};
2730
my $doctype= $t->{twig_doctype} or return '';
2731
my $string= "<!DOCTYPE ".$doctype->{name};
2733
unless( $parser_version > 3.27) { $string .= " [\n"; }
2735
foreach my $gi (@{$dtd->{elt_list}})
2736
{ $string.= "<!ELEMENT $gi ".$dtd->{model}->{$gi}.">\n" ;
2737
if( $dtd->{att}->{$gi})
2738
{ my $attlist= $dtd->{att}->{$gi};
2739
$string.= "<!ATTLIST $gi\n";
2740
foreach my $att ( sort keys %{$attlist})
2741
{ $string.= " $att $attlist->{$att}->{type} ".
2742
"$attlist->{$att}->{default}";
2743
if( $attlist->{$att}->{fixed})
2744
{ $string .= " #FIXED"};
2750
$string.= $t->entity_list->text if( $t->entity_list);
2755
# prints the DTD from the stored (possibly updated) data
2758
my $fh= _is_fh( $_[0]) ? shift : undef;
2759
if( $fh) { print $fh $t->dtd_text; }
2760
else { print $t->dtd_text; }
2763
# build the subs that call directly expat
2765
{ my @expat_methods= qw( depth in_element within_element context
2766
current_line current_column current_byte
2767
recognized_string original_string
2770
base current_element element_index
2771
position_in_context);
2772
foreach my $method (@expat_methods)
2774
*{$method}= sub { my $t= shift;
2775
croak "calling $method after parsing is finished"
2776
unless( $t->{twig_parsing});
2777
return $t->{twig_parser}->$method(@_);
2784
if( $t->{twig_map_xmlns})
2785
{ return "/" . join( "/", map { $t->_replace_prefix( $_)} ($t->{twig_parser}->context, $gi)); }
2787
{ return "/" . join( "/", ($t->{twig_parser}->context, $gi)); }
2792
return $t->{twig_parser}->finish;
2795
# just finish the parse by printing the rest of the document
2799
unless( defined $fh)
2800
{ $t->_set_fh_to_twig_output_fh(); }
2802
{ $old_fh= select $fh;
2803
$t->{twig_original_selected_fh}= $old_fh if( $old_fh);
2806
my $p=$t->{twig_parser};
2807
if( $t->{twig_keep_encoding})
2808
{ $p->setHandlers( %twig_handlers_finish_print); }
2810
{ $p->setHandlers( %twig_handlers_finish_print_original); }
2813
sub set_remove_cdata { return XML::Twig::Elt::set_remove_cdata( @_); }
2815
sub output_filter { return XML::Twig::Elt::output_filter( @_); }
2816
sub set_output_filter { return XML::Twig::Elt::set_output_filter( @_); }
2818
sub output_text_filter { return XML::Twig::Elt::output_text_filter( @_); }
2819
sub set_output_text_filter { return XML::Twig::Elt::set_output_text_filter( @_); }
2821
sub set_input_filter
2822
{ my( $t, $input_filter)= @_;
2823
my $old_filter= $t->{twig_input_filter};
2824
if( !$input_filter || isa( $input_filter, 'CODE') )
2825
{ $t->{twig_input_filter}= $input_filter; }
2826
elsif( $input_filter eq 'latin1')
2827
{ $t->{twig_input_filter}= latin1(); }
2828
elsif( $filter{$input_filter})
2829
{ $t->{twig_input_filter}= $filter{$input_filter}; }
2831
{ croak "invalid input filter: $input_filter"; }
2836
sub set_empty_tag_style
2837
{ return XML::Twig::Elt::set_empty_tag_style( @_); }
2839
sub set_pretty_print
2840
{ return XML::Twig::Elt::set_pretty_print( @_); }
2843
{ return XML::Twig::Elt::set_quote( @_); }
2846
{ return XML::Twig::Elt::set_indent( @_); }
2848
sub set_keep_atts_order
2849
{ shift; return XML::Twig::Elt::set_keep_atts_order( @_); }
2852
{ return XML::Twig::Elt::keep_atts_order( @_); }
2854
sub set_do_not_escape_amp_in_atts
2855
{ return XML::Twig::Elt::set_do_not_escape_amp_in_atts( @_); }
2857
# save and restore package globals (the ones in XML::Twig::Elt)
2858
sub save_global_state
2860
$t->{twig_saved_state}= XML::Twig::Elt::global_state();
2863
sub restore_global_state
2865
XML::Twig::Elt::set_global_state( $t->{twig_saved_state});
2869
{ return XML::Twig::Elt::global_state(); }
2871
sub set_global_state
2872
{ return XML::Twig::Elt::set_global_state( $_[1]); }
2881
if( $t->{twig_root} && isa( $t->{twig_root}, 'XML::Twig'))
2882
{ $t->{twig_root}->delete }
2884
# added to break circular references
2886
undef $t->{twig_root}->{twig} if( $t->{twig_root});
2887
undef $t->{twig_parser};
2889
$t={}; # prevents memory leaks (especially when using mod_perl)
2895
# non standard handlers
2898
# kludge: expat 1.95.2 calls both Default AND Doctype handlers
2899
# so if the default handler finds '<!DOCTYPE' then it must
2900
# unset itself (_twig_print_doctype will reset it)
2901
sub _twig_print_check_doctype
2904
my $string= $p->recognized_string();
2905
if( $string eq '<!DOCTYPE')
2906
{ $p->setHandlers( Default => undef);
2907
$p->{twig}->{expat_1_95_2}=1;
2915
{ print $_[0]->recognized_string(); }
2917
# recognized_string does not seem to work for entities, go figure!
2918
# so this handler is not used
2919
sub _twig_print_entity
2922
# kludge: expat 1.95.2 calls both Default AND Doctype handlers
2923
# so if the default handler finds '<!DOCTYPE' then it must
2924
# unset itself (_twig_print_doctype will reset it)
2925
sub _twig_print_original_check_doctype
2928
my $string= $p->original_string();
2929
if( $string eq '<!DOCTYPE')
2930
{ $p->setHandlers( Default => undef);
2931
$p->{twig}->{expat_1_95_2}=1;
2938
sub _twig_print_original
2939
{ print $_[0]->original_string(); }
2942
sub _twig_print_original_doctype
2944
my( $p, $name, $sysid, $pubid, $internal)= @_;
2946
{ # with recent versions of XML::Parser original_string does not work,
2947
# hence we need to rebuild the doctype declaration
2949
$doctype .= qq{<!DOCTYPE $name} if( $name);
2950
$doctype .= qq{ PUBLIC "$pubid"} if( $pubid);
2951
$doctype .= qq{ SYSTEM} if( $sysid && !$pubid);
2952
$doctype .= qq{ "$sysid"} if( $sysid);
2953
$doctype .= qq{>} unless( $p->{twig}->{expat_1_95_2});
2956
$p->setHandlers( Default => \&_twig_print_original);
2959
sub _twig_print_doctype
2961
my( $p, $name, $sysid, $pubid, $internal)= @_;
2963
{ # with recent versions of XML::Parser original_string does not work,
2964
# hence we need to rebuild the doctype declaration
2966
$doctype .= qq{<!DOCTYPE $name} if( $name);
2967
$doctype .= qq{ PUBLIC "$pubid"} if( $pubid);
2968
$doctype .= qq{ SYSTEM} if( $sysid && !$pubid);
2969
$doctype .= qq{ "$sysid"} if( $sysid);
2970
$doctype .= qq{>} unless( $p->{twig}->{expat_1_95_2});
2973
$p->setHandlers( Default => \&_twig_print_original);
2977
sub _twig_print_original_default
2980
print $p->original_string();
2983
# account for the case where the element is empty
2984
sub _twig_print_end_original
2986
print $p->original_string();
2989
sub _twig_start_check_roots
2991
my( $p, $gi, %att)= @_;
2995
# $tag will always be true if it needs to be printed (the tag string is never empty)
2996
my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
2997
: $p->recognized_string
2999
my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
3001
if( _handler( $t, $t->{twig_roots}, $gi, \%att))
3002
{ $p->setHandlers( %twig_handlers); # restore regular handlers
3003
$t->{twig_root_depth}= $p->depth;
3004
_twig_start( $p, $gi, %att);
3006
elsif( $p->depth == 0)
3008
print {$fh} $tag if( $tag);
3009
_twig_start( $p, $gi, %att);
3011
elsif( $t->{twig_starttag_handlers})
3012
{ # look for start tag handlers
3013
my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi, \%att);
3014
my $last_handler_res;
3015
foreach my $handler ( @handlers)
3016
{ $last_handler_res= $handler->($t, $gi, %att);
3017
last unless $last_handler_res;
3020
print {$fh} $tag if( $tag && (!@handlers || $last_handler_res));
3024
print {$fh} $tag if( $tag);
3028
sub _twig_end_check_roots
3030
my( $p, $gi, %att)= @_;
3032
# $tag can be empty (<elt/>), hence the undef and the tests for defined
3033
my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
3034
: $p->recognized_string
3036
my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
3038
if( $t->{twig_endtag_handlers})
3039
{ # look for start tag handlers
3040
my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi, {});
3041
my $last_handler_res=1;
3042
foreach my $handler ( @handlers)
3043
{ $last_handler_res= $handler->($t, $gi) || last; }
3044
return unless $last_handler_res;
3048
print {$fh} $tag if( defined( $tag));
3051
{ _twig_end( $p, $gi); }
3054
sub _twig_pi_check_roots
3055
{ my( $p, $target, $data)= @_;
3057
my $pi= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
3058
: $p->recognized_string
3060
my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
3062
if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target}
3063
|| $t->{twig_handlers}->{pi_handlers}->{''}
3065
{ # if handler is called on pi, then it needs to be processed as a regular node
3066
my @flags= qw( twig_process_pi twig_keep_pi);
3067
my @save= @{$t}{@flags}; # save pi related flags
3068
@{$t}{@flags}= (1, 0); # override them, pi needs to be processed
3069
_twig_pi( @_); # call handler on the pi
3070
@{$t}{@flags}= @save;; # restore flag
3074
print {$fh} $pi if( defined( $pi));
3079
sub _twig_ignore_start
3083
return unless( $gi eq $t->{twig_ignore_gi});
3084
$t->{twig_ignore_level}++;
3085
my $action= $t->{twig_ignore_action};
3086
if( $action eq 'print' )
3087
{ _twig_print_original( @_); }
3088
# elsif( $action eq 'string' )
3089
# { $t->{twig_buffered_string} .= $p->original_string(); }
3092
sub _twig_ignore_end
3097
my $action= $t->{twig_ignore_action};
3099
if( $action eq 'print')
3100
{ _twig_print_original( $p, $gi); }
3101
# elsif( $action eq 'string')
3102
# { $t->{twig_buffered_string} .= $p->original_string(); }
3104
return unless( $gi eq $t->{twig_ignore_gi});
3106
$t->{twig_ignore_level}--;
3108
unless( $t->{twig_ignore_level})
3109
{ $t->{twig_ignore_elt}->delete;
3110
$p->setHandlers( @{$t->{twig_saved_handlers}});
3112
if( $t->{twig_endtag_handlers})
3113
{ # look for end tag handlers
3114
my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi, {});
3115
my $last_handler_res=1;
3116
foreach my $handler ( @handlers)
3117
{ $last_handler_res= $handler->($t, $gi) || last; }
3126
# get the element (default: current elt)
3127
if( $_[0] && isa( $_[0], 'XML::Twig::Elt'))
3130
{ $elt = $t->{twig_current}; }
3132
my $action= shift || 1;
3133
$t->{twig_ignore_action}= $action;
3135
$t->{twig_ignore_elt}= $elt; # save it
3136
$t->{twig_ignore_gi}= $elt->gi; # save its gi
3137
$t->{twig_ignore_level}++;
3138
my $p= $t->{twig_parser};
3139
my @saved_handlers= $p->setHandlers( %twig_handlers_ignore); # set handlers
3140
if( $action eq 'print')
3141
{ $p->setHandlers( Default => \&_twig_print_original); }
3142
# elsif( $action eq 'string')
3143
# { # not used at the moment
3144
# $t->{twig_buffered_string}='';
3145
# $p->setHandlers( Default => \&twig_buffer_original);
3148
$t->{twig_saved_handlers}= \@saved_handlers; # save current handlers
3151
# select $t->{twig_output_fh} and store the current selected fh
3152
sub _set_fh_to_twig_output_fh
3154
my $output_fh= $t->{twig_output_fh};
3155
if( $output_fh && !$t->{twig_output_fh_selected})
3156
{ # there is an output fh
3157
$t->{twig_selected_fh}= select(); # store the currently selected fh
3158
$t->{twig_output_fh_selected}=1;
3159
select $output_fh; # select the output fh for the twig
3163
# select the fh that was stored in $t->{twig_selected_fh}
3164
# (before $t->{twig_output_fh} was selected)
3165
sub _set_fh_to_selected_fh
3167
return unless( $t->{twig_output_fh});
3168
my $selected_fh= $t->{twig_selected_fh};
3169
$t->{twig_output_fh_selected}=0;
3170
select $selected_fh;
3176
{ return $_[0]->{twig_xmldecl}->{encoding} if( $_[0]->{twig_xmldecl}); }
3179
{ my( $t, $encoding)= @_;
3180
$t->{twig_xmldecl} ||={};
3181
$t->set_xml_version( "1.0") unless( $t->xml_version);
3182
$t->{twig_xmldecl}->{encoding}= $encoding;
3187
{ return $_[0]->{output_encoding}; }
3189
sub set_output_encoding
3190
{ my( $t, $encoding)= @_;
3191
$t->set_output_filter( _encoding_filter( $encoding)) if( $encoding);
3192
return $t->{output_encoding}= $encoding;
3196
{ return $_[0]->{twig_xmldecl}->{version} if( $_[0]->{twig_xmldecl}); }
3199
{ my( $t, $version)= @_;
3200
$t->{twig_xmldecl} ||={};
3201
return $t->{twig_xmldecl}->{version}= $version;
3205
{ return $_[0]->{twig_xmldecl}->{standalone} if( $_[0]->{twig_xmldecl}); }
3208
{ my( $t, $standalone)= @_;
3209
$t->{twig_xmldecl} ||={};
3210
$t->set_xml_version( "1.0") unless( $t->xml_version);
3211
return $t->{twig_xmldecl}->{standalone}= $standalone;
3218
{ croak "cannot use toSAX1 while parsing (use flush_toSAX1)" if (defined $_[0]->{twig_parser});
3219
shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1,
3220
\&XML::Twig::Elt::_end_tag_data_SAX1
3224
{ croak "cannot use toSAX2 while parsing (use flush_toSAX2)" if (defined $_[0]->{twig_parser});
3225
shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2,
3226
\&XML::Twig::Elt::_end_tag_data_SAX2
3231
{ my( $t, $handler, $start_tag_data, $end_tag_data) = @_;
3233
if( my $start_document = $handler->can( 'start_document'))
3234
{ $start_document->( $handler); }
3236
$t->_prolog_toSAX( $handler);
3238
$t->root->_toSAX( $handler, $start_tag_data, $end_tag_data) if( $t->root);
3239
if( my $end_document = $handler->can( 'end_document'))
3240
{ $end_document->( $handler); }
3245
{ shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1,
3246
\&XML::Twig::Elt::_end_tag_data_SAX1
3250
{ shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2,
3251
\&XML::Twig::Elt::_end_tag_data_SAX2
3255
{ my( $t, $handler, $start_tag_data, $end_tag_data, $up_to)= @_;
3257
# the "real" last element processed, as _twig_end has closed it
3260
{ $last_elt= $up_to; }
3261
elsif( $t->{twig_current})
3262
{ $last_elt= $t->{twig_current}->_last_child; }
3264
{ $last_elt= $t->{twig_root}; }
3266
my $elt= $t->{twig_root};
3267
unless( $elt->_flushed)
3268
{ # init unless already done (ie root has been flushed)
3269
if( my $start_document = $handler->can( 'start_document'))
3270
{ $start_document->( $handler); }
3272
$t->_prolog_toSAX( $handler)
3277
if( $last_elt && $last_elt->in( $elt))
3279
unless( $elt->_flushed)
3280
{ # just output the front tag
3281
if( my $start_element = $handler->can( 'start_element'))
3282
{ if( my $tag_data= $start_tag_data->( $elt))
3283
{ $start_element->( $handler, $tag_data); }
3287
$next_elt= $elt->_first_child;
3290
{ # an element before the last one or the last one,
3291
$next_elt= $elt->_next_sibling;
3292
$elt->_toSAX( $handler, $start_tag_data, $end_tag_data);
3294
last if( $last_elt && ($elt == $last_elt));
3298
if( !$t->{twig_parsing})
3299
{ if( my $end_document = $handler->can( 'end_document'))
3300
{ $end_document->( $handler); }
3306
{ my( $t, $handler)= @_;
3307
$t->_xmldecl_toSAX( $handler);
3308
$t->_DTD_toSAX( $handler);
3312
{ my( $t, $handler)= @_;
3313
my $decl= $t->{twig_xmldecl};
3314
my $data= { Version => $decl->{version},
3315
Encoding => $decl->{encoding},
3316
Standalone => $decl->{standalone},
3318
if( my $xml_decl= $handler->can( 'xml_decl'))
3319
{ $xml_decl->( $handler, $data); }
3323
{ my( $t, $handler)= @_;
3324
my $doctype= $t->{twig_doctype};
3325
return unless( $doctype);
3326
my $data= { Name => $doctype->{name},
3327
PublicId => $doctype->{pub},
3328
SystemId => $doctype->{sysid},
3331
if( my $start_dtd= $handler->can( 'start_dtd'))
3332
{ $start_dtd->( $handler, $data); }
3334
# I should call code to export the internal subset here
3336
if( my $end_dtd= $handler->can( 'end_dtd'))
3337
{ $end_dtd->( $handler); }
3340
# input/output filters
3343
{ local $SIG{__DIE__};
3344
if( eval 'require Encode')
3346
return encode_convert( 'ISO-8859-15');
3348
elsif( eval 'require Text::Iconv;')
3350
return iconv_convert( 'ISO-8859-15');
3352
elsif( eval 'require Unicode::Map8 && require Unicode::String;')
3354
return unicode_convert( 'ISO-8859-15');
3357
{ return \®exp2latin1; }
3360
sub _encoding_filter
3362
{ local $SIG{__DIE__};
3363
my $encoding= $_[1] || $_[0];
3364
if( eval 'require Encode')
3366
my $sub= encode_convert( $encoding);
3369
elsif( eval 'require Text::Iconv;')
3370
{ return iconv_convert( $encoding); }
3371
elsif( eval 'require Unicode::Map8 && require Unicode::String;')
3372
{ return unicode_convert( $encoding); }
3374
croak "Encode, Text::Iconv or Unicode::Map8 and Unicode::String need to be installed ",
3375
"in order to use encoding options";
3378
# shamelessly lifted from XML::TyePYX (works only with XML::Parse 2.27)
3381
$text=~s{([\xc0-\xc3])(.)}{ my $hi = ord($1);
3383
chr((($hi & 0x03) <<6) | ($lo & 0x3F))
3390
{ require HTML::Entities;
3391
return HTML::Entities::encode_entities($_[0] );
3397
{ $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)}
3398
{_XmlUtf8Decode($1)}egs;
3401
{ $str= encode( ascii => $str, $FB_HTMLCREF); }
3408
{ $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)}
3409
{_XmlUtf8Decode($1, 1)}egs;
3412
{ $str= encode( ascii => $str, $FB_XMLCREF); }
3416
# this one shamelessly lifted from XML::DOM
3417
# does NOT work on 5.8.0
3419
{ my ($str, $hex) = @_;
3420
my $len = length ($str);
3424
{ my @n = unpack "C2", $str;
3425
$n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f);
3428
{ my @n = unpack "C3", $str;
3429
$n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + ($n[2] & 0x3f);
3432
{ my @n = unpack "C4", $str;
3433
$n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12)
3434
+ (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f);
3436
elsif ($len == 1) # just to be complete...
3437
{ $n = ord ($str); }
3439
{ croak "bad value [$str] for _XmlUtf8Decode"; }
3441
my $char= $hex ? sprintf ("&#x%x;", $n) : "&#$n;";
3447
{ my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
3448
require Unicode::Map8;
3449
require Unicode::String;
3450
import Unicode::String qw(utf8);
3453
BEGIN { $cnv= Unicode::Map8->new($enc)
3454
or croak "Can't create converter to $enc";
3456
sub { return $cnv->to8 (utf8($_[0])->ucs2); }
3459
unless( $sub) { croak $@; }
3464
{ my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
3465
require Text::Iconv;
3468
BEGIN { $cnv = Text::Iconv->new( 'utf8', $enc)
3469
or croak "Can't create iconv converter to $enc";
3471
sub { return $cnv->convert( $_[0]); }
3475
{ if( $@=~ m{^Unsupported conversion: Invalid argument})
3476
{ croak "Unsupported encoding: $enc"; }
3485
{ my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
3486
my $sub= eval qq{sub { return encode( "$enc", \$_[0]); } };
3487
croak "can't create Encode-based filter: $@" unless( $sub);
3492
# XML::XPath compatibility
3493
sub getRootNode { return $_[0]; }
3494
sub getParentNode { return undef; }
3495
sub getChildNodes { my @children= ($_[0]->root); return wantarray ? @children : \@children; }
3500
######################################################################
3501
package XML::Twig::Entity_list;
3502
######################################################################
3503
use UNIVERSAL qw( isa ) ;
3506
{ my $class = shift;
3509
bless $self, $class;
3516
my $ent= XML::Twig::Entity->new( @_);
3521
{ my( $list, $ent)= @_;
3522
$list->{$ent->{name}}= $ent;
3525
# can be called with an entity or with an entity name
3528
if( isa( ref $_[0], 'XML::Twig::Entity'))
3529
{ # the second arg is an entity
3531
delete $list->{$ent->{name}};
3534
{ # the second arg was not entity, must be a string then
3536
delete $list->{$name};
3541
{ my ($ent_list, $fh)= @_;
3542
my $old_select= defined $fh ? select $fh : undef;
3544
foreach my $ent_name ( sort keys %{$ent_list})
3545
{ my $ent= $ent_list->{$ent_name};
3546
# we have to test what the entity is or un-defined entities can creep in
3547
$ent->print() if( isa( $ent, 'XML::Twig::Entity'));
3549
select $old_select if( defined $old_select);
3553
{ my ($ent_list)= @_;
3554
return join "\n", map { $ent_list->{$_}->text} sort keys %{$ent_list};
3558
{ my ($ent_list)= @_;
3559
return map { $ent_list->{$_} } sort keys %$ent_list;
3564
######################################################################
3565
package XML::Twig::Entity;
3566
######################################################################
3567
use UNIVERSAL qw( isa ) ;
3570
{ my( $ent, $name, $val, $sysid, $pubid, $ndata)= @_;
3574
$self->{name}= $name;
3576
{ $self->{val}= $val; }
3578
{ $self->{sysid}= $sysid;
3579
$self->{pubid}= $pubid;
3580
$self->{ndata}= $ndata;
3586
sub name { return $_[0]->{name}; }
3587
sub val { return $_[0]->{val}; }
3588
sub sysid { return $_[0]->{sysid}; }
3589
sub pubid { return $_[0]->{pubid}; }
3590
sub ndata { return $_[0]->{ndata}; }
3593
{ my ($ent, $fh)= @_;
3594
if( $fh) { print $fh $ent->text . "\n"; }
3595
else { print $ent->text . "\n"; }
3601
if( exists $ent->{'val'})
3602
{ if( $ent->{'val'}=~ /"/)
3603
{ return "<!ENTITY $ent->{'name'} '$ent->{'val'}'>"; }
3604
return "<!ENTITY $ent->{'name'} \"$ent->{'val'}\">";
3606
elsif( $ent->{'sysid'})
3607
{ my $text= "<!ENTITY $ent->{'name'} ";
3608
$text .= "SYSTEM \"$ent->{'sysid'}\" " if( $ent->{'sysid'});
3609
$text .= "PUBLIC \"$ent->{'pubid'}\" " if( $ent->{'pubid'});
3610
$text .= "NDATA $ent->{'ndata'}" if( $ent->{'ndata'});
3619
######################################################################
3620
package XML::Twig::Elt;
3621
######################################################################
3623
use UNIVERSAL qw( isa ) ;
3625
use constant PCDATA => '#PCDATA';
3626
use constant CDATA => '#CDATA';
3627
use constant PI => '#PI';
3628
use constant COMMENT => '#COMMENT';
3629
use constant ENT => '#ENT';
3631
use constant ASIS => '#ASIS'; # pcdata elements not to be XML-escaped
3633
use constant ELT => '#ELT';
3634
use constant TEXT => '#TEXT';
3635
use constant EMPTY => '#EMPTY';
3637
use constant CDATA_START => "<![CDATA[";
3638
use constant CDATA_END => "]]>";
3639
use constant PI_START => "<?";
3640
use constant PI_END => "?>";
3641
use constant COMMENT_START => "<!--";
3642
use constant COMMENT_END => "-->";
3644
use constant XMLNS_URI => 'http://www.w3.org/2000/xmlns/';
3645
my $XMLNS_URI = XMLNS_URI;
3649
{ # set some aliases for methods
3653
*set_name = *set_gi;
3654
*find_nodes = *get_xpath; # as in XML::DOM
3655
*findnodes = *get_xpath; # as in XML::LibXML
3656
*field = *first_child_text;
3657
*trimmed_field = *first_child_trimmed_text;
3658
*is_field = *contains_only_text;
3661
*has_child = *first_child;
3662
*has_children = *first_child;
3663
*all_children_pass = *all_children_are;
3664
*all_children_match= *all_children_are;
3665
*getElementsByTagName= *descendants;
3666
*find_by_tag_name= *descendants_or_self;
3668
*first_child_is = *first_child_matches;
3669
*last_child_is = *last_child_matches;
3670
*next_sibling_is = *next_sibling_matches;
3671
*prev_sibling_is = *prev_sibling_matches;
3672
*next_elt_is = *next_elt_matches;
3673
*prev_elt_is = *prev_elt_matches;
3674
*parent_is = *parent_matches;
3675
*child_is = *child_matches;
3676
*inherited_att = *inherit_att;
3678
*sort_children_by_value= *sort_children_on_value;
3682
# imports from XML::Twig
3683
*_is_fh= *XML::Twig::_is_fh;
3685
# XML::XPath compatibility
3686
*string_value = *text;
3687
*toString = *sprint;
3689
*getRootNode = *twig;
3690
*getNextSibling = *_next_sibling;
3691
*getPreviousSibling = *_prev_sibling;
3692
*isElementNode = *is_elt;
3693
*isTextNode = *is_text;
3696
*isProcessingInstructionNode= *is_pi;
3697
*isComment = *is_comment;
3698
*isCommentNode = *is_comment;
3699
*getTarget = *target;
3701
# try using weak references
3702
# test whether we can use weak references
3703
{ local $SIG{__DIE__};
3704
if( eval 'require Scalar::Util' && defined( &Scalar::Util::weaken) )
3705
{ import Scalar::Util qw(weaken); }
3706
elsif( eval 'require WeakRef')
3712
# can be called as XML::Twig::Elt->new( [[$gi, $atts, [@content]])
3713
# - gi is an optional gi given to the element
3714
# - $atts is a hashref to attributes for the element
3715
# - @content is an optional list of text and elements that will
3716
# be inserted under the element
3719
$class= ref $class || $class;
3721
bless ($elt, $class);
3723
return $elt unless @_;
3725
# if a gi is passed then use it
3730
my $atts= ref $_[0] eq 'HASH' ? shift : undef;
3733
{ $elt->set_pcdata( shift); }
3735
{ $elt->set_ent( shift); }
3736
elsif( $gi eq CDATA)
3737
{ $elt->set_cdata( shift); }
3738
elsif( $gi eq COMMENT)
3739
{ $elt->set_comment( shift); }
3741
{ $elt->set_pi( shift, shift); }
3743
{ # the rest of the arguments are the content of the element
3745
{ $elt->set_content( @_); }
3747
{ $elt->set_empty( 1); }
3751
{ # the attribute hash can be used to pass the asis status
3752
if( defined $atts->{'#ASIS'}) { $elt->set_asis( $atts->{'#ASIS'} ); delete $atts->{'#ASIS'}; }
3753
if( defined $atts->{'#EMPTY'}) { $elt->set_empty( $atts->{'#EMPTY'}); delete $atts->{'#EMPTY'}; }
3754
$elt->set_atts( $atts) if( keys %$atts);
3760
# this function creates an XM:::Twig::Elt from a string
3761
# it is quite clumsy at the moment, as it just creates a
3762
# new twig then returns its root
3763
# there might also be memory leaks there
3764
# additional arguments are passed to new XML::Twig
3769
my $t= XML::Twig->new(%args);
3770
$t->parse( $string);
3773
delete $elt->{twig}; # get rid of the twig data
3774
delete $elt->{twig_current}; # better get rid of this too
3780
{ my ($elt, $gi)= @_;
3781
unless( defined $XML::Twig::gi2index{$gi})
3782
{ # new gi, create entries in %gi2index and @index2gi
3783
push @XML::Twig::index2gi, $gi;
3784
$XML::Twig::gi2index{$gi}= $#XML::Twig::index2gi;
3786
$elt->{gi}= $XML::Twig::gi2index{$gi};
3790
sub gi { return $XML::Twig::index2gi[$_[0]->{gi}]; }
3794
return _local_name( $elt->gi);
3799
return _ns_prefix( $elt->gi);
3802
# namespace prefix for any qname (can be used for elements or attributes)
3805
if( $qname=~ m{^([^:]*):})
3808
{ return( ''); } # should it be '' ?
3811
# local name for any qname (can be used for elements or attributes)
3814
(my $local= $qname)=~ s{^[^:]*:}{};
3819
{ my %DEFAULT_NS= ( xml => "http://www.w3.org/XML/1998/namespace",
3820
xmlns => "http://www.w3.org/2000/xmlns/",
3826
my $prefix= defined $_[0] ? shift() : $elt->ns_prefix;
3827
my $ns_att= $prefix ? "xmlns:$prefix" : "xmlns";
3828
my $expanded= $DEFAULT_NS{$prefix} || $elt->inherit_att( $ns_att) || '';
3834
# return #ELT for an element and #PCDATA... for others
3836
{ my $gi_nb= $_[0]->{gi}; # the number, not the string
3837
return ELT if( $gi_nb >= $XML::Twig::SPECIAL_GI);
3841
# return the gi if it's a "real" element, 0 otherwise
3843
{ return $_[0]->gi if( $_[0]->{gi} >= $XML::Twig::SPECIAL_GI);
3850
return (exists $elt->{'pcdata'});
3855
return (exists $elt->{'cdata'});
3860
return (exists $elt->{'target'});
3865
return (exists $elt->{'comment'});
3870
return (exists $elt->{ent} || $elt->{ent_name});
3876
return (exists( $elt->{'pcdata'}) || (exists $elt->{'cdata'}));
3880
{ return $_[0]->{empty} || 0; }
3883
{ $_[0]->{empty}= defined( $_[1]) ? $_[1] : 1; return $_[0]; }
3886
{ delete $_[0]->{empty} if( $_[0]->is_empty); return $_[0]; }
3892
foreach my $descendant ($elt, $elt->_descendants )
3893
{ $descendant->{asis}= 1;
3894
if( $descendant->is_cdata)
3895
{ $descendant->set_gi( PCDATA);
3896
$descendant->set_pcdata( $descendant->cdata);
3905
foreach my $descendant ($elt, $elt->descendants)
3906
{ delete $descendant->{asis} if $descendant->{asis};}
3911
{ return $_[0]->{asis}; }
3915
my $t= $elt->twig || return;
3916
my $curr_elt= $t->{twig_current};
3917
return unless( $curr_elt);
3918
return $curr_elt->in( $elt);
3922
{ $_[0]->set_not_empty;
3923
$_[0]->{'pcdata'}= $_[1];
3924
if( $_[0]->{extra_data_in_pcdata})
3925
{ undef $_[0]->{extra_data_in_pcdata}; # TODO: be smarter!
3931
{ $_[0]->set_not_empty;
3932
$_[0]->{'pcdata'}.= $_[1];
3936
sub pcdata { return $_[0]->{pcdata}; }
3939
sub append_extra_data
3940
{ $_[0]->{extra_data}.= $_[1];
3945
{ $_[0]->{extra_data}= $_[1];
3948
sub extra_data { return $_[0]->{extra_data}; }
3951
{ $_[0]->{'target'}= $_[1];
3955
sub target { return $_[0]->{target}; }
3958
{ $_[0]->{'data'}= $_[1];
3961
sub data { return $_[0]->{data}; }
3964
{ $_[0]->set_target( $_[1]);
3965
$_[0]->set_data( $_[2]);
3969
sub pi_string { return PI_START . $_[0]->target . " " . $_[0]->data . PI_END; }
3971
sub set_comment { $_[0]->{comment}= $_[1]; return $_[0]; }
3972
sub comment { return $_[0]->{comment}; }
3973
sub comment_string { return COMMENT_START . $_[0]->comment . COMMENT_END; }
3975
sub set_ent { $_[0]->{ent}= $_[1]; return $_[0]; }
3976
sub ent { return $_[0]->{ent}; }
3977
sub ent_name { return substr( $_[0]->ent, 1, -1);}
3980
{ $_[0]->set_not_empty;
3981
$_[0]->{cdata}= $_[1];
3986
{ $_[0]->{cdata}.= $_[1];
3989
sub cdata { return $_[0]->{cdata}; }
3991
#start-extract twig_node
3992
sub contains_only_text
3994
return 0 unless $elt->is_elt;
3995
foreach my $child ($elt->children)
3996
{ return 0 if $child->is_elt; }
4001
{ my( $elt, $exp)= @_;
4002
my @children= $elt->children;
4003
foreach my $child (@children)
4004
{ return 0 unless $child->is( $exp); }
4008
sub contains_a_single
4009
{ my( $elt, $exp)= @_;
4010
my $child= $elt->_first_child or return 0;
4011
return 0 unless $child->matches( $exp);
4012
return 0 if( $child->_next_sibling);
4020
while( $elt->_parent) { $elt= $elt->_parent; }
4023
#end-extract twig_node
4027
my $root= $elt->root;
4028
return $root->{twig};
4032
#start-extract twig_node
4034
# returns undef or the element, depending on whether $elt passes $cond
4036
# - empty: the element passes the condition
4037
# - ELT ('#ELT'): the element passes the condition if it is a "real" element
4038
# - TEXT ('#TEXT'): the element passes if it is a CDATA or PCDATA element
4039
# - a string with an XPath condition (only a subset of XPath is actually
4041
# - a regexp: the element passes if its gi matches the regexp
4042
# - a code ref: the element passes if the code, applied on the element,
4045
my %cond_cache; # expression => coderef
4047
sub reset_cond_cache { %cond_cache=(); }
4055
my $original_cond= $cond;
4057
my $not= ($cond=~ s{^\s*!}{}) ? '!' : '';
4059
if( ref $cond eq 'CODE') { return $cond; }
4061
if( ref $cond eq 'Regexp')
4062
{ $test = qq{(\$_[0]->gi=~ /$cond/)}; }
4064
{ # the condition is a string
4066
{ $test = qq{\$_[0]->is_elt}; }
4067
elsif( $cond eq TEXT)
4068
{ $test = qq{\$_[0]->is_text}; }
4069
elsif( $cond=~ m{^\s*($REG_NAME_W)\s*$}o)
4072
{ # 2 options, depending on whether the gi exists in gi2index
4073
# start optimization
4074
my $gi= $XML::Twig::gi2index{$1};
4076
{ # the gi exists, use its index as a faster shortcut
4077
$test = qq{ \$_[0]->{gi} eq "$XML::Twig::gi2index{$1}"};
4081
{ # it does not exist (but might be created later), compare the strings
4082
$test = qq{ \$_[0]->gi eq "$1"};
4086
{ $test = qq{ (1) } }
4088
elsif( $cond=~ m{^\s*($REG_REGEXP)\s*$}o)
4090
$test = qq{ \$_[0]->gi=~ $1 };
4092
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s*\[\s*(\!\s*)?\@($REG_NAME)\s*\]\s*$}o)
4094
my( $gi, $not, $att)= ($1, $2, $3);
4096
if( $gi && ($gi ne '*'))
4097
{ $test = qq{ (\$_[0]->gi eq "$gi")
4098
&& $not(defined \$_[0]->att( "$att"))
4102
{ $test = qq{ $not (defined \$_[0]->att( "$att"))}; }
4104
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4105
\[\s*(-?)\s*(\d+)\s*\] # [$2]
4108
{ my( $gi, $neg, $index)= ($1, $2, $3);
4109
my $siblings= $neg ? q{$_[0]->_next_siblings} : q{$_[0]->_prev_siblings};
4110
if( $gi && ($gi ne '*'))
4111
{ $test= qq{((\$_[0]->gi eq "$gi") && (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index))}; }
4113
{ $test= qq{(scalar( $siblings) + 1 == $index)}; }
4115
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4116
\[ \s* \@($REG_NAME) # [@$2
4117
\s*($REG_OP)\s* # = (or other op) $3
4118
($REG_VALUE) # "$4" or '$4'
4121
my( $gi, $att, $op, $string)= ($1, $2, _op( $3), $4);
4122
if( $gi && ($gi ne '*'))
4123
{ $test = qq{ (\$_[0]->gi eq "$gi")
4124
&& (defined \$_[0]->att( "$att"))
4125
&& ( \$_[0]->att( "$att") $op $string)
4129
{ $test = qq{ (defined \$_[0]->att( "$att"))
4130
&& ( \$_[0]->att( "$att") $op $string)
4134
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4135
\[ \s* \@($REG_NAME) # [@$2
4136
\s*($REG_OP)\s* # = (or other op) $3
4137
($REG_VALUE) # "$4" or '$4'
4138
\s*(and|or)\s* # and or or ($5)
4140
\s*($REG_OP)\s* # = (or other op) $7
4141
($REG_VALUE) # "$8" or '$8'
4144
my( $gi, $att1, $op1, $string1, $connector, $att2, $op2, $string2)= ($1, $2, _op( $3), $4, $5, $6, _op( $7), $8);
4145
if( $gi && ($gi ne '*'))
4146
{ $test = qq{ ( (\$_[0]->gi eq "$gi")
4147
and ( ( (defined \$_[0]->att( "$att1")) and( \$_[0]->att( "$att1") $op1 $string1) )
4149
( (defined \$_[0]->att( "$att2")) and( \$_[0]->att( "$att2") $op2 $string2) )
4155
{ $test = qq{ ( ( (defined \$_[0]->att( "$att1")) and ( \$_[0]->att( "$att1") $op1 $string1) )
4157
( (defined \$_[0]->att( "$att2")) and( \$_[0]->att( "$att2") $op2 $string2) )
4162
elsif( $cond=~ m{^\s*\.([\w-]+)\s*$}o)
4165
$test = qq{(\$_[0]->in_class( "$class")) };
4167
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4168
\[ \s* \@($REG_NAME) # [@$2
4169
\s*($REG_MATCH)\s* # =~ or !~ ($3)
4170
($REG_REGEXP) # /$4/
4172
{ # gi[@att=~ /regexp/] or gi[@att!~/regexp/]
4173
my( $gi, $att, $match, $regexp)= ($1, $2, $3, $4);
4174
if( $gi && ($gi ne '*'))
4175
{ $test = qq{ (\$_[0]->gi eq "$gi")
4176
&& ( defined \$_[0]->att( "$att"))
4177
&& ( \$_[0]->att( "$att") $match $regexp)
4181
{ # *[@att=~/regexp/ or *[@att!~/regexp/
4182
$test = qq{( defined \$_[0]->att( "$att"))
4183
&& ( \$_[0]->att( "$att") $match $regexp)
4187
elsif( $cond=~ m{^\s*\@($REG_NAME)\s*$}o)
4190
$test = qq{ (defined \$_[0]->att( "$att"))};
4192
elsif( $cond=~ m{^\s*
4194
\s*($REG_OP)\s* # = (or other op) $2
4195
($REG_VALUE) # "$3" or '$3'
4198
my( $att, $op, $string)= ( $1, _op( $2), $3);
4199
$test = qq{ (defined \$_[0]->att( "$att"))
4200
&& ( \$_[0]->att( "$att") $op $string)
4203
elsif( $cond=~ m{^\s*
4205
\s*($REG_OP)\s* # = (or other op) $2
4206
($REG_VALUE) # "$3" or '$3'
4207
\s*(and|or)\s* # and or or ($4)
4209
\s*($REG_OP)\s* # = (or other op) $6
4210
($REG_VALUE) # "$7" or '$7'
4213
my( $att1, $op1, $string1, $connector, $att2, $op2, $string2 )= ( $1, _op( $2), $3, $4, $5, _op( $6), $7);
4214
$test = qq{ ( ( (defined \$_[0]->att( "$att1"))
4215
&& ( \$_[0]->att( "$att1") $op1 $string1)
4218
( (defined \$_[0]->att( "$att2"))
4219
&& ( \$_[0]->att( "$att2") $op2 $string2)
4224
elsif( $cond=~ m{^\s*
4225
\@($REG_NAME) # [@$1
4226
\s*(=~|!~)\s* # =~ or !~ ($2)
4227
($REG_REGEXP) # /$3/
4229
{ # @att=~ /regexp/ or @att!~/regexp/
4230
my( $att, $match, $regexp)= ( $2, $3, $4);
4231
$test = qq{( defined \$_[0]->att( "$att"))
4232
&& ( \$_[0]->att( "$att") $match $regexp)
4235
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4236
\[\s*(?:text|string)(?:\(\s*\))? # [string()
4237
\s*($REG_OP)\s* # = or other op ($2)
4238
($REG_VALUE) # "$3" or '$3'
4240
{ # gi[string()= "val"]
4241
my ($gi, $op, $text)= ($1, _op( $2), $3);
4242
if( $gi && ($gi ne '*'))
4243
{ $test = qq{(\$_[0]->gi eq "$gi") && ( \$_[0]->text eq $text)}; }
4245
{ $test = qq{ \$_[0]->text eq $text }; }
4247
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4248
\[\s*(?:text|string)(?:\(\s*\))? # [string()
4249
\s*($REG_MATCH)\s* # =~ or !~ ($2)
4250
($REG_REGEXP) # /$3/
4252
{ # gi[string()=~ /regexp/]
4253
my( $gi, $match, $regexp)= ($1, $2, $3);
4254
if( $gi && ($gi ne '*'))
4255
{ $test = qq{(\$_[0]->gi eq "$gi") && ( \$_[0]->text $match $regexp) }; }
4257
{ $test = qq{ \$_[0]->text $match $regexp }; }
4259
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4260
\[\s*(?:text|string)\s*\(\s* # [string(
4261
($REG_NAME)\s*\) # $2)
4262
\s*($REG_OP)\s* # = or other op $3
4263
($REG_VALUE) # "$4" or '$4'
4265
{ # gi[string(gi2)= "text"]
4266
my ($gi, $gi2, $op, $text)= ($1, $2, _op($3), $4);
4267
$text=~ s/([{}])/\\$1/g;
4268
if( $gi && ($gi ne '*'))
4269
{ $test = qq{ (\$_[0]->gi eq "$gi")
4270
&& ( \$_[0]->first_child( qq{$gi2\[text() $op $text]}))
4274
{ $test = qq{ \$_[0]->first_child(qq{$gi2\[text() $op $text]}) } ; }
4276
elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1
4277
\[\s*(?:text|string)\(\s* # [string(
4278
($REG_NAME)\s*\) # $2)
4279
\s*(=~|!~)\s* # =~ or !~ ($3)
4280
($REG_REGEXP) # /$4/
4282
{ # gi[string(gi2)=~ /regexp/]
4283
my( $gi, $gi2, $match, $regexp)= ($1, $2, $3, $4);
4284
if( $gi && ($gi ne '*'))
4285
{ $test = qq{ (\$_[0]->gi eq "$gi")
4286
&& ( \$_[0]->field( "$gi2") $match $regexp)
4290
{ $test = qq{\$_[0]->field( "$gi2") $match $regexp}; }
4293
{ croak "wrong condition '$original_cond'"; }
4296
my $s= eval "sub { return \$_[0] if( $not($test)) }";
4298
{ croak "wrong navigation condition $original_cond ($@);" }
4304
if( $op eq '=') { $op= 'eq'; }
4305
elsif( $op eq '!=') { $op= 'ne'; }
4310
{ my( $elt, $cond)= @_;
4311
return $elt unless $cond;
4312
my $sub= ($cond_cache{$cond} ||= _install_cond( $cond));
4313
return $sub->( $elt);
4316
# end-extract twig_nodes
4319
{ $_[0]->{parent}= $_[1];
4320
weaken( $_[0]->{parent}) if( $XML::Twig::weakrefs);
4323
#start-extract twig_node
4326
my $cond= shift || return $elt->_parent;
4327
do { $elt= $elt->_parent || return; } until (!$elt || $elt->passes( $cond));
4330
#end-extract twig_node
4333
{ $_[0]->set_not_empty;
4334
$_[0]->{'first_child'}= $_[1];
4337
#start-extract twig_node
4340
my $cond= shift || return $elt->_first_child;
4341
my $child= $elt->_first_child;
4342
my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
4343
while( $child && !$test_cond->( $child))
4344
{ $child= $child->_next_sibling; }
4347
#end-extract twig_node
4349
sub _first_child { return $_[0]->{first_child}; }
4350
sub _last_child { return $_[0]->{last_child}; }
4351
sub _next_sibling { return $_[0]->{next_sibling}; }
4352
sub _prev_sibling { return $_[0]->{prev_sibling}; }
4353
sub _parent { return $_[0]->{parent}; }
4354
sub _next_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{next_sibling}) { push @siblings, $elt; } return @siblings; }
4355
sub _prev_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{prev_sibling}) { push @siblings, $elt; } return @siblings; }
4358
# arguments $record, $cond, @content
4360
{ my $record = shift;
4362
my $child= $record->first_child( $cond);
4364
{ $child->set_content( @_); }
4366
{ if( $cond=~ m{^\s*($REG_NAME)})
4368
$child= $record->insert_new_elt( last_child => $gi, @_);
4371
{ croak "can't create a field name from $cond"; }
4377
{ $_[0]->set_not_empty;
4378
$_[0]->{'last_child'}= $_[1];
4379
weaken( $_[0]->{'last_child'}) if( $XML::Twig::weakrefs);
4382
#start-extract twig_node
4385
my $cond= shift || return $elt->_last_child;
4386
my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
4387
my $child= $elt->_last_child;
4388
while( $child && !$test_cond->( $child) )
4389
{ $child= $child->_prev_sibling; }
4392
#end-extract twig_node
4395
sub set_prev_sibling
4396
{ $_[0]->{'prev_sibling'}= $_[1];
4397
weaken( $_[0]->{'prev_sibling'}) if( $XML::Twig::weakrefs);
4400
#start-extract twig_node
4403
my $cond= shift || return $elt->_prev_sibling;
4404
my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
4405
my $sibling= $elt->_prev_sibling;
4406
while( $sibling && !$test_cond->( $sibling) )
4407
{ $sibling= $sibling->_prev_sibling; }
4410
#end-extract twig_node
4412
sub set_next_sibling { $_[0]->{'next_sibling'}= $_[1]; }
4414
#start-extract twig_node
4417
my $cond= shift || return $elt->_next_sibling;
4418
my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
4419
my $sibling= $elt->_next_sibling;
4420
while( $sibling && !$test_cond->( $sibling) )
4421
{ $sibling= $sibling->_next_sibling; }
4425
# methods dealing with the class attribute, convenient if you work with xhtml
4426
sub class { my( $elt)= @_; return $elt->att( 'class'); }
4427
sub set_class { my( $elt, $class)= @_; $elt->set_att( class => $class); }
4429
# adds a class to an element
4431
{ my( $elt, $new_class)= @_;
4432
return $elt unless $new_class;
4433
my $class= $elt->class;
4434
my %class= $class ? map { $_ => 1 } split /\s+/, $class : ();
4435
$class{$new_class}= 1;
4436
$elt->set_class( join( ' ', sort keys %class));
4439
sub att_to_class { my( $elt, $att)= @_; $elt->set_class( $elt->att( $att)); }
4440
sub add_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->att( $att)); }
4441
sub move_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->att( $att));
4442
$elt->del_att( $att);
4444
sub tag_to_class { my( $elt)= @_; $elt->set_class( $elt->tag); }
4445
sub add_tag_to_class { my( $elt)= @_; $elt->add_to_class( $elt->tag); }
4446
sub set_tag_class { my( $elt, $new_tag)= @_; $elt->add_tag_to_class; $elt->set_tag( $new_tag); }
4449
{ my( $elt, $class)= @_;
4450
my $elt_class= $elt->class;
4451
return unless( defined $elt_class);
4452
return $elt->class=~ m{(?:^|\s)\Q$class\E(?:\s|$)} ? $elt : 0;
4455
#end-extract twig_node
4457
# get or set all attributes
4458
# argument can be a hash or a hasref
4462
tie %atts, 'Tie::IxHash' if( keep_atts_order());
4463
%atts= ( isa( $_[0] || '', 'HASH')) ? %{$_[0]} : @_;
4464
$elt->{'att'}= \%atts;
4468
sub atts { return $_[0]->{att}; }
4469
sub att_names { return sort keys %{$_[0]->{att}}; }
4470
sub del_atts { $_[0]->{att}={}; return $_[0]; }
4472
# get or set a single attribute (set works for several atts)
4476
unless( $elt->{att})
4478
tie %{$elt->{att}}, 'Tie::IxHash' if( keep_atts_order());
4481
while(@_) { my( $att, $val)= (shift, shift);
4482
$elt->{att}->{$att}= $val;
4487
sub att { return $_[0]->{att}->{$_[1]}; }
4490
while( @_) { delete $elt->{'att'}->{shift()}; }
4494
# delete an attribute from all descendants of an element
4496
{ my( $elt, $att)= @_;
4497
$_->del_att( $att) foreach ($elt->descendants_or_self( qq{*[\@$att]}));
4501
{ my( $elt, $old_name, $new_name)= @_;
4502
my $value= $elt->att( $old_name);
4503
return $elt unless( defined $value);
4504
$elt->del_att( $old_name)
4505
->set_att( $new_name => $value);
4509
sub set_twig_current { $_[0]->{twig_current}=1; }
4510
sub del_twig_current { delete $_[0]->{twig_current}; }
4513
# get or set the id attribute
4515
{ my( $elt, $id)= @_;
4516
$elt->set_att($ID, $id);
4517
my $t= $elt->twig || return;
4518
$elt->twig->{twig_id_list}->{$id}= $elt;
4519
weaken( $elt->twig->{twig_id_list}->{$id}) if( $XML::Twig::weakrefs);
4522
sub id { return $_[0]->{att}->{$ID}; }
4524
# methods used to add ids to elements that don't have one
4526
{ my $id_nb = "0001";
4527
my $id_seed = "twig_id_";
4530
{ $id_seed= $_[1]; $id_nb=1; }
4534
$elt->set_id( $id_seed . $id_nb++) unless( $elt->id);
4541
# delete the id attribute and remove the element from the id list
4544
unless( exists $elt->{'att'}) { return $elt };
4545
my $id= $elt->{'att'}->{$ID} or return $elt;
4547
delete $elt->{'att'}->{$ID};
4549
my $t= shift || $elt->twig;
4550
unless( $t) { return $elt; }
4551
if( exists $t->{twig_id_list}->{$id}) { delete $t->{twig_id_list}->{$id}; }
4556
# return the list of children
4557
#start-extract twig_node
4561
my $child= $elt->first_child( @_);
4563
{ push @children, $child;
4564
$child= $child->next_sibling( @_);
4572
my $child= $elt->_first_child();
4574
{ push @children, $child;
4575
$child= $child->_next_sibling;
4583
my $child= $elt->first_child( @_);
4585
{ push @children, $child->copy;
4586
$child= $child->next_sibling( @_);
4596
my $child= $elt->_first_child;
4598
{ $count++ if( $child->passes( $cond));
4599
$child= $child->_next_sibling;
4606
return map { $_->text} $elt->children( @_);
4609
sub all_children_are
4610
{ my( $parent, $cond)= @_;
4611
foreach my $child ($parent->children)
4612
{ return 0 unless( $child->passes( $cond)); }
4618
{ my( $elt, $cond)= @_;
4620
while( $elt->_parent)
4621
{ $elt= $elt->_parent;
4622
push @ancestors, $elt
4623
if( $elt->passes( $cond));
4628
sub ancestors_or_self
4629
{ my( $elt, $cond)= @_;
4632
{ push @ancestors, $elt
4633
if( $elt->passes( $cond));
4634
$elt= $elt->_parent;
4641
{ my( $elt, $include_self)= @_;
4642
my @ancestors= $include_self ? ($elt) : ();
4643
while( $elt= $elt->_parent)
4644
{ push @ancestors, $elt;
4653
my %tags= map { ($_, 1) } @_;
4656
{ if( (defined $elt->att( $att))
4657
&& ( !%tags || $tags{$elt->gi})
4659
{ return $elt->att( $att); }
4660
} while( $elt= $elt->_parent);
4665
sub current_ns_prefixes
4668
$prefix{''}=1 if( $elt->namespace( ''));
4670
{ my @ns= grep { !m{^xml} } map { m{^([^:]+):} } ($elt->gi, $elt->att_names);
4671
$prefix{$_}=1 foreach (@ns);
4672
$elt= $elt->_parent;
4675
return sort keys %prefix;
4678
# kinda counter-intuitive actually:
4679
# the next element is found by looking for the next open tag after from the
4680
# current one, which is the first child, if it exists, or the next sibling
4681
# or the first next sibling of an ancestor
4682
# optional arguments are:
4683
# - $subtree_root: a reference to an element, when the next element is not
4684
# within $subtree_root anymore then next_elt returns undef
4685
# - $cond: a condition, next_elt returns the next element matching the condition
4689
my $subtree_root= 0;
4690
$subtree_root= shift if( defined $_[0] and (isa( $_[0], 'XML::Twig::Elt')));
4694
my $ind; # optimization
4696
if( $cond) # optimization
4697
{ unless( defined( $ind= $XML::Twig::gi2index{$cond}) ) # optimization
4698
{ $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); } # optimization
4702
{ if( $next_elt= $elt->_first_child)
4703
{ # simplest case: the elt has a child
4705
elsif( $next_elt= $elt->_next_sibling)
4706
{ # no child but a next sibling (just check we stay within the subtree)
4708
# case where elt is subtree_root, is empty and has a sibling
4709
return undef if( $subtree_root && ($elt == $subtree_root));
4713
{ # case where the element has no child and no next sibling:
4714
# get the first next sibling of an ancestor, checking subtree_root
4716
# case where elt is subtree_root, is empty and has no sibling
4717
return undef if( $subtree_root && ($elt == $subtree_root));
4719
# backtrack until we find a parent with a next sibling
4720
$next_elt= $elt->_parent || return undef;
4721
until( $next_elt->_next_sibling)
4722
{ return undef if( $subtree_root && ($subtree_root == $next_elt));
4723
$next_elt= $next_elt->_parent || return undef;
4725
return undef if( $subtree_root && ($subtree_root == $next_elt));
4726
$next_elt= $next_elt->_next_sibling;
4728
$elt= $next_elt; # just in case we need to loop
4729
} until( ! defined $elt
4731
|| (defined $ind && ($elt->{gi} eq $ind)) # optimization
4732
|| (defined $test_cond && ($test_cond->( $elt)))
4738
# return the next_elt within the element
4739
# just call next_elt with the element as first and second argument
4740
sub first_descendant { return $_[0]->next_elt( @_); }
4742
# counter-intuitive too:
4743
# the previous element is found by looking
4744
# for the first open tag backwards from the current one
4745
# it's the last descendant of the previous sibling
4746
# if it exists, otherwise it's simply the parent
4753
{ if( $prev_elt= $elt->_prev_sibling)
4754
{ while( $prev_elt->_last_child)
4755
{ $prev_elt= $prev_elt->_last_child; }
4758
{ $prev_elt= $elt->_parent || return; }
4759
$elt= $prev_elt; # in case we need to loop
4760
} until( $elt->passes( $cond));
4769
foreach (1..$offset)
4770
{ $elt= $elt->next_elt( @_) || return undef; }
4774
# checks whether $elt is included in $ancestor, returns 1 in that case
4776
{ my ($elt, $ancestor)= @_;
4777
if( isa( $ancestor, 'XML::Twig::Elt'))
4779
while( $elt= $elt->_parent) { return $elt if( $elt == $ancestor); }
4783
while( $elt= $elt->_parent) { return $elt if( $elt->matches( $ancestor)); }
4788
sub first_child_text
4790
my $dest=$elt->first_child(@_) or return '';
4794
sub first_child_trimmed_text
4796
my $dest=$elt->first_child(@_) or return '';
4797
return $dest->trimmed_text;
4800
sub first_child_matches
4802
my $dest= $elt->_first_child or return undef;
4803
return $dest->passes( @_);
4808
my $dest=$elt->last_child(@_) or return '';
4812
sub last_child_trimmed_text
4814
my $dest=$elt->last_child(@_) or return '';
4815
return $dest->trimmed_text;
4818
sub last_child_matches
4820
my $dest= $elt->_last_child or return undef;
4821
return $dest->passes( @_);
4826
my $dest=$elt->child(@_) or return '';
4830
sub child_trimmed_text
4832
my $dest=$elt->child(@_) or return '';
4833
return $dest->trimmed_text;
4839
my $dest= $elt->child( $nb) or return undef;
4840
return $dest->passes( @_);
4843
sub prev_sibling_text
4845
my $dest=$elt->prev_sibling(@_) or return '';
4849
sub prev_sibling_trimmed_text
4851
my $dest=$elt->prev_sibling(@_) or return '';
4852
return $dest->trimmed_text;
4855
sub prev_sibling_matches
4857
my $dest= $elt->_prev_sibling or return undef;
4858
return $dest->passes( @_);
4861
sub next_sibling_text
4863
my $dest=$elt->next_sibling(@_) or return '';
4867
sub next_sibling_trimmed_text
4869
my $dest=$elt->next_sibling(@_) or return '';
4870
return $dest->trimmed_text;
4873
sub next_sibling_matches
4875
my $dest= $elt->_next_sibling or return undef;
4876
return $dest->passes( @_);
4881
my $dest=$elt->prev_elt(@_) or return '';
4885
sub prev_elt_trimmed_text
4887
my $dest=$elt->prev_elt(@_) or return '';
4888
return $dest->trimmed_text;
4891
sub prev_elt_matches
4893
my $dest= $elt->prev_elt or return undef;
4894
return $dest->passes( @_);
4899
my $dest=$elt->next_elt(@_) or return '';
4903
sub next_elt_trimmed_text
4905
my $dest=$elt->next_elt(@_) or return '';
4906
return $dest->trimmed_text;
4909
sub next_elt_matches
4911
my $dest= $elt->next_elt or return undef;
4912
return $dest->passes( @_);
4917
my $dest=$elt->parent(@_) or return '';
4921
sub parent_trimmed_text
4923
my $dest=$elt->parent(@_) or return '';
4924
return $dest->trimmed_text;
4929
my $dest= $elt->_parent or return undef;
4930
return $dest->passes( @_);
4935
my $parent= $elt->parent or return 0;
4936
my $first_child= $parent->first_child( @_) or return 0;
4937
return ($first_child == $elt) ? $elt : 0;
4942
my $parent= $elt->parent or return 0;
4943
my $last_child= $parent->last_child( @_) or return 0;
4944
return ($last_child == $elt) ? $elt : 0;
4947
# returns the depth level of the element
4948
# if 2 parameter are used then counts the 2cd element name in the
4951
{ my( $elt, $cond)= @_;
4954
my $name=shift || '';
4955
while( $elt= $elt->_parent) { $level++ if( !$cond || $elt->matches( $cond)); }
4959
# checks whether $elt has an ancestor that satisfies $cond, returns the ancestor
4961
{ my ($elt, $cond, $level)= @_;
4962
$level= -1 unless( $level) ; # $level-- will never hit 0
4965
{ $elt= $elt->_parent or return;
4966
if( $elt->matches( $cond)) { return $elt; }
4973
{ my( $subtree_root, $include_self)= @_;
4974
my @descendants= $include_self ? ($subtree_root) : ();
4976
my $elt= $subtree_root;
4980
{ if( $next_elt= $elt->_first_child)
4981
{ # simplest case: the elt has a child
4983
elsif( $next_elt= $elt->_next_sibling)
4984
{ # no child but a next sibling (just check we stay within the subtree)
4986
# case where elt is subtree_root, is empty and has a sibling
4987
last MAIN if( $elt == $subtree_root);
4990
{ # case where the element has no child and no next sibling:
4991
# get the first next sibling of an ancestor, checking subtree_root
4993
# case where elt is subtree_root, is empty and has no sibling
4994
last MAIN if( $elt == $subtree_root);
4996
# backtrack until we find a parent with a next sibling
4997
$next_elt= $elt->_parent || last;
4998
until( $next_elt->_next_sibling)
4999
{ last MAIN if( $subtree_root == $next_elt);
5000
$next_elt= $next_elt->_parent || last MAIN;
5002
last MAIN if( $subtree_root == $next_elt);
5003
$next_elt= $next_elt->_next_sibling;
5005
$elt= $next_elt || last MAIN;
5006
push @descendants, $elt;
5008
return @descendants;
5013
{ my( $subtree_root, $cond)= @_;
5015
my $elt= $subtree_root;
5017
# this branch is pure optimisation for speed: if $cond is a gi replace it
5018
# by the index of the gi and loop here
5019
# start optimization
5021
if( !$cond || ( defined ( $ind= $XML::Twig::gi2index{$cond})) )
5026
{ if( $next_elt= $elt->_first_child)
5027
{ # simplest case: the elt has a child
5029
elsif( $next_elt= $elt->_next_sibling)
5030
{ # no child but a next sibling (just check we stay within the subtree)
5032
# case where elt is subtree_root, is empty and has a sibling
5033
last if( $subtree_root && ($elt == $subtree_root));
5036
{ # case where the element has no child and no next sibling:
5037
# get the first next sibling of an ancestor, checking subtree_root
5039
# case where elt is subtree_root, is empty and has no sibling
5040
last if( $subtree_root && ($elt == $subtree_root));
5042
# backtrack until we find a parent with a next sibling
5043
$next_elt= $elt->_parent || last undef;
5044
until( $next_elt->_next_sibling)
5045
{ last if( $subtree_root && ($subtree_root == $next_elt));
5046
$next_elt= $next_elt->_parent || last;
5048
last if( $subtree_root && ($subtree_root == $next_elt));
5049
$next_elt= $next_elt->_next_sibling;
5051
$elt= $next_elt || last;
5052
push @descendants, $elt if( !$cond || ($elt->{gi} eq $ind));
5057
{ # branch for a complex condition: use the regular (slow but simple) way
5058
while( $elt= $elt->next_elt( $subtree_root, $cond))
5059
{ push @descendants, $elt; }
5061
return @descendants;
5065
sub descendants_or_self
5066
{ my( $elt, $cond)= @_;
5067
my @descendants= $elt->passes( $cond) ? ($elt) : ();
5068
push @descendants, $elt->descendants( $cond);
5069
return @descendants;
5077
{ $elt= $elt->next_sibling( @_) or return undef; }
5080
{ foreach( 1..(-$nb))
5081
{ $elt= $elt->prev_sibling( @_) or return undef; }
5084
{ return $elt->passes( $_[0]); }
5089
{ my $elt= sibling( @_);
5090
return $elt ? $elt->text : undef;
5098
{ $elt= $elt->first_child( @_) or return undef;
5100
{ $elt= $elt->next_sibling( @_) or return undef; }
5103
{ $elt= $elt->last_child( @_) or return undef;
5105
{ $elt= $elt->prev_sibling( @_) or return undef; }
5113
while( $elt= $elt->prev_sibling( @_))
5114
{ unshift @siblings, $elt; }
5120
return 0 if ($_[0] && !$elt->matches( @_));
5122
$pos++ while( $elt= $elt->prev_sibling( @_));
5130
while( $elt= $elt->next_sibling( @_))
5131
{ push @siblings, $elt; }
5135
# used by get_xpath: parses the xpath expression and generates a sub that performs the
5138
{ my( $xpath_exp, $type)= @_;
5139
my $original_exp= $xpath_exp;
5140
my $sub= 'my $elt= shift; my @results;';
5142
# grab the root if expression starts with a /
5143
if( $xpath_exp=~ s{^/}{})
5144
{ $sub .= '@results= ($elt->twig);'; }
5145
elsif( $xpath_exp=~ s{^\./}{})
5146
{ $sub .= '@results= ($elt);'; }
5148
{ $sub .= '@results= ($elt);'; }
5151
while( $xpath_exp &&
5152
$xpath_exp=~s{^\s*(/?)
5153
# the xxx=~/regexp/ is a pain as it includes /
5154
(\s*((\*|$REG_NAME|\.)\s*)?\[\s*(string\(\s*\)|\@$REG_NAME)\s*$REG_MATCH
5155
\s*$REG_REGEXP\s*\]\s*
5156
# or a regular condition, with no / excepts \/
5165
if( $sub_exp eq '..')
5166
{ croak "error in xpath expression $original_exp" if( $wildcard);
5167
$sub .= '@results= map { $_->parent} @results;';
5169
# test the element itself
5170
elsif( $sub_exp=~ m{^\.(.*)$}s)
5171
{ $sub .= "\@results= grep { \$_->matches( q{$1}) } \@results;" }
5173
elsif( $sub_exp=~ m{($REG_NAME_W)?\s* # * or a gi ($1)
5177
(?:string\(\s*\)|\@$REG_NAME) # regexp condition
5178
\s*$REG_MATCH\s*$REG_REGEXP\s* # or
5179
|[^\]]* # regular condition
5184
if( !$1 or $1 eq '*') { $gi=''; }
5186
if( $cond) { $cond=~ s{^\s*}{}; $cond=~ s{\s*$}{}; }
5189
# "special" conditions, that return just one element
5190
if( $cond && ($cond =~ m{^((-\s*)?\d+)$}) )
5192
$offset-- if( $offset > 0);
5193
$function= $wildcard ? "next_n_elt( $offset, '$gi')"
5194
: "child( $offset, '$gi')";
5195
$sub .= "\@results= map { \$_->$function } \@results;"
5197
elsif( $cond && ($cond =~ m{^last\s*\(\s*\)$}) )
5198
{ croak "error in xpath expression $original_exp, cant use // and last()"
5200
$sub .= "\@results= map { \$_->last_child( '$gi') } \@results;";
5203
{ # go down and get the children or descendants
5204
unless ( defined $gi)
5206
{ $sub .= '@results= map { $_->descendants } @results;' }
5208
{ $sub .= '@results= map { $_->children } @results;'; }
5212
{ $sub .= "\@results= map { \$_->descendants( '$gi') } \@results;"; }
5214
{ $sub .= "\@results= map { \$_->children( '$gi') } \@results;"; }
5216
# now filter using the condition
5222
{ $cond=~ s{^\s*$op\s*}{};
5226
if( $cond =~ s{^string\(\s*\)\s*=\s*($REG_STRING)\s*}{}o) # string()="string" cond
5227
{ $test .= "\$_->text eq $1";
5229
elsif( $cond =~ s{^string\(\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o) # string()=~/regex/ cond
5230
{ my( $match, $regexp)= ($1, $2);
5231
$test .= "\$_->text $match $regexp";
5233
elsif( $cond=~ s{^@($REG_NAME)\s*($REG_OP)\s*($REG_STRING|$REG_NUMBER)}{}o) # @att="val" cond
5234
{ my( $att, $oper, $val)= ($1, _op( $2), $3);
5235
$test .= qq{((defined \$_->att("$att")) && (\$_->att( "$att") $oper $val))};
5237
elsif( $cond =~ s{^@($REG_NAME)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o) # @att=~/regex/ cond XXX
5238
{ my( $att, $match, $regexp)= ($1, $2, $3);
5239
$test .= qq{((defined \$_->att("$att")) && (\$_->att("$att") $match $regexp))};;
5241
elsif( $cond=~ s{^@($REG_NAME)\s*}{}o) # @att cond
5243
$test .= qq{(defined \$_->att("$1"))};
5245
elsif( $cond=~ s{^\s*(\d+)\s*}{}o) # positive number condition
5247
$test .= qq{(\$_->prev_siblings( \$_->gi) == $1)};
5249
elsif( $cond=~ s{^\s*-\s*(\d+)\s*}{}o) # negative number condition
5251
$test .= qq{(\$_->next_siblings( \$_->gi) == $1)};
5253
} while( ($op)=($cond=~ m{^\s*(and|or)\s*}i));
5254
croak "error in xpath expression $original_exp at $cond" if( $cond);
5255
$sub .= "\@results= grep { $test } \@results;";
5262
{ croak "error in xpath expression $original_exp around $xpath_exp"; }
5264
$sub .= "return \@results; ";
5265
my $s= eval "sub { $sub }";
5266
if( $@) { croak "error in xpath expression $original_exp ($@);" }
5270
{ # extremely elaborate caching mechanism
5271
my %xpath; # xpath_expression => subroutine_code;
5273
{ my( $elt, $xpath_exp, $offset)= @_;
5274
my $sub= ($xpath{$xpath_exp} ||= _install_xpath( $xpath_exp));
5275
return $sub->( $elt) unless( defined $offset);
5276
my @res= $sub->( $elt);
5277
return $res[$offset];
5279
1; # so the module returns 1 as this is the last BEGIN block in the file
5285
return join '', map { $_->text } $elt->get_xpath( @_);
5288
#end-extract twig_node
5291
# XML::XPath compatibility
5292
sub getElementById { return $_[0]->twig->elt_id( $_[1]); }
5293
sub getChildNodes { my @children= $_[0]->children; return wantarray ? @children : \@children; }
5295
sub _flushed { return $_[0]->{flushed}; }
5296
sub _set_flushed { $_[0]->{flushed}=1; }
5297
sub _del_flushed { delete $_[0]->{flushed}; }
5302
my( $parent, $prev_sibling, $next_sibling, $last_elt);
5304
# you can't cut the root, sorry
5305
unless( $parent= $elt->_parent)
5307
# it we cut the current element then its parent becomes the current elt
5308
if( $elt->{twig_current})
5309
{ my $twig_current= $elt->_parent;
5311
$t->{twig_current}= $twig_current;
5312
$twig_current->set_twig_current;
5313
$elt->del_twig_current;
5316
if( $parent->_first_child == $elt)
5317
{ $parent->set_first_child( $elt->_next_sibling);
5318
$parent->set_empty unless( $elt->_next_sibling);
5320
$parent->set_last_child( $elt->_prev_sibling)
5321
if( $parent->_last_child == $elt);
5323
if( $prev_sibling= $elt->_prev_sibling)
5324
{ $prev_sibling->set_next_sibling( $elt->_next_sibling); }
5325
if( $next_sibling= $elt->_next_sibling)
5326
{ $next_sibling->set_prev_sibling( $elt->_prev_sibling); }
5329
$elt->set_parent( undef);
5330
$elt->set_prev_sibling( undef);
5331
$elt->set_next_sibling( undef);
5337
{ my( $elt, $exp)= @_;
5338
my @children= $elt->children( $exp);
5339
foreach (@children) { $_->cut; }
5345
#you cannot erase the current element
5346
if( $elt->{twig_current})
5347
{ croak "trying to erase an element before it has been completely parsed"; }
5348
unless( $elt->_parent)
5349
{ # trying to erase the root (of a twig or of a cut/new element)
5350
my @children= $elt->children;
5351
unless( @children == 1)
5352
{ croak "can only erase an element with no parent if it has a single child"; }
5353
$elt->_move_extra_data_to_children;
5354
my $child= shift @children;
5355
$child->set_parent( undef);
5356
my $twig= $elt->twig;
5357
$twig->set_root( $child);
5361
$elt->_move_extra_data_to_children;
5362
my @children= $elt->children;
5364
{ # elt has children, move them up
5365
if( $elt->_prev_sibling)
5366
{ # connect first child to previous sibling
5367
$elt->_first_child->set_prev_sibling( $elt->_prev_sibling);
5368
$elt->_prev_sibling->set_next_sibling( $elt->_first_child);
5371
{ # elt was the first child
5372
$elt->_parent->set_first_child( $elt->_first_child);
5374
if( $elt->_next_sibling)
5375
{ # connect last child to next sibling
5376
$elt->_last_child->set_next_sibling( $elt->_next_sibling);
5377
$elt->_next_sibling->set_prev_sibling( $elt->_last_child);
5380
{ # elt was the last child
5381
$elt->_parent->set_last_child( $elt->_last_child);
5383
# update parent for all siblings
5384
foreach my $child (@children)
5385
{ $child->set_parent( $elt->_parent); }
5386
# elt is not referenced any more, so it will be DESTROYed
5387
# so we'd better break the links to its children
5388
undef $elt->{'first_child'};
5389
undef $elt->{'last_child'};
5390
undef $elt->{'parent'};
5391
undef $elt->{'prev_sibling'};
5392
undef $elt->{'next_sibling'};
5394
{ # elt had no child, delete it
5403
sub _move_extra_data_to_children
5406
if( my $extra_data= $elt->{extra_data})
5407
{ if( my $first_child= $elt->first_child)
5408
{ $first_child->set_extra_data( $extra_data . ($first_child->extra_data || '')); }
5409
elsif( my $parent= $elt->parent)
5410
{ $parent->append_extra_data( $extra_data); }
5412
{ $elt->twig->append_extra_data( $extra_data); }
5415
# extra_data_before_end_tag
5416
if( my $extra_data_before_end_tag= $elt->{extra_data_before_end_tag})
5417
{ if( my $parent= $elt->parent)
5418
{ $parent->{extra_data_before_end_tag}.= $extra_data_before_end_tag; }
5419
elsif( my $last_child= $elt->last_child)
5420
{ $last_child->{extra_data_before_end_tag}.= $extra_data_before_end_tag; }
5426
{ my %method= ( before => \&paste_before,
5427
after => \&paste_after,
5428
first_child => \&paste_first_child,
5429
last_child => \&paste_last_child,
5430
within => \&paste_within,
5433
# paste elt somewhere around ref
5434
# pos can be first_child (default), last_child, before, after or within
5438
{ croak "cannot paste an element that belongs to a tree"; }
5442
{ $pos= 'first_child';
5443
croak "wrong argument order in paste, should be $_[1] first" if($_[1]);
5448
if( my $method= $method{$pos})
5449
{ unless( ref $_[0]) { croak "Destination undefined in paste"; }
5450
unless( isa( $_[0], "XML::Twig::Elt"))
5451
{ croak "wrong argument type in paste, should be XML::Twig::Elt"; }
5452
# check here so error message lists the caller file/line
5453
if( !$ref->_parent && ($method=~ m{^(before|after)}) ) { croak "cannot paste $1 root"; }
5457
{ croak "tried to paste in wrong position '$pos', allowed positions " .
5458
" are 'first_child', 'last_child', 'before', 'after' and " .
5466
{ my( $elt, $ref)= @_;
5467
my( $parent, $prev_sibling, $next_sibling );
5468
unless( $ref->_parent) { croak "cannot paste before root"; }
5469
$parent= $ref->_parent;
5470
$prev_sibling= $ref->_prev_sibling;
5471
$next_sibling= $ref;
5473
$elt->set_parent( $parent);
5474
$parent->set_first_child( $elt) if( $parent->_first_child == $ref);
5476
$prev_sibling->set_next_sibling( $elt) if( $prev_sibling);
5477
$elt->set_prev_sibling( $prev_sibling);
5479
$next_sibling->set_prev_sibling( $elt);
5480
$elt->set_next_sibling( $ref);
5485
{ my( $elt, $ref)= @_;
5486
my( $parent, $prev_sibling, $next_sibling );
5487
unless( $ref->_parent) { croak "cannot paste after root"; }
5488
$parent= $ref->_parent;
5489
$prev_sibling= $ref;
5490
$next_sibling= $ref->_next_sibling;
5492
$elt->set_parent( $parent);
5493
$parent->set_last_child( $elt) if( $parent->_last_child== $ref);
5495
$prev_sibling->set_next_sibling( $elt);
5496
$elt->set_prev_sibling( $prev_sibling);
5498
$next_sibling->set_prev_sibling( $elt) if( $next_sibling);
5499
$elt->set_next_sibling( $next_sibling);
5504
sub paste_first_child
5505
{ my( $elt, $ref)= @_;
5506
my( $parent, $prev_sibling, $next_sibling );
5508
$next_sibling= $ref->_first_child;
5509
$ref->set_not_empty;
5511
$elt->set_parent( $parent);
5512
$parent->set_first_child( $elt);
5513
$parent->set_last_child( $elt) unless( $parent->_last_child);
5515
$elt->set_prev_sibling( undef);
5517
$next_sibling->set_prev_sibling( $elt) if( $next_sibling);
5518
$elt->set_next_sibling( $next_sibling);
5522
sub paste_last_child
5523
{ my( $elt, $ref)= @_;
5524
my( $parent, $prev_sibling, $next_sibling );
5526
$prev_sibling= $ref->_last_child;
5527
$ref->set_not_empty;
5529
$elt->set_parent( $parent);
5530
$parent->set_last_child( $elt);
5531
$parent->set_first_child( $elt) unless( $parent->_first_child);
5533
$elt->set_prev_sibling( $prev_sibling);
5534
$prev_sibling->set_next_sibling( $elt) if( $prev_sibling);
5536
$elt->set_next_sibling( undef);
5541
{ my( $elt, $ref, $offset)= @_;
5542
my $text= $ref->is_text ? $ref : $ref->next_elt( '#TEXT', $ref);
5543
my $new= $text->split_at( $offset);
5544
$elt->paste_before( $new);
5549
# load an element into a structure similar to XML::Simple's
5553
# normalize option names
5555
%options= map { my ($key, $val)= ($_, $options{$_});
5556
$key=~ s{(\w)([A-Z])}{$1_\L$2}g;
5561
my @allowed_options= qw( keyattr forcearray noattr content_key
5562
var var_regexp variables var_attr
5563
erase group_tags forcecontent
5564
normalise_space normalize_space
5566
my %allowed_options= map { $_ => 1 } @allowed_options;
5567
foreach my $option (keys %options)
5568
{ warn "invalid option $option\n" unless( $allowed_options{$option}); }
5570
$options{normalise_space} ||= $options{normalize_space} || 0;
5572
$options{content_key} ||= 'content';
5573
if( $options{content_key}=~ m{^-})
5574
{ # need to remove the - and to activate extra folding
5575
$options{content_key}=~ s{^-}{};
5576
$options{extra_folding}= 1;
5579
{ $options{extra_folding}= 0; }
5581
$options{forcearray} ||=0;
5582
if( isa( $options{forcearray}, 'ARRAY'))
5583
{ my %forcearray_tags= map { $_ => 1 } @{$options{forcearray}};
5584
$options{forcearray_tags}= \%forcearray_tags;
5585
$options{forcearray}= 0;
5588
$options{erase}||= $options{group_tags}; # for compat with XML::Simple
5589
if( isa( $options{erase}, 'ARRAY'))
5590
{ my %erase_tags= map { $_ => 1 } @{$options{erase}};
5591
$options{erase}= \%erase_tags;
5594
$options{keyattr} ||= ['name', 'key', 'id'];
5595
if( ref $options{keyattr} eq 'ARRAY')
5596
{ foreach my $keyattr (@{$options{keyattr}})
5597
{ my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)});
5599
$options{key_for_all}->{$att}= 1;
5600
$options{remove_key_for_all}->{$att}=1 unless( $prefix eq '+');
5601
$options{prefix_key_for_all}->{$att}=1 if( $prefix eq '-');
5604
elsif( ref $options{keyattr} eq 'HASH')
5605
{ while( my( $elt, $keyattr)= each %{$options{keyattr}})
5606
{ my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)});
5608
$options{key_for_elt}->{$elt}= $att;
5609
$options{remove_key_for_elt}->{"$elt#$att"}=1 unless( $prefix);
5610
$options{prefix_key_for_elt}->{"$elt#$att"}=1 if( $prefix eq '-');
5615
$options{var}||= $options{var_attr}; # for compat with XML::Simple
5616
if( $options{var}) { $options{var_values}= {}; }
5617
else { $options{var}=''; }
5619
if( $options{variables})
5620
{ $options{var}||= 1;
5621
$options{var_values}= $options{variables};
5624
if( $options{var_regexp} and !$options{var})
5625
{ warn "var option not used, var_regexp option ignored\n"; }
5626
$options{var_regexp} ||= '\$\{?(\w+)\}?';
5628
$elt->_simplify( \%options);
5633
{ my( $elt, $options)= @_;
5638
my @children= $elt->children;
5639
my %atts= $options->{noattr} ? () : %{$elt->atts};
5640
my $nb_atts= keys %atts;
5641
my $nb_children= $elt->children_count + $nb_atts;
5644
foreach (@children) { $nb_children{$_->tag}++; }
5645
foreach (keys %atts) { $nb_children{$_}++; }
5647
my $arrays; # tag => array where elements are stored
5651
foreach my $child (@children)
5652
{ if( $child->is_text)
5653
{ # generate with a content key
5654
my $text= $elt->_text_with_vars( $options);
5655
$text= _normalize_space( $text) if( $options->{normalise_space} >= 2);
5656
if( $options->{force_content}
5658
|| (scalar @children > 1)
5660
{ $data->{$options->{content_key}}= $text; }
5665
{ # element with sub elements
5666
my $child_gi= $child->gi;
5668
my $child_data= $child->_simplify( $options);
5670
# first see if we need to simplify further the child data
5671
# because of grouped tags
5672
if( my $grouped_tag= $options->{group_tags}->{$child_gi})
5673
{ # check that the child data is a hash with a single field
5674
unless( (ref( $child_data) eq 'HASH')
5675
&& (keys %$child_data == 1)
5676
&& defined ( my $grouped_child_data= $child_data->{$grouped_tag})
5678
{ die "error in grouped tag $child_gi"; }
5680
{ $child_data= $grouped_child_data; }
5682
# because of extra folding
5683
if( $options->{extra_folding})
5684
{ if( (ref( $child_data) eq 'HASH')
5685
&& (keys %$child_data == 1)
5686
&& defined( my $content= $child_data->{$options->{content_key}})
5688
{ $child_data= $content; }
5692
if( my $keyatt= $child->_key_attr( $options))
5693
{ # element with key
5694
my $key= $child->att( $keyatt);
5695
$key= _normalize_space( $key) if( $options->{normalise_space} >= 1);
5696
$data->{$child_gi}->{$key}= $child_data;
5698
elsif( $options->{forcearray}
5699
|| $options->{forcearray_tags}->{$child_gi}
5700
|| ( $nb_children{$child_gi} > 1)
5702
{ # element to store in an array
5703
$data->{$child_gi} ||= [];
5704
push @{$data->{$child_gi}}, $child_data;
5707
{ # element to store as a hash field
5708
$data->{$child_gi}= $child_data;
5715
# TODO: deal with att that already have an element by that name
5716
foreach my $att (keys %atts)
5717
{ # do not store if the att is a key that needs to be removed
5718
if( $options->{remove_key_for_all}->{$att}
5719
|| $options->{remove_key_for_elt}->{"$gi#$att"}
5723
my $att_text= _replace_vars_in_text( $atts{$att}, $options);
5724
$att_text= _normalize_space( $att_text) if( $options->{normalise_space} >= 2);
5726
if( $options->{prefix_key_for_all}->{$att}
5727
|| $options->{prefix_key_for_elt}->{"$gi#$att"}
5730
$data->{"-$att"}= $att_text;
5734
$data->{$att}= $att_text;
5742
{ my( $elt, $options)=@_;
5743
return if( $options->{noattr});
5744
if( $options->{key_for_all})
5745
{ foreach my $att ($elt->att_names)
5746
{ if( $options->{key_for_all}->{$att})
5750
elsif( $options->{key_for_elt})
5751
{ if( my $key_for_elt= $options->{key_for_elt}->{$elt->gi} )
5752
{ return $key_for_elt if( defined( $elt->att( $key_for_elt))); }
5758
{ my( $elt, $options)= @_;
5760
if( $options->{var})
5761
{ $text= _replace_vars_in_text( $elt->text, $options);
5762
$elt->_store_var( $options);
5765
{ $text= $elt->text; }
5770
sub _normalize_space
5772
$text=~ s{\s+}{ }sg;
5780
{ return 0 unless( my $atts= $_[0]->atts);
5781
return scalar keys %$atts;
5785
{ return 1 unless( my $atts= $_[0]->atts);
5786
return scalar keys %$atts ? 0 : 1;
5789
sub _replace_vars_in_text
5790
{ my( $text, $options)= @_;
5792
$text=~ s{($options->{var_regexp})}
5793
{ if( defined( my $value= $options->{var_values}->{$2}))
5796
{ warn "unknown variable $2\n";
5804
{ my( $elt, $options)= @_;
5805
if( defined (my $var_name= $elt->att( $options->{var})))
5806
{ $options->{var_values}->{$var_name}= $elt->text;
5811
# split a text element at a given offset
5813
{ my( $elt, $offset)= @_;
5814
my $text_elt= $elt->is_text ? $elt : $elt->first_child( TEXT) || return '';
5815
my $string= $text_elt->text;
5816
my $left_string= substr( $string, 0, $offset);
5817
my $right_string= substr( $string, $offset);
5818
$text_elt->set_pcdata( $left_string);
5819
my $new_elt= XML::Twig::Elt->new( $elt->gi, $right_string);
5820
$new_elt->paste( after => $elt);
5825
# split an element or its text descendants into several, in place
5826
# all elements (new and untouched) are returned
5831
if( $elt->is_text) { @text_chunks= ($elt); }
5832
else { @text_chunks= $elt->descendants( '#TEXT'); }
5833
foreach my $text_chunk (@text_chunks)
5834
{ push @result, $text_chunk->_split( 1, @_); }
5838
# split an element or its text descendants into several, in place
5839
# created elements (those which match the regexp) are returned
5844
if( $elt->is_text) { @text_chunks= ($elt); }
5845
else { @text_chunks= $elt->descendants( '#TEXT'); }
5846
foreach my $text_chunk (@text_chunks)
5847
{ push @result, $text_chunk->_split( 0, @_); }
5851
# split a single text element
5852
# return_all defines what is returned: if it is true
5853
# only returns the elements created by matches in the split regexp
5854
# otherwise all elements (new and untouched) are returned
5856
{ my $encode_is_loaded=0; # so we only load Encode once in 5.8.0+
5860
my $return_all= shift;
5861
my( $regexp, $tag, $atts)= @_;
5862
my @result; # the returned list of elements
5863
my $text= $elt->text;
5865
$tag||= $elt->parent( '#ELT')->gi; # default: same tag as the current one
5866
$atts ||= {}; # default: no attributes
5868
# 2 uses: if split matches then the first substring reuses $elt
5869
# once a split has occured then the last match needs to be put in
5871
my $previous_match= 0;
5873
while( my( $pre_match, @matches)= $text=~ /^(.*?)$regexp(.*)$/gcs)
5874
{ $text= pop @matches;
5875
if( $previous_match)
5876
{ # match, not the first one, create a new text ($gi) element
5877
$pre_match= _utf8_ify( $pre_match);
5878
$elt= $elt->insert_new_elt( after => $gi, $pre_match);
5879
push @result, $elt if( $return_all);
5882
{ # first match in $elt, re-use $elt for the first sub-string
5883
$elt->set_text( _utf8_ify( $pre_match));
5884
$previous_match++; # store the fact that there was a match
5885
push @result, $elt if( $return_all);
5888
# now deal with matches captured in the regexp
5890
{ # match, with capture
5891
foreach my $match (@matches)
5892
{ # create new element, text is the match
5893
my %atts= %$atts; # or the same atts is used for all matches!
5894
$match= _utf8_ify( $match);
5895
$elt= $elt->insert_new_elt( after => $tag, \%atts, $match);
5900
{ # match, no captures
5901
my %atts= %$atts; # or the same atts is used for all matches!
5902
$elt= $elt->insert_new_elt( after => $tag, \%atts);
5906
if( $previous_match && $text)
5907
{ # there was at least 1 match, and there is text left after the match
5908
$elt= $elt->insert_new_elt( after => $gi, $text);
5911
push @result, $elt if( $return_all);
5913
return @result; # return all elements
5916
# evil hack needed in 5.8.0, the utf flag is not set on $<n>...
5918
{ my $string= shift;
5919
if( $] == 5.008 and !keep_encoding())
5920
{ unless( $encode_is_loaded) { require Encode; import Encode; $encode_is_loaded++; }
5921
Encode::_utf8_on( $string); # the flag should be set but is not
5929
{ my %replace_sub; # cache for complex expressions (expression => sub)
5932
{ my( $elt, $regexp, $replace)= @_;
5934
my $replacement_string;
5935
my $is_string= _is_string( $replace);
5936
foreach my $text_elt ($elt->descendants_or_self( '#TEXT'))
5938
{ my $text= $text_elt->text;
5939
$text=~ s{$regexp}{ _replace_var( $replace, $1, $2, $3, $4, $5, $6, $7, $8, $9)}egx;
5940
$text_elt->set_text( $text);
5943
{ my $replace_sub= ( $replace_sub{$replace} ||= _install_replace_sub( $replace));
5944
my $text= $text_elt->text;
5945
my $pos=0; # used to skip text that was previously matched
5946
while( my( $pre_match_string, $match_string, @var)= ($text=~ m{(.*?)($regexp)}g))
5947
{ my $match_start = length( $pre_match_string);
5948
my $match = $text_elt->split_at( $match_start + $pos);
5949
my $match_length = length( $match_string);
5950
my $post_match = $match->split_at( $match_length);
5951
$replace_sub->( $match, @var);
5952
# merge previous text with current one
5954
if( ($next_sibling= $text_elt->_next_sibling)
5955
&& ($text_elt->gi eq $next_sibling->gi)
5957
{ $text_elt->merge_text( $next_sibling); }
5959
$text_elt= $post_match;
5960
$text= $post_match->text;
5961
# merge last text element with next one if needed,
5962
# the match will be against the non-matched text,
5963
# so $pos is used to skip the merged part
5965
if( ($prev_sibling= $post_match->_prev_sibling)
5966
&& ($post_match->gi eq $prev_sibling->gi)
5968
{ $pos= length( $prev_sibling->text);
5969
$post_match->merge_text( $prev_sibling);
5979
{ return ($_[0]=~ m{&e[ln]t}) ? 0: 1 }
5982
{ my( $string, @var)= @_;
5983
unshift @var, undef;
5984
$string=~ s{\$(\d)}{$var[$1]}g;
5988
sub _install_replace_sub
5989
{ my $replace_exp= shift;
5990
my @item= split m{(&e[ln]t\s*\([^)]*\))}, $replace_exp;
5991
my $sub= q{ my( $match, @var)= @_; unshift @var, undef; my $new; };
5993
foreach my $item (@item)
5994
{ if( $item=~ m{^&elt\s*\(([^)]*)\)})
5997
elsif( $item=~ m{^&ent\s*\(\s*([^\s)]*)\s*\)})
5998
{ $exp= " '#ENT' => $1"; }
6000
{ $exp= qq{ '#PCDATA' => "$item"}; }
6001
$exp=~ s{\$(\d)}{\$var[$1]}g; # replace references to matches
6002
$sub.= qq{ \$new= XML::Twig::Elt->new( $exp); };
6003
$sub .= q{ $new->paste( before => $match); };
6005
$sub .= q{ $match->delete; };
6007
my $coderef= eval "sub { $sub }";
6008
if( $@) { croak( "invalid replacement expression $replace_exp: ",$@); }
6016
{ my( $e1, $e2)= @_;
6017
croak "invalid merge: can only merge 2 elements"
6018
unless( isa( $e2, 'XML::Twig::Elt'));
6019
croak "invalid merge: can only merge 2 text elements"
6020
unless( $e1->is_text && $e2->is_text && ($e1->gi eq $e2->gi));
6021
$e1->set_text( $e1->text . $e2->text);
6027
# recursively copy an element and returns the copy (can be huge and long)
6030
my $copy= XML::Twig::Elt->new( $elt->gi);
6032
$copy->set_extra_data( $elt->extra_data) if( $elt->extra_data);
6033
# should create methods on extra_data_before_end_tag
6034
$copy->{extra_data_before_end_tag}= $elt->{extra_data_before_end_tag}
6035
if( $elt->{extra_data_before_end_tag});
6036
$copy->set_asis if( $elt->is_asis);
6037
if( $elt->is_empty) { $copy->set_empty; } # do not swap or speedup will mess up this
6039
if( $elt->is_pcdata)
6040
{ $copy->set_pcdata( $elt->pcdata);
6041
$copy->{extra_data_in_pcdata}= $elt->{extra_data_in_pcdata} if( $elt->{extra_data_in_pcdata});
6043
elsif( $elt->is_cdata)
6044
{ $copy->set_cdata( $elt->cdata);
6045
$copy->{extra_data_in_pcdata}= $elt->{extra_data_in_pcdata} if( $elt->{extra_data_in_pcdata});
6048
{ $copy->set_pi( $elt->target, $elt->data); }
6049
elsif( $elt->is_comment)
6050
{ $copy->set_comment( $elt->comment); }
6051
elsif( $elt->is_ent)
6052
{ $copy->set_ent( $elt->ent); }
6054
{ my @children= $elt->children;
6055
if( my $atts= $elt->atts)
6056
{ my %atts= %{$atts}; # we want to do a real copy of the attributes
6057
$copy->set_atts( \%atts);
6059
foreach my $child (@children)
6060
{ my $child_copy= $child->copy;
6061
$child_copy->paste( 'last_child', $copy);
6070
$elt->DESTROY unless( $XML::Twig::weakrefs);
6077
my $t= shift || $elt->twig; # optional argument, passed in recursive calls
6078
return if( $XML::Twig::weakrefs);
6080
foreach( @{[$elt->children]}) { $_->DESTROY( $t); }
6082
# the id reference needs to be destroyed
6083
# lots of tests to avoid warnings during the cleanup phase
6084
$elt->del_id( $t) if( $ID && $t && defined( $elt->{att}) && exists( $elt->{att}->{$ID}));
6090
# to be called only from a start_tag_handler: ignores the current element
6094
$t->ignore( $elt, @_);
6101
my $empty_tag_style = 0;
6102
my $remove_cdata = 0;
6103
my $keep_encoding = 0;
6104
my $expand_external_entities = 0;
6105
my $keep_atts_order = 0;
6106
my $do_not_escape_amp_in_atts = 0;
6108
my ($NSGMLS, $NICE, $INDENTED, $INDENTEDC, $RECORD1, $RECORD2)= (1..6);
6110
my %pretty_print_style=
6111
( none => 0, # no added \n
6112
nsgmls => $NSGMLS, # nsgmls-style, \n in tags
6113
# below this line styles are UNSAFE (the generated XML can be invalid)
6114
nice => $NICE, # \n after open/close tags except when the
6115
# element starts with text
6116
indented => $INDENTED, # nice plus idented
6117
indented_c => $INDENTEDC, # slightly more compact than indented (closing
6118
# tags are on the same line)
6119
record_c => $RECORD1, # for record-like data (compact)
6120
record => $RECORD2, # for record-like data (not so compact)
6123
my ($HTML, $EXPAND)= (1..2);
6124
my %empty_tag_style=
6125
( normal => 0, # <tag/>
6126
html => $HTML, # <tag />
6127
xhtml => $HTML, # <tag />
6128
expand => $EXPAND, # <tag></tag>
6137
my $xml_space_preserve; # set when an element includes xml:space="preserve"
6139
my $output_filter; # filters the entire output (including < and >)
6140
my $output_text_filter; # filters only the text part (tag names, attributes, pcdata)
6143
# returns those pesky "global" variables so you can switch between twigs
6146
{ pretty => $pretty,
6149
empty_tag_style => $empty_tag_style,
6150
remove_cdata => $remove_cdata,
6151
keep_encoding => $keep_encoding,
6152
expand_external_entities => $expand_external_entities,
6153
output_filter => $output_filter,
6154
output_text_filter => $output_text_filter,
6155
keep_atts_order => $keep_atts_order,
6156
do_not_escape_amp_in_atts => $do_not_escape_amp_in_atts,
6160
# restores the global variables
6161
sub set_global_state
6163
$pretty = $state->{pretty};
6164
$quote = $state->{quote};
6165
$INDENT = $state->{indent};
6166
$empty_tag_style = $state->{empty_tag_style};
6167
$remove_cdata = $state->{remove_cdata};
6168
$keep_encoding = $state->{keep_encoding};
6169
$expand_external_entities = $state->{expand_external_entities};
6170
$output_filter = $state->{output_filter};
6171
$output_text_filter = $state->{output_text_filter};
6172
$keep_atts_order = $state->{keep_atts_order};
6173
$do_not_escape_amp_in_atts = $state->{do_not_escape_amp_in_atts};
6176
# sets global state to defaults
6177
sub init_global_state
6182
empty_tag_style => 0,
6185
expand_external_entities => 0,
6186
output_filter => undef,
6187
output_text_filter => undef,
6188
keep_atts_order => undef,
6189
do_not_escape_amp_in_atts => 0,
6194
# set the pretty_print style (in $pretty) and returns the old one
6195
# can be called from outside the package with 2 arguments (elt, style)
6196
# or from inside with only one argument (style)
6197
# the style can be either a string (one of the keys of %pretty_print_style
6198
# or a number (presumably an old value saved)
6199
sub set_pretty_print
6200
{ my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases
6201
my $old_pretty= $pretty;
6202
if( $style=~ /^\d+$/)
6203
{ croak "invalid pretty print style $style"
6204
unless( $style < keys %pretty_print_style);
6208
{ croak "invalid pretty print style '$style'"
6209
unless( exists $pretty_print_style{$style});
6210
$pretty= $pretty_print_style{$style};
6216
# set the empty tag style (in $empty_tag_style) and returns the old one
6217
# can be called from outside the package with 2 arguments (elt, style)
6218
# or from inside with only one argument (style)
6219
# the style can be either a string (one of the keys of %empty_tag_style
6220
# or a number (presumably an old value saved)
6221
sub set_empty_tag_style
6222
{ my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases
6223
my $old_style= $empty_tag_style;
6224
if( $style=~ /^\d+$/)
6225
{ croak "invalid empty tag style $style"
6226
unless( $style < keys %empty_tag_style);
6227
$empty_tag_style= $style;
6230
{ croak "invalid empty tag style '$style'"
6231
unless( exists $empty_tag_style{$style});
6232
$empty_tag_style= $empty_tag_style{$style};
6238
{ my $style= $_[1] || $_[0];
6239
my $old_quote= $quote;
6240
croak "invalid quote '$style'" unless( exists $quote_style{$style});
6241
$quote= $quote_style{$style};
6245
sub set_remove_cdata
6246
{ my $new_value= defined $_[1] ? $_[1] : $_[0];
6247
my $old_value= $remove_cdata;
6248
$remove_cdata= $new_value;
6254
{ my $new_value= defined $_[1] ? $_[1] : $_[0];
6255
my $old_value= $INDENT;
6256
$INDENT= $new_value;
6260
sub set_keep_encoding
6261
{ my $new_value= defined $_[1] ? $_[1] : $_[0];
6262
my $old_value= $keep_encoding;
6263
$keep_encoding= $new_value;
6267
sub set_do_not_escape_amp_in_atts
6268
{ my $new_value= defined $_[1] ? $_[1] : $_[0];
6269
my $old_value= $do_not_escape_amp_in_atts;
6270
$do_not_escape_amp_in_atts= $new_value;
6274
sub keep_encoding { return $keep_encoding; } # so I can use elsewhere in the module
6276
sub output_filter { return $output_filter; }
6277
sub output_text_filter { return $output_text_filter; }
6279
sub set_output_filter
6280
{ my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode
6281
# if called in object mode with no argument, the filter is undefined
6282
if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; }
6283
my $old_value= $output_filter;
6284
if( !$new_value || isa( $new_value, 'CODE') )
6285
{ $output_filter= $new_value; }
6286
elsif( $new_value eq 'latin1')
6287
{ $output_filter= XML::Twig::latin1();
6289
elsif( $XML::Twig::filter{$new_value})
6290
{ $output_filter= $XML::Twig::filter{$new_value}; }
6292
{ croak "invalid output filter '$new_value'"; }
6297
sub set_output_text_filter
6298
{ my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode
6299
# if called in object mode with no argument, the filter is undefined
6300
if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; }
6301
my $old_value= $output_text_filter;
6302
if( !$new_value || isa( $new_value, 'CODE') )
6303
{ $output_text_filter= $new_value; }
6304
elsif( $new_value eq 'latin1')
6305
{ $output_text_filter= XML::Twig::latin1();
6307
elsif( $XML::Twig::filter{$new_value})
6308
{ $output_text_filter= $XML::Twig::filter{$new_value}; }
6310
{ croak "invalid output text filter '$new_value'"; }
6315
sub set_expand_external_entities
6316
{ my $new_value= defined $_[1] ? $_[1] : $_[0];
6317
my $old_value= $expand_external_entities;
6318
$expand_external_entities= $new_value;
6322
sub set_keep_atts_order
6323
{ my $new_value= defined $_[1] ? $_[1] : $_[0];
6324
my $old_value= $keep_atts_order;
6325
$keep_atts_order= $new_value;
6330
sub keep_atts_order { return $keep_atts_order; } # so I can use elsewhere in the module
6332
# $elt is an element to print
6333
# $pretty is an optional value, if true a \n is printed after the <
6335
my %empty_should_be_expanded= ( script => 1);
6340
return if( $elt->{gi}<$XML::Twig::SPECIAL_GI);
6342
my $extra_data= $elt->{extra_data} || '';
6346
my $ns_map= $elt->att( '#original_gi');
6347
if( $ns_map) { $gi= _restore_original_prefix( $ns_map, $gi); }
6348
$gi=~ s{^#default:}{}; # remove default prefix
6350
if( $output_text_filter) { $gi= $output_text_filter->( $gi); }
6354
# get the attribute and their values
6355
my $att= $elt->atts;
6357
{ foreach my $att_name ( $keep_atts_order ? keys %{$att} : sort keys %{$att})
6358
{ # skip private attributes (they start with #)
6359
next if( _is_private_name( $att_name));
6361
$tag .= $pretty==$NSGMLS ? "\n" : ' ';
6363
my $output_att_name= $ns_map ? _restore_original_prefix( $ns_map, $att_name) : $att_name;
6364
if( $output_text_filter) { $output_att_name= $output_text_filter->( $output_att_name); }
6366
$tag .= $output_att_name . '=' . $quote . $elt->att_xml_string( $att_name, $quote) . $quote;
6370
$tag .= "\n" if($pretty==$NSGMLS);
6372
if( $elt->{empty} && !$elt->{extra_data_before_end_tag})
6373
{ if( !$empty_tag_style)
6375
elsif( ($empty_tag_style eq $HTML) && ! $empty_should_be_expanded{$elt->gi})
6377
else # $empty_tag_style eq $EXPAND
6378
{ $tag .= "></" . $elt->gi .">"; }
6383
if( $elt->_is_private) { $tag= ''; }
6385
unless( $pretty) { return $extra_data . $tag }
6388
my $return=''; # '' or \n is to be printed before the tag
6389
my $indent=0; # number of indents before the tag
6391
if( $pretty==$RECORD1)
6392
{ my $level= $elt->level;
6393
$return= "\n" if( $level < 2);
6394
$indent= 1 if( $level == 1);
6397
elsif( $pretty==$RECORD2)
6399
$indent= $elt->level;
6402
elsif( $pretty==$NICE)
6403
{ my $parent= $elt->_parent;
6404
unless( !$parent || $parent->{contains_text})
6406
$elt->{contains_text}= 1 if( ($parent && $parent->{contains_text})
6407
|| $elt->contains_text);
6410
elsif( ($pretty==$INDENTED) || ($pretty==$INDENTEDC))
6411
{ my $parent= $elt->_parent;
6412
unless( !$parent || $parent->{contains_text})
6414
$indent= $elt->level;
6416
$elt->{contains_text}= 1 if( ($parent && $parent->{contains_text})
6417
|| $elt->contains_text);
6420
if( $return || $indent)
6421
{ # check for elements in which spaces should be kept
6423
return $extra_data . $tag if( $xml_space_preserve);
6424
if( $t && $t->{twig_keep_spaces_in})
6425
{ foreach my $ancestor ($elt->ancestors)
6426
{ return $extra_data . $tag if( $t->{twig_keep_spaces_in}->{$ancestor->gi}) }
6429
$prefix= $INDENT x $indent;
6431
{ $extra_data=~ s{\s+$}{};
6432
$extra_data=~ s{^\s+}{};
6433
$extra_data= $prefix . $extra_data . $return;
6438
return $return . $extra_data . $prefix . $tag;
6443
return '' if( ($elt->{gi}<$XML::Twig::SPECIAL_GI)
6444
|| ($elt->is_empty && !$elt->{extra_data_before_end_tag})
6449
if( my $map= $elt->att( '#original_gi')) { $gi= _restore_original_prefix( $map, $gi); }
6450
$gi=~ s{^#default:}{}; # remove default prefix
6452
if( $output_text_filter) { $gi= $output_text_filter->( $elt->gi); }
6455
$tag = ($elt->{extra_data_before_end_tag} || '') . $tag;
6457
if( $elt->_is_private) { $tag= ''; }
6459
return $tag unless $pretty;
6462
my $return=0; # 1 if a \n is to be printed before the tag
6463
my $indent=0; # number of indents before the tag
6465
if( $pretty==$RECORD1)
6466
{ $return= 1 if( $elt->level == 0);
6469
elsif( $pretty==$RECORD2)
6470
{ unless( $elt->contains_text)
6472
$indent= $elt->level;
6476
elsif( $pretty==$NICE)
6477
{ my $parent= $elt->_parent;
6478
if( ( ($parent && !$parent->{contains_text}) || !$parent )
6479
&& ( !$elt->{contains_text}
6480
&& ($elt->{has_flushed_child} || $elt->_first_child())
6486
elsif( $pretty==$INDENTED)
6487
{ my $parent= $elt->_parent;
6488
if( ( ($parent && !$parent->{contains_text}) || !$parent )
6489
&& ( !$elt->{contains_text}
6490
&& ($elt->{has_flushed_child} || $elt->_first_child())
6494
$indent= $elt->level;
6498
if( $return || $indent)
6499
{ # check for elements in which spaces should be kept
6501
return $tag if( $xml_space_preserve);
6502
if( $t && $t->{twig_keep_spaces_in})
6503
{ foreach my $ancestor ($elt, $elt->ancestors)
6504
{ return $tag if( $t->{twig_keep_spaces_in}->{$ancestor->gi}) }
6507
$prefix= "\n" if( $return);
6508
$prefix.= $INDENT x $indent;
6511
# add a \n at the end of the document (after the root element)
6512
$tag .= "\n" unless( $elt->parent);
6514
return $prefix . $tag;
6517
sub _restore_original_prefix
6518
{ my( $map, $name)= @_;
6519
my $prefix= _ns_prefix( $name);
6520
if( my $original_prefix= $map->{$prefix})
6521
{ if( $original_prefix eq '#default')
6522
{ $name=~ s{^$prefix:}{}; }
6524
{ $name=~ s{^$prefix(?=:)}{$original_prefix}; }
6529
# $elt is an element to print
6530
# $fh is an optional filehandle to print to
6531
# $pretty is an optional value, if true a \n is printed after the < of the
6537
my $fh= _is_fh( $_[0]) ? shift : undef;
6538
my $old_select= defined $fh ? select $fh : undef;
6539
my $old_pretty= defined ($pretty= shift) ? set_pretty_print( $pretty) : undef;
6541
$xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
6546
$xml_space_preserve= 0;
6548
select $old_select if( defined $old_select);
6549
set_pretty_print( $old_pretty) if( defined $old_pretty);
6553
# same as output but does not output the start tag if the element
6554
# is marked as flushed
6557
$elt->twig->flush( @_);
6564
my $fh= _is_fh( $_[0]) ? shift : undef;
6565
my $old_select= defined $fh ? select $fh : undef;
6566
my $old_pretty= defined ($pretty= shift) ? set_pretty_print( $pretty) : undef;
6568
$xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
6572
$xml_space_preserve= 0;
6574
select $old_select if( defined $old_select);
6575
set_pretty_print( $old_pretty) if( defined $old_pretty);
6581
# in case there's some comments or PI's piggybacking
6582
#if( $elt->{extra_data})
6583
# { print $output_filter ? $output_filter->($elt->{extra_data})
6584
# : $elt->{extra_data};
6587
if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
6588
{ my $preserve= ($elt->att( 'xml:space') || '') eq 'preserve';
6589
$xml_space_preserve++ if $preserve;
6590
unless( $elt->_flushed)
6591
{ print $elt->start_tag();
6594
# flush the children
6595
my @children= $elt->children;
6596
foreach my $child (@children)
6597
{ $child->_flush( $pretty);
6599
print $elt->end_tag;
6600
$xml_space_preserve-- if $preserve;
6601
# used for pretty printing
6602
if( my $parent= $elt->parent) { $parent->{has_flushed_child}= 1; }
6604
else # text or special element
6606
if( $elt->is_pcdata) { $text= $elt->pcdata_xml_string;
6607
if( my $parent= $elt->parent)
6608
{ $parent->{contains_text}= 1; }
6610
elsif( $elt->is_cdata) { $text= $elt->cdata_string;
6611
if( my $parent= $elt->parent)
6612
{ $parent->{contains_text}= 1; }
6614
elsif( $elt->is_pi) { $text= $elt->pi_string; }
6615
elsif( $elt->is_comment) { $text= $elt->comment_string; }
6616
elsif( $elt->is_ent) { $text= $elt->ent_string; }
6618
print $output_filter ? $output_filter->( $text) : $text;
6627
if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
6628
{ # sprint the children
6629
my $child= $elt->_first_child||'';
6631
{ $string.= $child->xml_text;
6632
$child= $child->_next_sibling;
6635
elsif( $elt->is_pcdata) { $string .= $output_filter ? $output_filter->($elt->pcdata_xml_string)
6636
: $elt->pcdata_xml_string;
6638
elsif( $elt->is_cdata) { $string .= $output_filter ? $output_filter->($elt->cdata_xml_string)
6639
: $elt->cdata_string;
6641
elsif( $elt->is_ent) { $string .= $elt->ent_string; }
6647
# same as print but except... it does not print but rather returns the string
6648
# if the second parameter is set then only the content is returned, not the
6649
# start and end tags of the element (but the tags of the included elements are
6653
$xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
6654
my $sprint= $output_filter ? $output_filter->( $elt->_sprint( @_)) : $elt->_sprint( @_);
6655
$xml_space_preserve= 0;
6661
my $no_tag= shift || 0;
6662
# in case there's some comments or PI's piggybacking
6664
#if( $elt->{extra_data} && !$no_tag)
6665
# { $string= $elt->{extra_data};
6668
if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
6670
my $preserve= ($elt->att( 'xml:space') || '') eq 'preserve';
6671
$xml_space_preserve++ if $preserve;
6673
$string.= $elt->start_tag unless( $no_tag);
6675
# sprint the children
6676
my $child= $elt->_first_child||'';
6678
{ $string.= $child->_sprint;
6679
$child= $child->_next_sibling;
6681
$string.= $elt->end_tag unless( $no_tag);
6682
$xml_space_preserve-- if $preserve;
6685
{ $string .= $elt->{extra_data} || '';
6686
if( $elt->is_pcdata) { $string .= $elt->pcdata_xml_string; }
6687
elsif( $elt->is_cdata) { $string .= $elt->cdata_string; }
6688
elsif( $elt->is_pi) { $string .= $elt->pi_string; }
6689
elsif( $elt->is_comment) { $string .= $elt->comment_string; }
6690
elsif( $elt->is_ent) { $string .= $elt->ent_string; }
6696
# just a shortcut to $elt->sprint( 1)
6698
{ $_[0]->sprint( 1); }
6700
sub pcdata_xml_string
6702
if( defined( my $string= $elt->{pcdata}) )
6704
if( $elt->{extra_data_in_pcdata})
6705
{ _gen_mark( $string); # used by _(un)?protect_extra_data
6706
foreach my $data (reverse @{$elt->{extra_data_in_pcdata}})
6707
{ my $substr= substr( $string, $data->{offset});
6708
if( $keep_encoding || $elt->{asis})
6709
{ substr( $string, $data->{offset}, 0, $data->{text}); }
6711
{ substr( $string, $data->{offset}, 0, _protect_extra_data( $data->{text})); }
6713
unless( $keep_encoding || $elt->{asis})
6714
{ $string=~ s/([&<])/$XML::Twig::base_ent{$1}/g ;
6715
_unprotect_extra_data( $string);
6719
{ $string=~ s/([&<])/$XML::Twig::base_ent{$1}/g unless( $keep_encoding || $elt->{asis}); }
6720
return $output_text_filter ? $output_text_filter->( $string) : $string;
6727
my( %char2ent, %ent2char);
6729
{ %char2ent= ( '<' => 'lt', '&' => 'amp');
6730
%ent2char= ( 'lt' => '<', 'amp' => '&');
6733
# generate a unique mark (a string) not found in the string,
6734
# used to mark < and & in the extra data
6737
$mark++ while( index( $_[0], $mark) > -1);
6741
sub _protect_extra_data
6742
{ my( $extra_data)= @_;
6743
$extra_data=~ s{([&<])}{:$mark:$char2ent{$1}:}g;
6747
sub _unprotect_extra_data
6748
{ $_[0]=~ s{:$mark:(\w+):}{$ent2char{$1}}g; }
6753
{ my $cdata= $_[0]->cdata;
6754
unless( defined $cdata) { return ''; }
6756
{ $cdata=~ s/([&<])/$XML::Twig::base_ent{$1}/g; }
6758
{ $cdata= CDATA_START . $cdata . CDATA_END; }
6765
my $quote= shift || '"';
6766
if( defined (my $string= $elt->{att}->{$att}))
6767
{ unless( $keep_encoding)
6768
{ if( $do_not_escape_amp_in_atts)
6769
{ $string=~ s{([$quote<])}{$XML::Twig::base_ent{$1}}g;
6770
$string=~ s{&(?!(\w+|#\d+|[xX][0-9a-fA-F]+);)}{&}g; # dodgy: escape & that do not start an entity
6773
{ $string=~ s{([$quote<&])}{$XML::Twig::base_ent{$1}}g; }
6775
return $output_text_filter ? $output_text_filter->( $string) : $string;
6783
my $ent_text= $ent->{ent};
6784
my( $t, $el, $ent_string);
6785
if( $expand_external_entities
6787
&& ($el= $t->entity_list)
6788
&& ($ent_string= $el->{$ent->ent_name}->{val})
6790
{ return $ent_string; }
6795
# returns just the text, no tags, for an element
6800
if( $elt->is_pcdata) { return $elt->pcdata; }
6801
elsif( $elt->is_cdata) { return $elt->cdata; }
6802
elsif( $elt->is_pi) { return $elt->pi_string;}
6803
elsif( $elt->is_comment) { return $elt->comment; }
6804
elsif( $elt->is_ent) { return $elt->ent ; }
6806
my $child= $elt->_first_child ||'';
6808
{ my $child_text= $child->text;
6809
$string.= defined( $child_text) ? $child_text : '';
6810
$child= $child->_next_sibling;
6812
unless( defined $string) { $string=''; }
6814
return $output_text_filter ? $output_text_filter->( $string) : $string;
6819
my $text= $elt->text;
6820
$text=~ s{\s+}{ }sg;
6826
# remove cdata sections (turns them into regular pcdata) in an element
6829
foreach my $cdata ($elt->descendants_or_self( CDATA))
6830
{ if( $keep_encoding)
6831
{ my $data= $cdata->cdata;
6832
$data=~ s{([&<"'])}{$XML::Twig::base_ent{$1}}g;
6833
$cdata->set_pcdata( $data);
6836
{ $cdata->set_pcdata( $cdata->cdata); }
6837
$cdata->set_gi( PCDATA);
6838
undef $cdata->{cdata};
6842
sub _is_private { return _is_private_name( $_[0]->gi); }
6843
sub _is_private_name { return $_[0]=~ m{^#(?!default:)}; }
6846
} # end of block containing package globals ($pretty_print, $quotes, keep_encoding...)
6849
# SAX export methods
6851
{ _toSAX(@_, \&_start_tag_data_SAX1, \&_end_tag_data_SAX1); }
6854
{ _toSAX(@_, \&_start_tag_data_SAX2, \&_end_tag_data_SAX2); }
6857
{ my( $elt, $handler, $start_tag_data, $end_tag_data)= @_;
6858
if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
6859
{ my $data= $start_tag_data->( $elt);
6860
_start_prefix_mapping( $elt, $handler, $data);
6861
if( $data && (my $start_element = $handler->can( 'start_element')))
6862
{ $start_element->( $handler, $data) unless( $elt->_flushed); }
6864
foreach my $child ($elt->children)
6865
{ $child->_toSAX( $handler, $start_tag_data, $end_tag_data); }
6867
if( (my $data= $end_tag_data->( $elt)) && (my $end_element = $handler->can( 'end_element')) )
6868
{ $end_element->( $handler, $data); }
6869
_end_prefix_mapping( $elt, $handler);
6871
else # text or special element
6872
{ if( $elt->is_pcdata && (my $characters= $handler->can( 'characters')))
6873
{ $characters->( $handler, { Data => $elt->pcdata }); }
6874
elsif( $elt->is_cdata)
6875
{ if( my $start_cdata= $handler->can( 'start_cdata'))
6876
{ $start_cdata->( $handler); }
6877
if( my $characters= $handler->can( 'characters'))
6878
{ $characters->( $handler, {Data => $elt->cdata }); }
6879
if( my $end_cdata= $handler->can( 'end_cdata'))
6880
{ $end_cdata->( $handler); }
6882
elsif( ($elt->is_pi) && (my $pi= $handler->can( 'processing_instruction')))
6883
{ $pi->( $handler, { Target =>$elt->target, Data => $elt->data }); }
6884
elsif( ($elt->is_comment) && (my $comment= $handler->can( 'comment')))
6885
{ $comment->( $handler, { Data => $elt->comment }); }
6886
elsif( ($elt->is_ent))
6888
if( my $se= $handler->can( 'skipped_entity'))
6889
{ $se->( $handler, { Name => $elt->ent_name }); }
6890
elsif( my $characters= $handler->can( 'characters'))
6891
{ if( defined $elt->ent_string)
6892
{ $characters->( $handler, {Data => $elt->ent_string}); }
6894
{ $characters->( $handler, {Data => $elt->ent_name}); }
6901
sub _start_tag_data_SAX1
6904
return if( $elt->_is_private);
6906
my $atts= $elt->atts;
6907
while( my( $att, $value)= each %$atts)
6908
{ $attributes->{$att}= $value unless( _is_private_name( $att)); }
6909
my $data= { Name => $name, Attributes => $attributes};
6913
sub _end_tag_data_SAX1
6915
return if( $elt->_is_private);
6916
return { Name => $elt->gi };
6919
sub _start_tag_data_SAX2
6924
return if( $elt->_is_private);
6925
$data->{Name} = $name;
6926
$data->{Prefix} = $elt->ns_prefix;
6927
$data->{LocalName} = $elt->local_name;
6928
$data->{NamespaceURI} = $elt->namespace;
6930
# save a copy of the data so we can re-use it for the end tag
6931
my %sax2_data= %$data;
6932
$elt->{twig_elt_SAX2_data}= \%sax2_data;
6934
# add the attributes
6935
$data->{Attributes}= $elt->_atts_to_SAX2;
6943
foreach my $att (keys %{$elt->atts})
6945
next if( _is_private_name( $att));
6947
$SAX2_att->{Name} = $att;
6948
$SAX2_att->{Prefix} = _ns_prefix( $att);
6949
$SAX2_att->{LocalName} = _local_name( $att);
6950
$SAX2_att->{NamespaceURI} = $elt->namespace( $SAX2_att->{Prefix});
6951
$SAX2_att->{Value} = $elt->att( $att);
6952
my $SAX2_att_name= "{$SAX2_att->{NamespaceURI}}$SAX2_att->{LocalName}";
6954
$SAX2_atts->{$SAX2_att_name}= $SAX2_att;
6959
sub _start_prefix_mapping
6960
{ my( $elt, $handler, $data)= @_;
6961
if( my $start_prefix_mapping= $handler->can( 'start_prefix_mapping')
6962
and my @new_prefix_mappings= grep { /^\{[^}]*\}xmlns/ || /^\{$XMLNS_URI\}/ } keys %{$data->{Attributes}}
6964
{ foreach my $prefix (@new_prefix_mappings)
6965
{ my $prefix_string= $data->{Attributes}->{$prefix}->{LocalName};
6966
if( $prefix_string eq 'xmlns') { $prefix_string=''; }
6968
{ Prefix => $prefix_string,
6969
NamespaceURI => $data->{Attributes}->{$prefix}->{Value}
6971
$start_prefix_mapping->( $handler, $prefix_data);
6972
$elt->{twig_end_prefix_mapping} ||= [];
6973
push @{$elt->{twig_end_prefix_mapping}}, $prefix_string;
6978
sub _end_prefix_mapping
6979
{ my( $elt, $handler)= @_;
6980
if( my $end_prefix_mapping= $handler->can( 'end_prefix_mapping'))
6981
{ foreach my $prefix (@{$elt->{twig_end_prefix_mapping}})
6982
{ $end_prefix_mapping->( $handler, { Prefix => $prefix} ); }
6986
sub _end_tag_data_SAX2
6988
return if( $elt->_is_private);
6989
return $elt->{twig_elt_SAX2_data};
6994
#start-extract twig_node
6997
my $child= $elt->_first_child;
6999
{ return 1 if( $child->is_text || $child->is_ent);
7000
$child= $child->_next_sibling;
7005
#end-extract twig_node
7007
# creates a single pcdata element containing the text as child of the element
7009
# - force_pcdata: when set to a true value forces the text to be in a#PCDATA
7010
# even if the original element was a #CDATA
7012
{ my( $elt, $string, %option)= @_;
7014
if( $elt->gi eq PCDATA)
7015
{ return $elt->set_pcdata( $string); }
7016
elsif( $elt->gi eq CDATA)
7017
{ if( $option{force_pcdata})
7018
{ $elt->set_gi( PCDATA);
7019
$elt->set_cdata('');
7020
return $elt->set_pcdata( $string);
7023
{ return $elt->set_cdata( $string); }
7026
foreach my $child (@{[$elt->children]})
7029
my $pcdata= XML::Twig::Elt->new( PCDATA, $string);
7030
$pcdata->paste( $elt);
7032
$elt->set_not_empty;
7037
# set the content of an element from a list of strings and elements
7041
return $elt unless defined $_[0];
7043
# attributes can be given as a hash (passed by ref)
7044
if( ref $_[0] eq 'HASH')
7046
$elt->del_atts; # usually useless but better safe than sorry
7047
$elt->set_atts( $atts);
7048
return $elt unless defined $_[0];
7051
# check next argument for #EMPTY
7052
if( !(ref $_[0]) && ($_[0] eq EMPTY) )
7053
{ $elt->set_empty; return $elt; }
7055
# case where we really want to do a set_text, the element is '#PCDATA'
7056
# and we only want to add text in it
7057
if( ($elt->gi eq PCDATA) && ($#_ == 0) && !( ref $_[0]))
7058
{ $elt->set_pcdata( $_[0]);
7061
elsif( ($elt->gi eq CDATA) && ($#_ == 0) && !( ref $_[0]))
7062
{ $elt->set_cdata( $_[0]);
7066
# delete the children
7067
# WARNING: potential problem here if the children are used
7068
# somewhere else (where?). Will be solved when I use weak refs
7069
foreach my $child (@{[$elt->children]})
7072
foreach my $child (@_)
7073
{ if( isa( $child, 'XML::Twig::Elt'))
7074
{ # argument is an element
7075
$child->paste( 'last_child', $elt);
7078
{ # argument is a string
7079
if( (my $pcdata= $elt->last_child) && $elt->last_child->is_pcdata)
7080
{ # previous child is also pcdata: just concatenate
7081
$pcdata->set_pcdata( $pcdata->pcdata . $child)
7084
{ # previous child is not a string: creat a new pcdata element
7085
$pcdata= XML::Twig::Elt->new( PCDATA, $child);
7086
$pcdata->paste( 'last_child', $elt);
7091
$elt->set_not_empty;
7096
# inserts an element (whose gi is given) as child of the element
7097
# all children of the element are now children of the new element
7098
# returns the new element
7100
{ my ($elt, @args)= @_;
7101
# first cut the children
7102
my @children= $elt->children;
7103
foreach my $child (@children)
7106
while( my $gi= shift @args)
7107
{ my $new_elt= XML::Twig::Elt->new( $gi);
7108
# add attributes if needed
7109
if( defined( $args[0]) && ( isa( $args[0], 'HASH')) )
7110
{ $new_elt->set_atts( shift @args); }
7112
$new_elt->paste( $elt);
7113
$elt->set_not_empty;
7116
# paste back the children
7117
foreach my $child (@children)
7118
{ $child->paste( 'last_child', $elt); }
7122
# insert a new element
7123
# $elt->insert_new_element( $opt_position, $gi, $opt_atts_hash, @opt_content);
7124
# the element is created with the same syntax as new
7125
# position is the same as in paste, first_child by default
7128
my $position= $_[0];
7129
if( ($position eq 'before') || ($position eq 'after')
7130
|| ($position eq 'first_child') || ($position eq 'last_child'))
7133
{ $position= 'first_child'; }
7135
my $new_elt= $elt->new( @_);
7136
$new_elt->paste( $position, $elt);
7140
# wraps an element in elements which gi's are given as arguments
7141
# $elt->wrap_in( 'td', 'tr', 'table') wraps the element as a single
7142
# cell in a table for example
7143
# returns the new element
7146
while( my $gi = shift @_)
7147
{ my $new_elt = XML::Twig::Elt->new( $gi);
7148
if( $elt->{twig_current})
7149
{ my $t= $elt->twig;
7150
$t->{twig_current}= $new_elt;
7151
$elt->del_twig_current;
7152
$new_elt->set_twig_current;
7155
if( my $parent= $elt->_parent)
7156
{ $new_elt->set_parent( $parent);
7157
$parent->set_first_child( $new_elt) if( $parent->_first_child == $elt);
7158
$parent->set_last_child( $new_elt) if( $parent->_last_child == $elt);
7161
{ # wrapping the root
7162
my $twig= $elt->twig;
7163
if( $twig && $twig->root && ($twig->root eq $elt) )
7164
{ $twig->{twig_root}= $new_elt; }
7167
if( my $prev_sibling= $elt->_prev_sibling)
7168
{ $new_elt->set_prev_sibling( $prev_sibling);
7169
$prev_sibling->set_next_sibling( $new_elt);
7172
if( my $next_sibling= $elt->next_sibling)
7173
{ $new_elt->set_next_sibling( $next_sibling);
7174
$next_sibling->set_prev_sibling( $new_elt);
7176
$new_elt->set_first_child( $elt);
7177
$new_elt->set_last_child( $elt);
7179
$elt->set_parent( $new_elt);
7180
$elt->set_prev_sibling( undef);
7181
$elt->set_next_sibling( undef);
7183
# add the attributes if the next argument is a hash ref
7184
if( defined( $_[0]) && (isa( $_[0], 'HASH')) )
7185
{ $new_elt->set_atts( shift @_); }
7194
{ my( $elt, $ref)= @_;
7195
if( my $parent= $ref->_parent)
7196
{ $elt->set_parent( $parent);
7197
$parent->set_first_child( $elt) if( $parent->_first_child == $ref);
7198
$parent->set_last_child( $elt) if( $parent->_last_child == $ref);
7200
if( my $prev_sibling= $ref->_prev_sibling)
7201
{ $elt->set_prev_sibling( $prev_sibling);
7202
$prev_sibling->set_next_sibling( $elt);
7204
if( my $next_sibling= $ref->_next_sibling)
7205
{ $elt->set_next_sibling( $next_sibling);
7206
$next_sibling->set_prev_sibling( $elt);
7209
$ref->set_parent( undef);
7210
$ref->set_prev_sibling( undef);
7211
$ref->set_next_sibling( undef);
7218
$elt->replace( $ref);
7219
foreach my $new_elt (reverse @_)
7220
{ $new_elt->paste( after => $elt); }
7225
#start-extract twig_node
7226
# move an element, same syntax as paste, except the element is first cut
7233
#end-extract twig_node
7236
# adds a prefix to an element, creating a pcdata child if needed
7238
{ my ($elt, $prefix, $option)= @_;
7239
my $asis= ($option && ($option eq 'asis')) ? 1 : 0;
7241
&& (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis}))
7243
{ $elt->set_pcdata( $prefix . $elt->pcdata); }
7244
elsif( $elt->_first_child && $elt->_first_child->is_pcdata
7245
&& ( ($asis && $elt->_first_child->{asis})
7246
|| (!$asis && ! $elt->_first_child->{asis}))
7248
{ $elt->_first_child->set_pcdata( $prefix . $elt->_first_child->pcdata); }
7250
{ my $new_elt= XML::Twig::Elt->new( PCDATA, $prefix);
7251
$new_elt->paste( $elt);
7252
$new_elt->set_asis if( $asis);
7257
# adds a suffix to an element, creating a pcdata child if needed
7259
{ my ($elt, $suffix, $option)= @_;
7260
my $asis= ($option && ($option eq 'asis')) ? 1 : 0;
7262
&& (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis}))
7264
{ $elt->set_pcdata( $elt->pcdata . $suffix); }
7265
elsif( $elt->_last_child && $elt->_last_child->is_pcdata
7266
&& ( ($asis && $elt->_last_child->{asis})
7267
|| (!$asis && ! $elt->_last_child->{asis}))
7269
{ $elt->last_child->set_pcdata( $elt->_last_child->pcdata . $suffix); }
7271
{ my $new_elt= XML::Twig::Elt->new( PCDATA, $suffix);
7272
$new_elt->paste( 'last_child', $elt);
7273
$new_elt->set_asis if( $asis);
7278
#start-extract twig_node
7279
# create a path to an element ('/root/.../gi)
7282
my @context= ( $elt, $elt->ancestors);
7283
return "/" . join( "/", reverse map {$_->gi} @context);
7289
foreach my $ancestor (reverse $elt->ancestors_or_self)
7290
{ my $gi= $ancestor->gi;
7292
my $index= $ancestor->prev_siblings( $gi) + 1;
7293
unless( ($index == 1) && !$ancestor->next_sibling( $gi))
7294
{ $xpath.= "[$index]"; }
7299
# methods used mainly by wrap_children
7301
# return a string with the
7302
# for an element <foo><elt att="val">...</elt><elt2/><elt>...</elt></foo>
7303
# returns '<elt att="val"><elt2><elt>'
7304
sub _stringify_struct
7305
{ my( $elt, %opt)= @_;
7307
my $pretty_print= XML::Twig::Elt::set_pretty_print( 'none');
7308
foreach my $child ($elt->children)
7309
{ $child->add_id; $string .= $child->start_tag ||''; }
7310
XML::Twig::Elt::set_pretty_print( $pretty_print);
7314
# wrap a series of elements in a new one
7318
my $atts= isa( $_[0], 'HASH') ? shift : undef;
7319
my $range= shift; # the string with the tags to wrap
7323
# get the tags to wrap
7325
while( $range=~ m{<\w+\s+[^>]*id=("[^"]*"|'[^']*')[^>]*>}g)
7326
{ push @to_wrap, $t->elt_id( substr( $1, 1, -1)); }
7328
return '' unless @to_wrap;
7330
my $to_wrap= shift @to_wrap;
7332
my $new_elt= $to_wrap->wrap_in( $gi, \%atts);
7333
$_->move( last_child => $new_elt) foreach (@to_wrap);
7338
# wrap children matching a regexp in a new element
7340
{ my( $elt, $regexp, $gi, $atts)= @_;
7344
my $elt_as_string= $elt->_stringify_struct; # stringify the elt structure
7345
$regexp=~ s{(<[^>]*>)}{_match_expr( $1)}eg; # in the regexp, replace gi's by the proper regexp
7346
$elt_as_string=~ s{($regexp)}{$elt->_wrap_range( $gi, $atts, $1)}eg; # then do the actual replace
7353
my( $gi, %atts)= XML::Twig::_parse_start_tag( $tag);
7354
return _match_tag( $gi, %atts);
7359
{ my( $elt, %atts)= @_;
7360
my $string= "<$elt\\b";
7361
foreach my $key (sort keys %atts)
7362
{ my $val= $atts{$key};
7364
# my $val= qq{\Q$atts{$key}\E};
7365
$string.= qq{[^>]*$key=(?:"$val"|'$val')};
7367
$string.= qq{[^>]*>};
7368
return "(?:$string)";
7372
{ my( $elt, $cond, $att)= @_;
7374
my $child= $elt->first_child( $cond) or return undef;
7375
$elt->set_att( $att => $child->text);
7381
{ my( $elt, $att, $tag)= @_;
7383
my $child= $elt->insert_new_elt( first_child => $tag, $elt->att( $att));
7384
$elt->del_att( $att);
7388
# sort children methods
7390
sub sort_children_on_field
7393
my $get_key= sub { return $_[0]->field( $field) };
7394
return $elt->sort_children( $get_key, @_);
7397
sub sort_children_on_att
7400
my $get_key= sub { return $_[0]->att( $att) };
7401
return $elt->sort_children( $get_key, @_);
7404
sub sort_children_on_value
7406
#my $get_key= eval qq{ sub { return \$_[0]->text } };
7407
my $get_key= \&text;
7408
return $elt->sort_children( $get_key, @_);
7413
{ my( $elt, $get_key, %opt)=@_;
7414
$opt{order} ||= 'normal';
7415
$opt{type} ||= 'alpha';
7416
my( $par_a, $par_b)= ($opt{order} eq 'reverse') ? qw( b a) : qw ( a b) ;
7417
my $op= ($opt{type} eq 'numeric') ? '<=>' : 'cmp' ;
7418
my @children= $elt->cut_children;
7419
if( $opt{type} eq 'numeric')
7420
{ @children= map { $_->[1] }
7421
sort { $a->[0] <=> $b->[0] }
7422
map { [ $get_key->( $_), $_] } @children;
7424
elsif( $opt{type} eq 'alpha')
7425
{ @children= map { $_->[1] }
7426
sort { $a->[0] cmp $b->[0] }
7427
map { [ $get_key->( $_), $_] } @children;
7430
{ croak "wrong sort type '$opt{type}', should be either 'alpha' or 'numeric'"; }
7432
@children= reverse @children if( $opt{order} eq 'reverse');
7433
$elt->set_content( @children);
7437
# comparison methods
7441
if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
7446
if( $a->cmp( $b) == 1) { return 1; } else { return 0; }
7451
return 1 if( $a->cmp( $b) == -1);
7457
return 1 unless( $a->cmp( $b) == 1);
7463
return 1 if( $a->cmp( $b) == 1);
7469
return 1 unless( $a->cmp( $b) == -1);
7478
return 0 if( $a == $b);
7479
return 1 if( $a->in($b)); # a starts after b
7480
return -1 if( $b->in($a)); # a starts before b
7482
# ancestors does not include the element itself
7483
my @a_pile= ($a, $a->ancestors);
7484
my @b_pile= ($b, $b->ancestors);
7486
# the 2 elements are not in the same twig
7487
return undef unless( $a_pile[-1] == $b_pile[-1]);
7489
# find the first non common ancestors (they are siblings)
7490
my $a_anc= pop @a_pile;
7491
my $b_anc= pop @b_pile;
7493
while( $a_anc == $b_anc)
7494
{ $a_anc= pop @a_pile;
7495
$b_anc= pop @b_pile;
7498
# from there move left and right and figure out the order
7499
my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
7501
{ $a_prev= $a_prev->_prev_sibling || return( -1);
7502
return 1 if( $a_prev == $b_next);
7503
$a_next= $a_next->_next_sibling || return( 1);
7504
return -1 if( $a_next == $b_prev);
7505
$b_prev= $b_prev->_prev_sibling || return( 1);
7506
return -1 if( $b_prev == $a_next);
7507
$b_next= $b_next->_next_sibling || return( -1);
7508
return 1 if( $b_next == $a_prev);
7512
#end-extract twig_node
7520
XML::Twig - A perl module for processing huge XML documents in tree mode.
7524
Note that this documentation is intended as a reference to the module.
7526
Complete docs, including a tutorial, examples, an easier to use HTML version,
7527
a quick reference card and a FAQ are available at http://www.xmltwig.com/xmltwig
7529
Small documents (loaded in memory as a tree):
7531
my $twig=XML::Twig->new(); # create the twig
7532
$twig->parsefile( 'doc.xml'); # build it
7533
my_process( $twig); # use twig methods to process it
7534
$twig->print; # output the twig
7536
Huge documents (processed in combined stream/tree mode):
7538
# at most one div will be loaded in memory
7539
my $twig=XML::Twig->new(
7541
{ title => sub { $_->set_gi( 'h2') }, # change title tags to h2
7542
para => sub { $_->set_gi( 'p') }, # change para to p
7543
hidden => sub { $_->delete; }, # remove hidden elements
7544
list => \&my_list_process, # process list elements
7545
div => sub { $_[0]->flush; }, # output and free memory
7547
pretty_print => 'indented', # output will be nicely formatted
7548
empty_tags => 'html', # outputs <empty_tag />
7550
$twig->flush; # flush the end of the document
7552
See L<XML::Twig 101|XML::Twig 101> for other ways to use the module, as a
7558
This module provides a way to process XML documents. It is build on top
7561
The module offers a tree interface to the document, while allowing you
7562
to output the parts of it that have been completely processed.
7564
It allows minimal resource (CPU and memory) usage by building the tree
7565
only for the parts of the documents that need actual processing, through the
7566
use of the C<L<twig_roots|twig_roots> > and
7567
C<L<twig_print_outside_roots|twig_print_outside_roots> > options. The
7568
C<L<finish|finish> > and C<L<finish_print|finish_print> > methods also help
7569
to increase performances.
7571
XML::Twig tries to make simple things easy so it tries its best to takes care
7572
of a lot of the (usually) annoying (but sometimes necessary) features that
7573
come with XML and XML::Parser.
7575
=head1 XML::Twig 101
7577
XML::Twig can be used either on "small" XML documents (that fit in memory)
7578
or on huge ones, by processing parts of the document and outputting or
7579
discarding them once they are processed.
7582
=head2 Loading an XML document and processing it
7584
my $t= XML::Twig->new();
7585
$t->parse( '<d><title>title</title><para>p 1</para><para>p 2</para></d>');
7587
$root->set_gi( 'html'); # change doc to html
7588
$title= $root->first_child( 'title'); # get the title
7589
$title->set_gi( 'h1'); # turn it into h1
7590
my @para= $root->children( 'para'); # get the para children
7591
foreach my $para (@para)
7592
{ $para->set_gi( 'p'); } # turn them into p
7593
$t->print; # output the document
7595
Other useful methods include:
7597
L<att|att>: C<< $elt->att( 'foo') >> return the C<foo> attribute for an
7600
L<set_att|set_att> : C<< $elt->set_att( foo => "bar") >> sets the C<foo>
7601
attribute to the C<bar> value,
7603
L<next_sibling|next_sibling>: C<< $elt->next_sibling >> return the next sibling
7604
in the document (in the example C<< $title->next_sibling >> is the first
7605
C<para>, you can also (and actually should) use
7606
C<< $elt->next_sibling( 'para') >> to get it
7608
The document can also be transformed through the use of the L<cut|cut>,
7609
L<copy|copy>, L<paste|paste> and L<move|move> methods:
7610
C<< $title->cut; $title->paste( after => $p); >> for example
7612
And much, much more, see L<Elt|"Elt">.
7614
=head2 Processing an XML document chunk by chunk
7616
One of the strengths of XML::Twig is that it let you work with files that do
7617
not fit in memory (BTW storing an XML document in memory as a tree is quite
7618
memory-expensive, the expansion factor being often around 10).
7620
To do this you can define handlers, that will be called once a specific
7621
element has been completely parsed. In these handlers you can access the
7622
element and process it as you see fit, using the navigation and the
7623
cut-n-paste methods, plus lots of convenient ones like C<L<prefix|prefix> >.
7624
Once the element is completely processed you can then C<L<flush|flush> > it,
7625
which will output it and free the memory. You can also C<L<purge|purge> > it
7626
if you don't need to output it (if you are just extracting some data from
7627
the document for example). The handler will be called again once the next
7628
relevant element has been parsed.
7630
my $t= XML::Twig->new( twig_handlers =>
7631
{ section => \§ion,
7632
para => sub { $_->set_tag( 'p');
7635
$t->parsefile( 'doc.xml');
7636
$t->flush; # don't forget to flush one last time in the end or anything
7637
# after the last </section> tag will not be output
7639
# the handler is called once a section is completely parsed, ie when
7640
# the end tag for section is found, it receives the twig itself and
7641
# the element (including all its sub-elements) as arguments
7643
{ my( $t, $section)= @_; # arguments for all twig_handlers
7644
$section->set_tag( 'div'); # change the tag name.4, my favourite method...
7645
# let's use the attribute nb as a prefix to the title
7646
my $title= $section->first_child( 'title'); # find the title
7647
my $nb= $title->att( 'nb'); # get the attribute
7648
$title->prefix( "$nb - "); # easy isn't it?
7649
$section->flush; # outputs the section and frees memory
7653
There is of course more to it: you can trigger handlers on more elaborate
7654
conditions than just the name of the element, C<section/title> for example.
7656
my $t= XML::Twig->new( twig_handlers =>
7657
{ 'section/title' => sub { $_->print } }
7659
->parsefile( 'doc.xml');
7661
Here C<< sub { $_->print } >> simply prints the current element (C<$_> is aliased
7662
to the element in the handler).
7664
You can also trigger a handler on a test on an attribute:
7666
my $t= XML::Twig->new( twig_handlers =>
7667
{ 'section[@level="1"]' => sub { $_->print } }
7669
->parsefile( 'doc.xml');
7671
You can also use C<L<start_tag_handlers|start_tag_handlers> > to process an
7672
element as soon as the start tag is found. Besides C<L<prefix|prefix> > you
7673
can also use C<L<suffix|suffix> >,
7675
=head2 Processing just parts of an XML document
7677
The twig_roots mode builds only the required sub-trees from the document
7678
Anything outside of the twig roots will just be ignored:
7680
my $t= XML::Twig->new(
7681
# the twig will include just the root and selected titles
7682
twig_roots => { 'section/title' => \&print_n_purge,
7683
'annex/title' => \&print_n_purge
7686
$t->parsefile( 'doc.xml');
7689
{ my( $t, $elt)= @_;
7690
print $elt->text; # print the text (including sub-element texts)
7691
$t->purge; # frees the memory
7694
You can use that mode when you want to process parts of a documents but are
7695
not interested in the rest and you don't want to pay the price, either in
7696
time or memory, to build the tree for the it.
7699
=head2 Building an XML filter
7701
You can combine the C<twig_roots> and the C<twig_print_outside_roots> options to
7702
build filters, which let you modify selected elements and will output the rest
7703
of the document as is.
7705
This would convert prices in $ to prices in Euro in a document:
7707
my $t= XML::Twig->new(
7708
twig_roots => { 'price' => \&convert, }, # process prices
7709
twig_print_outside_roots => 1, # print the rest
7711
$t->parsefile( 'doc.xml');
7714
{ my( $t, $price)= @_;
7715
my $currency= $price->att( 'currency'); # get the currency
7716
if( $currency eq 'USD')
7717
{ $usd_price= $price->text; # get the price
7718
# %rate is just a conversion table
7719
my $euro_price= $usd_price * $rate{usd2euro};
7720
$price->set_text( $euro_price); # set the new price
7721
$price->set_att( currency => 'EUR'); # don't forget this!
7723
$price->print; # output the price
7726
=head2 XML::Twig and various versions of Perl, XML::Parser and expat:
7728
Before being uploaded to CPAN, XML::Twig 3.16 has been tested under the
7729
following environments:
7735
perl 5.6.2 to 5.9.1, expat 1.95.2 to 1.95.7, XML::Parser 2.31, 2.33 and 2.34
7736
perl 5.6.2, XML::Parser 2.27 (which comes with its own version of expat)
7738
=item Mac OS X (10.2/10.3)
7740
Mac OS X: same as linux-x86, plus perl 5.5.4
7744
perl 5.6.1, expat 1.95.2, XML::Parser 2.31
7748
perl 5.6.1 (Activestate build 635), XML::Parser 2.27
7749
perl 5.8.2 (Activestate build 808), XML::Parser 2.34
7751
Note that with Windows 98 and Perl 5.6.1 C<nmake> may freeze while trying to copy
7752
the tools (xml_grep, xml_print and xml_spellcheck), so you have to answer no
7753
when asked if you want to install them.
7757
See L<http://testers.cpan.org/search?request=dist&dist=XML-Twig> for the
7758
CPAN testers reports on XML::Twig
7760
XML::Twig does B<NOT> work with expat 1.95.4
7761
XML::Twig only works with XML::Parser 2.27 in perl 5.6.*
7762
XML::Parser 2.28 does not really work
7764
When in doubt, upgrade expat, XML::Parser and Scalar::Util
7766
=head1 Simplifying XML processing
7772
Whitespaces that look non-significant are discarded, this behaviour can be
7773
controlled using the C<L<keep_spaces|keep_spaces> >,
7774
C<L<keep_spaces_in|keep_spaces_in> > and
7775
C<L<discard_spaces_in|discard_spaces_in> > options.
7779
You can specify that you want the output in the same encoding as the input
7780
(provided you have valid XML, which means you have to specify the encoding
7781
either in the document or when you create the Twig object) using the
7782
C<L<keep_encoding|keep_encoding> > option
7784
You can also use C<L<output_encoding>> to convert the internal UTF-8 format
7785
to the required encoding.
7787
=item Comments and Processing Instructions (PI)
7789
Comments and PI's can be hidden from the processing, but still appear in the
7790
output (they are carried by the "real" element closer to them)
7792
=item Pretty Printing
7794
XML::Twig can output the document pretty printed so it is easier to read for
7797
=item Surviving an untimely death
7799
XML parsers are supposed to react violently when fed improper XML.
7800
XML::Parser just dies.
7802
XML::Twig provides the C<L<safe_parse|safe_parse> > and the
7803
C<L<safe_parsefile|safe_parsefile> > methods which wrap the parse in an eval
7804
and return either the parsed twig or 0 in case of failure.
7806
=item Private attributes
7808
Attributes with a name starting with # (illegal in XML) will not be
7809
output, so you can safely use them to store temporary values during
7810
processing. Note that you can store anything in a private attribute,
7811
not just text, it's just a regular Perl variable, so a reference to
7812
an object or a huge data structure is perfectly fine.
7818
XML::Twig uses a very limited number of classes. The ones you are most likely to use
7819
are C<L<XML::Twig>> of course, which represents a complete XML document, including the
7820
document itself (the root of the document itself is C<L<root>>), its handlers, its
7821
input or output filters... The other main class is C<L<XML::Twig::Elt>>, which models
7822
an XML element. Element here has a very wide definition: it can be a regular element, or
7823
but also text, with an element C<L<tag>> of C<#PCDATA> (or C<#CDATA>), an entity (tag is
7824
C<#ENT>), a Processing Instruction (C<#PI>), a comment (C<#COMMENT>).
7826
Those are the 2 commonly used classes.
7828
You might want to look the C<L<elt_class>> option if you want to subclass C<XML::Twig::Elt>.
7830
Attributes are just attached to their parent element, they are not objects per se. (Please
7831
use the provided methods C<L<att>> and C<L<set_att>> to access them, if you access them
7832
as a hash, then your code becomes implementaion deppndant and might break in the future).
7834
Other classes that are seldom used are C<L<XML::Twig::Entity_list>> and C<L<XML::Twig::Entity>>.
7836
If you use C<L<XML::Twig::XPath>> instead of C<XML::Twig>, elements are then created as
7837
C<L<XML::Twig::XPath::Elt>>
7844
A twig is a subclass of XML::Parser, so all XML::Parser methods can be
7845
called on a twig object, including parse and parsefile.
7846
C<setHandlers> on the other hand cannot be used, see C<L<BUGS|BUGS> >
7853
This is a class method, the constructor for XML::Twig. Options are passed
7854
as keyword value pairs. Recognized options are the same as XML::Parser,
7855
plus some XML::Twig specifics.
7863
This argument replaces the corresponding XML::Parser argument. It consists
7864
of a hash C<{ expression => \&handler}> where expression is a
7865
I<generic_attribute_condition>, I<string_condition>,
7866
an I<attribute_condition>,I<full_path>, a I<partial_path>, a I<gi>,
7867
I<_default_> or I<_all_>.
7869
The idea is to support a usefull but efficient (thus limited) subset of
7870
XPATH. A fuller expression set will be supported in the future, as users
7871
ask for more and as I manage to implement it efficiently. This will never
7872
encompass all of XPATH due to the streaming nature of parsing (no lookahead
7873
after the element end tag).
7875
A B<generic_attribute_condition> is a condition on an attribute, in the form
7876
C<*[@att="val"]> or C<*[@att]>, simple quotes can be used instead of double
7877
quotes and the leading '*' is actually optional. No matter what the gi of the
7878
element is, the handler will be triggered either if the attribute has the
7879
specified value or if it just exists.
7881
A B<string_condition> is a condition on the content of an element, in the form
7882
C<gi[string()="foo"]>, simple quotes can be used instead of double quotes, at
7883
the moment you cannot escape the quotes (this will be added as soon as I
7884
dig out my copy of Mastering Regular Expressions from its storage box).
7885
The text returned is, as per what I (and Matt Sergeant!) understood from
7886
the XPATH spec the concatenation of all the text in the element, excluding
7887
all markup. Thus to call a handler on the elementC<< <p>text <b>bold</b></p> >>
7888
the appropriate condition is C<p[string()="text bold"]>. Note that this is not
7889
exactly conformant to the XPATH spec, it just tries to mimic it while being
7890
still quite concise.
7892
A extension of that notation is C<gi[string(B<child_gi>)="foo"]> where the
7893
handler will be called if a child of a C<gi> element has a text value of
7894
C<foo>. At the moment only direct children of the C<gi> element are checked.
7895
If you need to test on descendants of the element let me know. The fix is
7896
trivial but would slow down the checks, so I'd like to keep it the way it is.
7898
A B<regexp_condition> is a condition on the content of an element, in the form
7899
C<gi[string()=~ /foo/"]>. This is the same as a string condition except that
7900
the text of the element is matched to the regexp. The C<i>, C<m>, C<s> and C<o>
7901
modifiers can be used on the regexp.
7903
The C<< gi[string(B<child_gi>)=~ /foo/"] >> extension is also supported.
7905
An B<attribute_condition> is a simple condition of an attribute of the
7906
current element in the form C<gi[@att="val"]> (simple quotes can be used
7907
instead of double quotes, you can escape quotes either).
7908
If several attribute_condition are true the same element all the handlers
7909
can be called in turn (in the order in which they were first defined).
7910
If the C<="val"> part is ommited ( the condition is then C<gi[@att]>) then
7911
the handler is triggered if the attribute actually exists for the element,
7912
no matter what it's value is.
7914
A B<full_path> looks like C<'/doc/section/chapter/title'>, it starts with
7915
a / then gives all the gi's to the element. The handler will be called if
7916
the path to the current element (in the input document) is exactly as
7917
defined by the C<full_path>.
7919
A B<partial_path> is like a full_path except it does not start with a /:
7920
C<'chapter/title'> for example. The handler will be called if the path to
7921
the element (in the input document) ends as defined in the C<partial_path>.
7923
B<WARNING>: (hopefully temporary) at the moment C<string_condition>,
7924
C<regexp_condition> and C<attribute_condition> are only supported on a
7925
simple gi, not on a path.
7927
A B<gi> (generic identifier) is just a tag name.
7929
#CDATA can be used to call a handler for a CDATA.
7931
A special gi B<_all_> is used to call a function for each element.
7932
The special gi B<_default_> is used to call a handler for each element
7933
that does NOT have a specific handler.
7935
The order of precedence to trigger a handler is:
7936
I<generic_attribute_condition>, I<string_condition>, I<regexp_condition>,
7937
I<attribute_condition>, I<full_path>, longer I<partial_path>, shorter
7938
I<partial_path>, I<gi>, I<_default_> .
7940
B<Important>: once a handler has been triggered if it returns 0 then no other
7941
handler is called, exept a C<_all_> handler which will be called anyway.
7943
If a handler returns a true value and other handlers apply, then the next
7944
applicable handler will be called. Repeat, rince, lather..; The exception
7945
to that rule is when the C<L<do_not_chain_handlers|do_not_chain_handlers>>
7946
option is set, in which case only the first handler will be called.
7948
Note that it might be a good idea to explicitely return a short true value
7949
(like 1) from handlers: this ensures that other applicable handlers are
7950
called even if the last statement for the handler happens to evaluate to
7951
false. This might also speedup the code by avoiding the result of the last
7952
statement of the code to be copied and passed to the code managing handlers.
7953
It can really pay to have 1 instead of a long string returned.
7955
When an element is CLOSED the corresponding handler is called, with 2
7956
arguments: the twig and the C<L</Element|/Element> >. The twig includes the
7957
document tree that has been built so far, the element is the complete sub-tree
7958
for the element. This means that handlers for inner elements are called before
7959
handlers for outer elements.
7961
C<$_> is also set to the element, so it is easy to write inline handlers like
7963
para => sub { $_->change_gi( 'p'); }
7965
Text is stored in elements where gi is #PCDATA (due to mixed content, text
7966
and sub-element in an element there is no way to store the text as just an
7967
attribute of the enclosing element).
7969
B<Warning>: if you have used purge or flush on the twig the element might not
7970
be complete, some of its children might have been entirely flushed or purged,
7971
and the start tag might even have been printed (by C<flush>) already, so changing
7972
its gi might not give the expected result.
7974
More generally, the I<full_path>, I<partial_path> and I<gi> expressions are
7975
evaluated against the input document. Which means that even if you have changed
7976
the gi of an element (changing the gi of a parent element from a handler for
7977
example) the change will not impact the expression evaluation. Attributes in
7978
I<attribute_condition> are different though. As the initial value of attribute
7979
is not stored the handler will be triggered if the B<current> attribute/value
7980
pair is found when the element end tag is found. Although this can be quite
7981
confusing it should not impact most of users, and allow others to play clever
7982
tricks with temporary attributes. Let me know if this is a problem for you.
7987
This argument let's you build the tree only for those elements you are
7990
Example: my $t= XML::Twig->new( twig_roots => { title => 1, subtitle => 1});
7991
$t->parsefile( file);
7992
my $t= XML::Twig->new( twig_roots => { 'section/title' => 1});
7993
$t->parsefile( file);
7996
return a twig containing a document including only C<title> and C<subtitle>
7997
elements, as children of the root element.
7999
You can use I<generic_attribute_condition>, I<attribute_condition>,
8000
I<full_path>, I<partial_path>, I<gi>, I<_default_> and I<_all_> to
8001
trigger the building of the twig.
8002
I<string_condition> and I<regexp_condition> cannot be used as the content
8003
of the element, and the string, have not yet been parsed when the condition
8006
B<WARNING>: path are checked for the document. Even if the C<twig_roots> option
8007
is used they will be checked against the full document tree, not the virtual
8008
tree created by XML::Twig
8011
B<WARNING>: twig_roots elements should NOT be nested, that would hopelessly
8012
confuse XML::Twig ;--(
8014
Note: you can set handlers (twig_handlers) using twig_roots
8015
Example: my $t= XML::Twig->new( twig_roots =>
8016
{ title => sub { $_{1]->print;},
8017
subtitle => \&process_subtitle
8020
$t->parsefile( file);
8023
=item twig_print_outside_roots
8025
To be used in conjunction with the C<twig_roots> argument. When set to a true
8026
value this will print the document outside of the C<twig_roots> elements.
8028
Example: my $t= XML::Twig->new( twig_roots => { title => \&number_title },
8029
twig_print_outside_roots => 1,
8031
$t->parsefile( file);
8034
{ my( $twig, $title);
8036
$title->prefix( "$nb "; }
8042
This example prints the document outside of the title element, calls
8043
C<number_title> for each C<title> element, prints it, and then resumes printing
8044
the document. The twig is built only for the C<title> elements.
8046
If the value is a reference to a file handle then the document outside the
8047
C<twig_roots> elements will be output to this file handle:
8049
open( OUT, ">out_file") or die "cannot open out file out_file:$!";
8050
my $t= XML::Twig->new( twig_roots => { title => \&number_title },
8051
# default output to OUT
8052
twig_print_outside_roots => \*OUT,
8057
{ my( $twig, $title);
8059
$title->prefix( "$nb "; }
8060
$title->print( \*OUT); # you have to print to \*OUT here
8065
=item start_tag_handlers
8067
A hash C<{ expression => \&handler}>. Sets element handlers that are called when
8068
the element is open (at the end of the XML::Parser C<Start> handler). The handlers
8069
are called with 2 params: the twig and the element. The element is empty at
8070
that point, its attributes are created though.
8072
You can use I<generic_attribute_condition>, I<attribute_condition>,
8073
I<full_path>, I<partial_path>, I<gi>, I<_default_> and I<_all_> to trigger
8076
I<string_condition> and I<regexp_condition> cannot be used as the content of
8077
the element, and the string, have not yet been parsed when the condition is
8080
The main uses for those handlers are to change the tag name (you might have to
8081
do it as soon as you find the open tag if you plan to C<flush> the twig at some
8082
point in the element, and to create temporary attributes that will be used
8083
when processing sub-element with C<twig_hanlders>.
8085
You should also use it to change tags if you use C<flush>. If you change the tag
8086
in a regular C<twig_handler> then the start tag might already have been flushed.
8088
B<Note>: C<start_tag> handlers can be called outside of C<twig_roots> if this
8089
argument is used, in this case handlers are called with the following arguments:
8090
C<$t> (the twig), C<$gi> (the gi of the element) and C<%att> (a hash of the
8091
attributes of the element).
8093
If the C<twig_print_outside_roots> argument is also used, if the last handler
8094
called returns a C<true> value, then the the start tag will be output as it
8095
appeared in the original document, if the handler returns a a C<false> value
8096
then the start tag will B<not> be printed (so you can print a modified string
8097
yourself for example).
8099
Note that you can use the L<ignore|ignore> method in C<start_tag_handlers>
8102
=item end_tag_handlers
8104
A hash C<{ expression => \&handler}>. Sets element handlers that are called when
8105
the element is closed (at the end of the XML::Parser C<End> handler). The handlers
8106
are called with 2 params: the twig and the gi of the element.
8108
I<twig_handlers> are called when an element is completely parsed, so why have
8109
this redundant option? There is only one use for C<end_tag_handlers>: when using
8110
the C<twig_roots> option, to trigger a handler for an element B<outside> the roots.
8111
It is for example very useful to number titles in a document using nested
8116
my $t= XML::Twig->new(
8117
start_tag_handlers =>
8118
{ section => sub { $no[$#no]++; $no= join '.', @no; push @no, 0; } },
8120
{ title => sub { $_[1]->prefix( $no); $_[1]->print; } },
8121
end_tag_handlers => { section => sub { pop @no; } },
8122
twig_print_outside_roots => 1
8124
$t->parsefile( $file);
8126
Using the C<end_tag_handlers> argument without C<twig_roots> will result in an
8129
=item do_not_chain_handlers
8131
If this option is set to a true value, then only one handler will be called for
8132
each element, even if several satisfy the condition
8134
Note that the C<_all_> handler will still be called regardeless
8138
This option lets you ignore elements when building the twig. This is useful
8139
in cases where you cannot use C<twig_roots> to ignore elements, for example if
8140
the element to ignore is a sibling of elements you are interested in.
8144
my $twig= XML::Twig->new( ignore_elts => { elt => 1 });
8145
$twig->parsefile( 'doc.xml');
8147
This will build the complete twig for the document, except that all C<elt>
8148
elements (and their children) will be left out.
8153
A reference to a subroutine that will be called every time C<PCDATA> is found.
8157
The name of a class used to store elements. this class should inherit from
8158
C<XML::Twig::Elt> (and by default it is C<XML::Twig::Elt>). This option is used
8159
to subclass the element class and extend it with new methods.
8161
This option is needed because during the parsing of the XML, elements are created
8162
by C<XML::Twig>, without any control from the user code.
8164
=item keep_atts_order
8166
Setting this option to a true value causes the attribute hash to be tied to
8167
a C<Tie::IxHash> object.
8168
This means that C<Tie::IxHash> needs to be installed for this option to be
8169
available. It also means that the hash keeps its order, so you will get
8170
the attributes in order. This allows outputing the attributes in the same
8171
order as they were in the original document.
8175
This is a (slightly?) evil option: if the XML document is not UTF-8 encoded and
8176
you want to keep it that way, then setting keep_encoding will use theC<Expat>
8177
original_string method for character, thus keeping the original encoding, as
8178
well as the original entities in the strings.
8180
See the C<t/test6.t> test file to see what results you can expect from the
8181
various encoding options.
8183
B<WARNING>: if the original encoding is multi-byte then attribute parsing will
8184
be EXTREMELY unsafe under any Perl before 5.6, as it uses regular expressions
8185
which do not deal properly with multi-byte characters. You can specify an
8186
alternate function to parse the start tags with the C<parse_start_tag> option
8189
B<WARNING>: this option is NOT used when parsing with the non-blocking parser
8190
(C<parse_start>, C<parse_more>, parse_done methods) which you probably should
8191
not use with XML::Twig anyway as they are totally untested!
8193
=item output_encoding
8195
This option generates an output_filter using C<Encode>, C<Text::Iconv> or
8196
C<Unicode::Map8> and C<Unicode::Strings>, and sets the encoding in the XML
8197
declaration. This is the easiest way to deal with encodings, if you need
8198
more sophisticated features, look at C<output_filter> below
8203
This option is used to convert the character encoding of the output document.
8204
It is passed either a string corresponding to a predefined filter or
8205
a subroutine reference. The filter will be called every time a document or
8206
element is processed by the "print" functions (C<print>, C<sprint>, C<flush>).
8208
Pre-defined filters:
8214
uses either C<Encode>, C<Text::Iconv> or C<Unicode::Map8> and C<Unicode::String>
8215
or a regexp (which works only with XML::Parser 2.27), in this order, to convert
8216
all characters to ISO-8859-1 (aka latin1)
8220
does the same conversion as C<latin1>, plus encodes entities using
8221
C<HTML::Entities> (oddly enough you will need to have HTML::Entities intalled
8222
for it to be available). This should only be used if the tags and attribute
8223
names themselves are in US-ASCII, or they will be converted and the output will
8224
not be valid XML any more
8228
converts the output to ASCII (US) only plus I<character entities> (C<&#nnn;>)
8229
this should be used only if the tags and attribute names themselves are in
8230
US-ASCII, or they will be converted and the output will not be valid XML any
8235
same as C<safe> except that the character entities are in hexa (C<&#xnnn;>)
8237
=item encode_convert ($encoding)
8239
Return a subref that can be used to convert utf8 strings to C<$encoding>).
8242
my $conv = XML::Twig::encode_convert( 'latin1');
8243
my $t = XML::Twig->new(output_filter => $conv);
8245
=item iconv_convert ($encoding)
8247
this function is used to create a filter subroutine that will be used to
8248
convert the characters to the target encoding using C<Text::Iconv> (which needs
8249
to be installed, look at the documentation for the module and for the
8250
C<iconv> library to find out which encodings are available on your system)
8252
my $conv = XML::Twig::iconv_convert( 'latin1');
8253
my $t = XML::Twig->new(output_filter => $conv);
8255
=item unicode_convert ($encoding)
8257
this function is used to create a filter subroutine that will be used to
8258
convert the characters to the target encoding using C<Unicode::Strings>
8259
and C<Unicode::Map8> (which need to be installed, look at the documentation
8260
for the modules to find out which encodings are available on your system)
8262
my $conv = XML::Twig::unicode_convert( 'latin1');
8263
my $t = XML::Twig->new(output_filter => $conv);
8267
The C<text> and C<att> methods do not use the filter, so their
8268
result are always in unicode.
8270
Those predeclared filters are based on subroutines that can be used
8271
by themselves (as C<XML::Twig::foo>).
8275
=item html_encode ($string)
8277
Use C<HTML::Entities> to encode a utf8 string
8279
=item safe_encode ($string)
8281
Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters
8282
in the string in C<< &#<nnnn>; >> format
8284
=item safe_encode_hex ($string)
8286
Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters
8287
in the string in C<< &#x<nnnn>; >> format
8289
=item regexp2latin1 ($string)
8291
Use a regexp to encode a utf8 string into latin 1 (ISO-8859-1). Does not
8292
work with Perl 5.8.0!
8296
=item output_text_filter
8298
same as output_filter, except it doesn't apply to the brackets and quotes
8299
around attribute values. This is useful for all filters that could change
8300
the tagging, basically anything that does not just change the encoding of
8301
the output. C<html>, C<safe> and C<safe_hex> are better used with this option.
8305
This option is similar to C<output_filter> except the filter is applied to
8306
the characters before they are stored in the twig, at parsing time.
8310
Setting this option to a true value will force the twig to output CDATA
8311
sections as regular (escaped) PCDATA
8313
=item parse_start_tag
8315
If you use the C<keep_encoding> option then this option can be used to replace
8316
the default parsing function. You should provide a coderef (a reference to a
8317
subroutine) as the argument, this subroutine takes the original tag (given
8318
by XML::Parser::Expat C<original_string()> method) and returns a gi and the
8319
attributes in a hash (or in a list attribute_name/attribute value).
8321
=item expand_external_ents
8323
When this option is used external entities (that are defined) are expanded
8324
when the document is output using "print" functions such as C<L<print> >,
8325
C<L<sprint|sprint> >, C<L<flush|flush> > and C<L<xml_string|xml_string> >.
8326
Note that in the twig the entity will be stored as an element whith a
8327
gi 'C<#ENT>', the entity will not be expanded there, so you might want to
8328
process the entities before outputting it.
8332
If this argument is set to a true value, C<parse> or C<parsefile> on the twig
8333
will load the DTD information. This information can then be accessed through
8334
the twig, in a C<DTD_handler> for example. This will load even an external DTD.
8336
Default and fixed values for attributes will also be filled, based on the DTD.
8338
Note that to do this the module will generate a temporary file in the current
8339
directory. If this is a problem let me know and I will add an option to
8340
specify an alternate directory.
8342
See L<DTD Handling|DTD Handling> for more information
8346
Set a handler that will be called once the doctype (and the DTD) have been
8347
loaded, with 2 arguments, the twig and the DTD.
8351
Does not output a prolog (XML declaration and DTD)
8355
This optional argument gives the name of an attribute that can be used as
8356
an ID in the document. Elements whose ID is known can be accessed through
8357
the elt_id method. id defaults to 'id'.
8358
See C<L<BUGS|BUGS> >
8360
=item discard_spaces
8362
If this optional argument is set to a true value then spaces are discarded
8363
when they look non-significant: strings containing only spaces are discarded.
8364
This argument is set to true by default.
8368
If this optional argument is set to a true value then all spaces in the
8369
document are kept, and stored as C<PCDATA>.
8370
C<keep_spaces> and C<discard_spaces> cannot be both set.
8372
=item discard_spaces_in
8374
This argument sets C<keep_spaces> to true but will cause the twig builder to
8375
discard spaces in the elements listed.
8377
The syntax for using this argument is:
8379
XML::Twig->new( discard_spaces_in => [ 'elt1', 'elt2']);
8381
=item keep_spaces_in
8383
This argument sets C<discard_spaces> to true but will cause the twig builder to
8384
keep spaces in the elements listed.
8386
The syntax for using this argument is:
8388
XML::Twig->new( keep_spaces_in => [ 'elt1', 'elt2']);
8392
Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
8393
'C<nice>', 'C<indented>', 'C<indented_c>', 'C<record>' and 'C<record_c>'
8395
pretty_print formats:
8401
The document is output as one ling string, with no line breaks except those
8402
found within text elements
8406
Line breaks are inserted in safe places: that is within tags, between a tag
8407
and an attribute, between attributes and before the > at the end of a tag.
8409
This is quite ugly but better than C<none>, and it is very safe, the document
8410
will still be valid (conforming to its DTD).
8412
This is how the SGML parser C<sgmls> splits documents, hence the name.
8416
This option inserts line breaks before any tag that does not contain text (so
8417
element with textual content are not broken as the \n is the significant).
8419
B<WARNING>: this option leaves the document well-formed but might make it
8420
invalid (not conformant to its DTD). If you have elements declared as
8422
<!ELEMENT foo (#PCDATA|bar)>
8424
then a C<foo> element including a C<bar> one will be printed as
8427
<bar>bar is just pcdata</bar>
8430
This is invalid, as the parser will take the line break after the C<foo> tag
8431
as a sign that the element contains PCDATA, it will then die when it finds the
8432
C<bar> tag. This may or may not be important for you, but be aware of it!
8436
Same as C<nice> (and with the same warning) but indents elements according to
8441
Same as C<indented> but a little more compact: the closing tags are on the
8442
same line as the preceeding text
8446
This is a record-oriented pretty print, that display data in records, one field
8447
per line (which looks a LOT like C<indented>)
8451
Stands for record compact, one record per line
8458
Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>').
8462
Set the way comments are processed: 'C<drop>' (default), 'C<keep>' or
8465
Comments processing options:
8471
drops the comments, they are not read, nor printed to the output
8475
comments are loaded and will appear on the output, they are not
8476
accessible within the twig and will not interfere with processing
8479
B<Note>: comments in the middle of a text element such as
8481
<p>text <!-- comment --> more text --></p>
8483
are kept at their original position in the text. Using ˝"print"
8484
methods like C<print> or C<sprint> will return the comments in the
8485
text. Using C<text> or C<field> on the other hand will not.
8487
Any use of C<set_pcdata> on the C<#PCDATA> element (directly or
8488
through other methods like C<set_content>) will delete the comment(s).
8492
comments are loaded in the twig and will be treated as regular elements
8493
(their C<gi> is C<#COMMENT>) this can interfere with processing if you
8494
expect C<< $elt->first_child >> to be an element but find a comment there.
8495
Validation will not protect you from this as comments can happen anywhere.
8496
You can use C<< $elt->first_child( 'gi') >> (which is a good habit anyway)
8497
to get where you want.
8499
Consider using C<process> if you are outputing SAX events from XML::Twig.
8505
Set the way processing instructions are processed: 'C<drop>', 'C<keep>'
8506
(default) or 'C<process>'
8508
Note that you can also set PI handlers in the C<twig_handlers> option:
8511
'?target' => \&handler 2
8513
The handlers will be called with 2 parameters, the twig and the PI element if
8514
C<pi> is set to C<process>, and with 3, the twig, the target and the data if
8515
C<pi> is set to C<keep>. Of course they will not be called if C<pi> is set to
8518
If C<pi> is set to C<keep> the handler should return a string that will be used
8519
as-is as the PI text (it should look like "C< <?target data?> >" or '' if you
8520
want to remove the PI),
8522
Only one handler will be called, C<?target> or C<?> if no specific handler for
8523
that target is available.
8527
This option is passed a hashref that maps uri's to prefixes. The prefixes in
8528
the document will be replaced by the ones in the map. The mapped prefixes can
8529
(actually have to) be used to trigger handlers, navigate or query the document.
8533
my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"},
8535
{ 'svg:circle' => sub { $_->set_att( r => 20) } },
8536
pretty_print => 'indented',
8538
->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg">
8539
<gr:circle cx="10" cy="90" r="10"/>
8546
<doc xmlns:svg="http://www.w3.org/2000/svg">
8547
<svg:circle cx="10" cy="90" r="20"/>
8550
=item keep_original_prefix
8552
When used with C<L<map_xmlns>> this option will make C<XML::Twig> use the original
8553
namespace prefixes when outputing a document. The mapped prefix will still be used
8554
for triggering handlers and in navigation and query methods.
8556
my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"},
8558
{ 'svg:circle' => sub { $_->set_att( r => 20) } },
8559
keep_original_prefix => 1,
8560
pretty_print => 'indented',
8562
->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg">
8563
<gr:circle cx="10" cy="90" r="10"/>
8570
<doc xmlns:gr="http://www.w3.org/2000/svg">
8571
<gr:circle cx="10" cy="90" r="20"/>
8574
=item index ($arrayref or $hashref)
8576
This option creates lists of specific elements during the parsing of the XML.
8577
It takes a reference to either a list of triggering expressions or to a hash
8578
name => expression, and for each one generates the list of elements that
8579
match the expression. The list can be accessed through the C<L<index>> method.
8583
# using an array ref
8584
my $t= XML::Twig->new( index => [ 'div', 'table' ])
8585
->parsefile( "foo.xml');
8586
my $divs= $t->index( 'div');
8587
my $first_div= $divs->[0];
8588
my $last_table= $t->index( table => -1);
8590
# using a hashref to name the indexes
8591
my $t= XML::Twig->new( index => { email => 'a[@href=~/^\s*mailto:/]')
8592
->parsefile( "foo.xml');
8593
my $last_emails= $t->index( email => -1);
8595
Note that the index is not maintained after the parsing. If elements are
8596
deleted, renamed or otherwise hurt during processing, the index is NOT updated.
8600
B<Note>: I _HATE_ the Java-like name of arguments used by most XML modules.
8601
So in pure TIMTOWTDI fashion all arguments can be written either as
8602
C<UglyJavaLikeName> or as C<readable_perl_name>: C<twig_print_outside_roots>
8603
or C<TwigPrintOutsideRoots> (or even C<twigPrintOutsideRoots> {shudder}).
8604
XML::Twig normalizes them before processing them.
8606
=item parse (SOURCE [, OPT => OPT_VALUE [...]])
8608
This method is inherited from XML::Parser.
8609
The C<SOURCE> parameter should either be a string containing the whole XML
8610
document, or it should be an open C<IO::Handle>. Constructor options to
8611
C<XML::Parser::Expat> given as keyword-value pairs may follow theC<SOURCE>
8612
parameter. These override, for this call, any options or attributes passed
8613
through from the XML::Parser instance.
8615
A die call is thrown if a parse error occurs. Otherwise it will return
8616
the twig built by the parse. Use C<safe_parse> if you want the parsing
8617
to return even when an error occurs.
8621
This is just an alias for C<parse> for backwards compatibility.
8623
=item parsefile (FILE [, OPT => OPT_VALUE [...]])
8625
This method is inherited from XML::Parser.
8627
Open C<FILE> for reading, then call C<parse> with the open handle. The file
8628
is closed no matter how C<parse> returns.
8630
A C<die> call is thrown if a parse error occurs. Otherwise it will return
8631
the twig built by the parse. Use C<safe_parsefile> if you want the parsing
8632
to return even when an error occurs.
8634
=item parseurl ($url $optional_user_agent)
8636
Gets the data from C<$url> and parse it. Note that the data is piped to the
8637
parser in chunks the size of the XML::Parser::Expat buffer, so memory
8638
consumption and hopefully speed are optimal.
8640
If the C<$optional_user_agent> argument is used then it is used, otherwise a
8643
=item safe_parse ( SOURCE [, OPT => OPT_VALUE [...]])
8645
This method is similar to C<parse> except that it wraps the parsing in an
8646
C<eval> block. It returns the twig on success and 0 on failure (the twig object
8647
also contains the parsed twig). C<$@> contains the error message on failure.
8649
Note that the parsing still stops as soon as an error is detected, there is
8650
no way to keep going after an error.
8652
=item safe_parsefile (FILE [, OPT => OPT_VALUE [...]])
8654
This method is similar to C<parsefile> except that it wraps the parsing in an
8655
C<eval> block. It returns the twig on success and 0 on failure (the twig object
8656
also contains the parsed twig) . C<$@> contains the error message on failure
8658
Note that the parsing still stops as soon as an error is detected, there is
8659
no way to keep going after an error.
8661
=item safe_parseurl ($url $optional_user_agent)
8663
Same as C<parseurl> except that it wraps the parsing in an C<eval> block. It
8664
returns the twig on success and 0 on failure (the twig object also contains
8665
the parsed twig) . C<$@> contains the error message on failure
8669
This method returns the C<expat> object (actually the XML::Parser::Expat object)
8670
used during parsing. It is useful for example to call XML::Parser::Expat methods
8671
on it. To get the line of a tag for example use C<< $t->parser->current_line >>.
8673
=item setTwigHandlers ($handlers)
8675
Set the twig_handlers. C<$handlers> is a reference to a hash similar to the
8676
one in the C<twig_handlers> option of new. All previous handlers are unset.
8677
The method returns the reference to the previous handlers.
8679
=item setTwigHandler ($exp $handler)
8681
Set a single twig_handler for elements matching C<$exp>. C<$handler> is a
8682
reference to a subroutine. If the handler was previously set then the reference
8683
to the previous handler is returned.
8685
=item setStartTagHandlers ($handlers)
8687
Set the start_tag handlers. C<$handlers> is a reference to a hash similar to the
8688
one in the C<start_tag_handlers> option of new. All previous handlers are unset.
8689
The method returns the reference to the previous handlers.
8691
=item setStartTagHandler ($exp $handler)
8693
Set a single start_tag handlers for elements matching C<$exp>. C<$handler> is a
8694
reference to a subroutine. If the handler was previously set then the reference
8695
to the previous handler is returned.
8697
=item setEndTagHandlers ($handlers)
8699
Set the end_tag handlers. C<$handlers> is a reference to a hash similar to the
8700
one in the C<end_tag_handlers> option of new. All previous handlers are unset.
8701
The method returns the reference to the previous handlers.
8703
=item setEndTagHandler ($exp $handler)
8705
Set a single end_tag handlers for elements matching C<$exp>. C<$handler> is a
8706
reference to a subroutine. If the handler was previously set then the
8707
reference to the previous handler is returned.
8709
=item setTwigRoots ($handlers)
8711
Same as using the C<L<twig_roots>> option when creating the twig
8713
=item setCharHandler ($exp $handler)
8715
Set a C<char_handler>
8717
=item setIgnoreEltsHandler ($exp)
8719
Set a C<ignore_elt> handler (elements that match C<$exp> will be ignored
8721
=item setIgnoreEltsHandlers ($exp)
8723
Set all C<ignore_elt> handlers (previous handlers are replaced)
8727
Return the dtd (an L<XML::Twig::DTD> object) of a twig
8731
Return the XML declaration for the document, or a default one if it doesn't
8736
Return the doctype for the document
8748
Return the model (in the DTD) for the element C<$tag>
8752
Return the root element of a twig
8754
=item set_root ($elt)
8756
Set the root of a twig
8758
=item first_elt ($optional_condition)
8760
Return the first element matching C<$optional_condition> of a twig, if
8761
no condition is given then the root is returned
8765
Return the element whose C<id> attribute is $id
8769
Same as C<L<elt_id>>
8771
=item index ($index_name, $optional_index)
8773
If the C<$optional_index> argument is present, return the corresponding element
8774
in the index (created using the C<index> option for C<XML::Twig->new>)
8776
If the argument is not present, return an arrayref to the index
8780
This method returns the encoding of the XML document, as defined by the
8781
C<encoding> attribute in the XML declaration (ie it is C<undef> if the attribute
8786
This method sets the value of the C<encoding> attribute in the XML declaration.
8787
Note that if the document did not have a declaration it is generated (with
8788
an XML version of 1.0)
8792
This method returns the XML version, as defined by the C<version> attribute in
8793
the XML declaration (ie it is C<undef> if the attribute is not defined)
8795
=item set_xml_version
8797
This method sets the value of the C<version> attribute in the XML declaration.
8798
If the declaration did not exist it is created.
8802
This method returns the value of the C<standalone> declaration for the document
8804
=item set_standalone
8806
This method sets the value of the C<standalone> attribute in the XML
8807
declaration. Note that if the document did not have a declaration it is
8808
generated (with an XML version of 1.0)
8810
=item set_output_encoding
8812
Set the C<encoding> "attribute" in the XML declaration
8814
=item set_doctype ($name, $system, $public, $internal)
8816
Set the doctype of the element. If an argument is C<undef> (or not present)
8817
then its former value is retained, if a false ('' or 0) value is passed then
8818
the former value is deleted;
8822
Return the entity list of a twig
8826
Return the list of all defined entities
8828
=item entity ($entity_name)
8832
=item change_gi ($old_gi, $new_gi)
8834
Performs a (very fast) global change. All elements C<$old_gi> are now
8837
See C<L<BUGS|BUGS> >
8839
=item flush ($optional_filehandle, $options)
8841
Flushes a twig up to (and including) the current element, then deletes
8842
all unnecessary elements from the tree that's kept in memory.
8843
C<flush> keeps track of which elements need to be open/closed, so if you
8844
flush from handlers you don't have to worry about anything. Just keep
8845
flushing the twig every time you're done with a sub-tree and it will
8846
come out well-formed. After the whole parsing don't forget toC<flush>
8847
one more time to print the end of the document.
8848
The doctype and entity declarations are also printed.
8850
flush take an optional filehandle as an argument.
8852
options: use the C<update_DTD> option if you have updated the (internal) DTD
8853
and/or the entity list and you want the updated DTD to be output
8855
The C<pretty_print> option sets the pretty printing of the document.
8857
Example: $t->flush( Update_DTD => 1);
8858
$t->flush( \*FILE, Update_DTD => 1);
8862
=item flush_up_to ($elt, $optional_filehandle, %options)
8864
Flushes up to the C<$elt> element. This allows you to keep part of the
8865
tree in memory when you C<flush>.
8871
Does the same as a C<flush> except it does not print the twig. It just deletes
8872
all elements that have been completely parsed so far.
8874
=item purge_up_to ($elt)
8876
Purges up to the C<$elt> element. This allows you to keep part of the tree in
8877
memory when you C<purge>.
8879
=item print ($optional_filehandle, %options)
8881
Prints the whole document associated with the twig. To be used only AFTER the
8884
options: see C<flush>.
8888
Return the text of the whole document associated with the twig. To be used only
8891
options: see C<flush>.
8893
=item toSAX1 ($handler)
8895
Send SAX events for the twig to the SAX1 handler C<$handler>
8897
=item toSAX2 ($handler)
8899
Send SAX events for the twig to the SAX2 handler C<$handler>
8901
=item flush_toSAX1 ($handler)
8903
Same as flush, except that SAX events are sent to the SAX1 handler
8904
C<$handler> instead of the twig being printed
8906
=item flush_toSAX2 ($handler)
8908
Same as flush, except that SAX events are sent to the SAX2 handler
8909
C<$handler> instead of the twig being printed
8913
This method can B<only> be called in C<start_tag_handlers>. It causes the
8914
element to be skipped during the parsing: the twig is not built for this
8915
element, it will not be accessible during parsing or after it. The element
8916
will not take up any memory and parsing will be faster.
8918
Note that this method can also be called on an element. If the element is a
8919
parent of the current element then this element will be ignored (the twig will
8920
not be built any more for it and what has already been built will be deleted)
8923
=item set_pretty_print ($style)
8925
Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
8926
'C<nice>', 'C<indented>', 'C<record>' and 'C<record_c>'
8928
B<WARNING:> the pretty print style is a B<GLOBAL> variable, so once set it's
8929
applied to B<ALL> C<print>'s (and C<sprint>'s). Same goes if you use XML::Twig
8930
with C<mod_perl> . This should not be a problem as the XML that's generated
8931
is valid anyway, and XML processors (as well as HTML processors, including
8932
browsers) should not care. Let me know if this is a big problem, but at the
8933
moment the performance/cleanliness trade-off clearly favors the global
8936
=item set_empty_tag_style ($style)
8938
Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). As
8939
with C<L<set_pretty_print>> this sets a global flag.
8941
C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
8942
'C<< <tag /> >>' and C<expand> outputs 'C<< <tag></tag> >>'
8944
=item set_remove_cdata ($flag)
8946
set (or unset) the flag that forces the twig to output CDATA sections as
8947
regular (escaped) PCDATA
8949
=item print_prolog ($optional_filehandle, %options)
8951
Prints the prolog (XML declaration + DTD + entity declarations) of a document.
8953
options: see C<L<flush>>.
8955
=item prolog ($optional_filehandle, %options)
8957
Return the prolog (XML declaration + DTD + entity declarations) of a document.
8959
options: see C<L<flush>>.
8963
Call Expat C<finish> method.
8964
Unsets all handlers (including internal ones that set context), but expat
8965
continues parsing to the end of the document or until it finds an error.
8966
It should finish up a lot faster than with the handlers set.
8970
Stop twig processing, flush the twig and proceed to finish printing the
8971
document as fast as possible. Use this method when modifying a document and
8972
the modification is done.
8974
=item set_expand_external_entities
8976
Same as using the C<L<expand_external_ents>> option when creating the twig
8978
=item set_input_filter
8980
Same as using the C<L<input_filter>> option when creating the twig
8982
=item set_keep_atts_order
8984
Same as using the C<L<keep_atts_order>> option when creating the twig
8986
=item set_keep_encoding
8988
Same as using the C<L<keep_encoding>> option when creating the twig
8990
=item set_output_filter
8992
Same as using the C<L<output_filter>> option when creating the twig
8994
=item set_output_text_filter
8996
Same as using the C<L<output_text_filter>> option when creating the twig
8998
=item Methods inherited from XML::Parser::Expat
9000
A twig inherits all the relevant methods from XML::Parser::Expat. These
9001
methods can only be used during the parsing phase (they will generate
9002
a fatal error otherwise).
9004
Inherited methods are:
9010
Returns the size of the context list.
9014
Returns true if NAME is equal to the name of the innermost cur‐
9015
rently opened element. If namespace processing is being used and
9016
you want to check against a name that may be in a namespace, then
9017
use the generate_ns_name method to create the NAME argument.
9019
=item within_element
9021
Returns the number of times the given name appears in the context
9022
list. If namespace processing is being used and you want to check
9023
against a name that may be in a namespace, then use the gener‐
9024
ate_ns_name method to create the NAME argument.
9028
Returns a list of element names that represent open elements, with
9029
the last one being the innermost. Inside start and end tag han‐
9030
dlers, this will be the tag of the parent element.
9034
Returns the line number of the current position of the parse.
9036
=item current_column
9038
Returns the column number of the current position of the parse.
9042
Returns the current position of the parse.
9044
=item position_in_context
9046
Returns a string that shows the current parse position. LINES
9047
should be an integer >= 0 that represents the number of lines on
9048
either side of the current parse line to place into the returned
9051
=item base ([NEWBASE])
9053
Returns the current value of the base for resolving relative URIs.
9054
If NEWBASE is supplied, changes the base to that value.
9056
=item current_element
9058
Returns the name of the innermost currently opened element. Inside
9059
start or end handlers, returns the parent of the element associated
9064
Returns an integer that is the depth-first visit order of the cur‐
9065
rent element. This will be zero outside of the root element. For
9066
example, this will return 1 when called from the start handler for
9067
the root element start tag.
9069
=item recognized_string
9071
Returns the string from the document that was recognized in order
9072
to call the current handler. For instance, when called from a start
9073
handler, it will give us the the start-tag string. The string is
9074
encoded in UTF-8. This method doesn't return a meaningful string
9075
inside declaration handlers.
9077
=item original_string
9079
Returns the verbatim string from the document that was recognized
9080
in order to call the current handler. The string is in the original
9081
document encoding. This method doesn't return a meaningful string
9082
inside declaration handlers.
9086
Concatenate onto the given message the current line number within
9087
the XML document plus the message implied by ErrorContext. Then
9088
croak with the formed message.
9092
Concatenate onto the given message the current line number within
9093
the XML document plus the message implied by ErrorContext. Then
9094
carp with the formed message.
9096
=item xml_escape(TEXT [, CHAR [, CHAR ...]])
9098
Returns TEXT with markup characters turned into character entities.
9099
Any additional characters provided as arguments are also turned
9100
into character references where found in TEXT.
9102
(this method is broken on some versions of expat/XML::Parser)
9108
Return the element context in a form similar to XPath's short
9109
form: 'C</root/gi1/../gi>'
9111
=item get_xpath ( $optional_array_ref, $xpath, $optional_offset)
9113
Performs a C<get_xpath> on the document root (see <Elt|"Elt">)
9115
If the C<$optional_array_ref> argument is used the array must contain
9116
elements. The C<$xpath> expression is applied to each element in turn
9117
and the result is union of all results. This way a first query can be
9118
refined in further steps.
9121
=item find_nodes ( $optional_array_ref, $xpath, $optional_offset)
9123
same as C<get_xpath>
9125
=item findnodes ( $optional_array_ref, $xpath, $optional_offset)
9127
same as C<get_xpath> (similar to the XML::LibXML method)
9129
=item findvalue ( $optional_array_ref, $xpath, $optional_offset)
9131
Return the C<join> of all texts of the results of appling C<L<get_xpath>>
9132
to the node (similar to the XML::LibXML method)
9134
=item subs_text ($regexp, $replace)
9136
subs_text does text substitution on the whole document, similar to perl's
9141
Useful only if you don't have C<Scalar::Util> or C<WeakRef> installed.
9143
Reclaims properly the memory used by an XML::Twig object. As the object has
9144
circular references it never goes out of scope, so if you want to parse lots
9145
of XML documents then the memory leak becomes a problem. Use
9146
C<< $twig->dispose >> to clear this problem.
9151
=head2 XML::Twig::Elt
9155
=item new ($optional_gi, $optional_atts, @optional_content)
9157
The C<gi> is optional (but then you can't have a content ), the C<$optional_atts>
9158
argument is a refreference to a hash of attributes, the content can be just a
9159
string or a list of strings and element. A content of 'C<#EMPTY>' creates an empty
9162
Examples: my $elt= XML::Twig::Elt->new();
9163
my $elt= XML::Twig::Elt->new( para => { align => 'center' });
9164
my $elt= XML::Twig::Elt->new( para => { align => 'center' }, 'foo');
9165
my $elt= XML::Twig::Elt->new( br => '#EMPTY');
9166
my $elt= XML::Twig::Elt->new( 'para');
9167
my $elt= XML::Twig::Elt->new( para => 'this is a para');
9168
my $elt= XML::Twig::Elt->new( para => $elt3, 'another para');
9170
The strings are not parsed, the element is not attached to any twig.
9172
B<WARNING>: if you rely on ID's then you will have to set the id yourself. At
9173
this point the element does not belong to a twig yet, so the ID attribute
9174
is not known so it won't be strored in the ID list.
9176
Note that C<#COMMENT>, C<#PCDATA> or C<#CDATA> are valid tag names, that will
9177
create text elements.
9179
To create an element C<foo> containing a CDATA section:
9181
my $foo= XML::Twig::Elt->new( '#CDATA' => "content of the CDATA section")
9184
=item parse ($string, %args)
9186
Creates an element from an XML string. The string is actually
9187
parsed as a new twig, then the root of that twig is returned.
9188
The arguments in C<%args> are passed to the twig.
9189
As always if the parse fails the parser will die, so use an
9190
eval if you want to trap syntax errors.
9192
As obviously the element does not exist beforehand this method has to be
9193
called on the class:
9195
my $elt= parse XML::Twig::Elt( "<a> string to parse, with <sub/>
9196
<elements>, actually tons of </elements>
9199
=item print ($optional_filehandle, $optional_pretty_print_style)
9201
Prints an entire element, including the tags, optionally to a
9202
C<$optional_filehandle>, optionally with a C<$pretty_print_style>.
9204
The print outputs XML data so base entities are escaped.
9206
=item sprint ($elt, $optional_no_enclosing_tag)
9208
Return the xml string for an entire element, including the tags.
9209
If the optional second argument is true then only the string inside the
9210
element is returned (the start and end tag for $elt are not).
9211
The text is XML-escaped: base entities (& and < in text, & < and " in
9212
attribute values) are turned into entities.
9216
Return the gi of the element (the gi is the C<generic identifier> the tag
9217
name in SGML parlance).
9219
C<tag> and C<name> are synonyms of C<gi>.
9231
Set the gi (tag) of an element
9233
=item set_tag ($tag)
9235
Set the tag (=C<L<gi|gi>>) of an element
9237
=item set_name ($name)
9239
Set the name (=C<L<gi|gi>>) of an element
9243
Return the root of the twig in which the element is contained.
9247
Return the twig containing the element.
9249
=item parent ($optional_condition)
9251
Return the parent of the element, or the first ancestor matching the
9252
C<$optional_condition>
9254
=item first_child ($optional_condition)
9256
Return the first child of the element, or the first child matching the
9257
C<$optional_condition>
9259
=item has_child ($optional_condition)
9261
Return the first child of the element, or the first child matching the
9262
C<$optional_condition> (same as L<first_child>)
9264
=item has_children ($optional_condition)
9266
Return the first child of the element, or the first child matching the
9267
C<$optional_condition> (same as L<first_child>)
9270
=item first_child_text ($optional_condition)
9272
Return the text of the first child of the element, or the first child
9273
matching the C<$optional_condition>
9274
If there is no first_child then returns ''. This avoids getting the
9275
child, checking for its existence then getting the text for trivial cases.
9277
Similar methods are available for the other navigation methods:
9281
=item last_child_text
9283
=item prev_sibling_text
9285
=item next_sibling_text
9297
All this methods also exist in "trimmed" variant:
9301
=item first_child_trimmed_text
9303
=item last_child_trimmed_text
9305
=item prev_sibling_trimmed_text
9307
=item next_sibling_trimmed_text
9309
=item prev_elt_trimmed_text
9311
=item next_elt_trimmed_text
9313
=item child_trimmed_text
9315
=item parent_trimmed_text
9319
=item field ($optional_condition)
9321
Same method as C<first_child_text> with a different name
9323
=item trimmed_field ($optional_condition)
9325
Same method as C<first_child_trimmed_text> with a different name
9327
=item set_field ($condition, $optional_atts, @list_of_elt_and_strings)
9329
Set the content of the first child of the element that matches
9330
C<$condition>, the rest of the arguments is tha same as for C<L<set_content>>
9332
If no child matches C<$condition> _and_ if C<$condition> is a valid
9333
XML element name, then a new element by that name is created and
9334
inserted as the last child.
9336
=item first_child_matches ($optional_condition)
9338
Return the element if the first child of the element (if it exists) passes
9339
the C<$optional_condition> C<undef> otherwise
9341
if( $elt->first_child_matches( 'title')) ...
9345
if( $elt->first_child && $elt->first_child->passes( 'title'))
9347
C<first_child_is> is an other name for this method
9349
Similar methods are available for the other navigation methods:
9353
=item last_child_matches
9355
=item prev_sibling_matches
9357
=item next_sibling_matches
9359
=item prev_elt_matches
9361
=item next_elt_matches
9365
=item parent_matches
9369
=item is_first_child ($optional_condition)
9371
returns true (the element) if the element is the first child of its parent
9372
(optionaly that satisfies the C<$optional_condition>)
9374
=item is_last_child ($optional_condition)
9376
returns true (the element) if the element is the first child of its parent
9377
(optionaly that satisfies the C<$optional_condition>)
9379
=item prev_sibling ($optional_condition)
9381
Return the previous sibling of the element, or the previous sibling matching
9382
C<$optional_condition>
9384
=item next_sibling ($optional_condition)
9386
Return the next sibling of the element, or the first one matching
9387
C<$optional_condition>.
9389
=item next_elt ($optional_elt, $optional_condition)
9391
Return the next elt (optionally matching C<$optional_condition>) of the element. This
9392
is defined as the next element which opens after the current element opens.
9393
Which usually means the first child of the element.
9394
Counter-intuitive as it might look this allows you to loop through the
9395
whole document by starting from the root.
9397
The C<$optional_elt> is the root of a subtree. When the C<next_elt> is out of the
9398
subtree then the method returns undef. You can then walk a sub tree with:
9400
my $elt= $subtree_root;
9401
while( $elt= $elt->next_elt( $subtree_root)
9402
{ # insert processing code here
9405
=item prev_elt ($optional_condition)
9407
Return the previous elt (optionally matching C<$optional_condition>) of the
9408
element. This is the first element which opens before the current one.
9409
It is usually either the last descendant of the previous sibling or
9412
=item next_n_elt ($offset, $optional_condition)
9414
Return the C<$offset>-th element that matches the C<$optional_condition>
9416
=item children ($optional_condition)
9418
Return the list of children (optionally which matches C<$optional_condition>) of
9419
the element. The list is in document order.
9421
=item children_count ($optional_condition)
9423
Return the number of children of the element (optionally which matches
9424
C<$optional_condition>)
9426
=item children_text ($optional_condition)
9428
Return an array containing the text of children of the element (optionally
9429
which matches C<$optional_condition>)
9431
=item children_copy ($optional_condition)
9433
Return a list of elements that are copies of the children of the element,
9434
optionally which matches C<$optional_condition>
9436
=item descendants ($optional_condition)
9438
Return the list of all descendants (optionally which matches
9439
C<$optional_condition>) of the element. This is the equivalent of the
9440
C<getElementsByTagName> of the DOM (by the way, if you are really a DOM
9441
addict, you can use C<getElementsByTagName> instead)
9443
=item getElementsByTagName ($optional_condition)
9445
Same as C<L<descendants>>
9447
=item find_by_tag_name ($optional_condition)
9449
Same as C<L<descendants>>
9451
=item descendants_or_self ($optional_condition)
9453
Same as C<L<descendants>> except that the element itself is included in the list
9454
if it matches the C<$optional_condition>
9456
=item first_descendant ($optional_condition)
9458
Return the first descendant of the element that matches the condition
9460
=item ancestors ($optional_condition)
9462
Return the list of ancestors (optionally matching C<$optional_condition>) of the
9463
element. The list is ordered from the innermost ancestor to the outtermost one
9465
NOTE: the element itself is not part of the list, in order to include it
9466
you will have to use ancestors_or_self
9468
=item ancestors_or_self ($optional_condition)
9470
Return the list of ancestors (optionally matching C<$optional_condition>) of the
9471
element, including the element (if it matches the condition>).
9472
The list is ordered from the innermost ancestor to the outtermost one
9474
=item passes ($condition)
9476
Return the element if it passes the C<$condition>
9480
Return the value of attribute C<$att> or C<undef>
9482
=item set_att ($att, $att_value)
9484
Set the attribute of the element to the given value
9486
You can actually set several attributes this way:
9488
$elt->set_att( att1 => "val1", att2 => "val2");
9490
=item del_att ($att)
9492
Delete the attribute for the element
9494
You can actually delete several attributes at once:
9496
$elt->del_att( 'att1', 'att2', 'att3');
9500
Cut the element from the tree. The element still exists, it can be copied
9501
or pasted somewhere else, it is just not attached to the tree anymore.
9503
=item cut_children ($optional_condition)
9505
Cut all the children of the element (or all of those which satisfy the
9506
C<$optional_condition>).
9508
Return the list of children
9512
Return a copy of the element. The copy is a "deep" copy: all sub elements of
9513
the element are duplicated.
9515
=item paste ($optional_position, $ref)
9517
Paste a (previously C<cut> or newly generated) element. Die if the element
9518
already belongs to a tree.
9520
Note that the calling element is pasted:
9522
$child->paste( first_child => $existing_parent);
9523
$new_sibling->paste( after => $this_sibling_is_already_in_the_tree);
9527
my $new_elt= XML::Twig::Elt->new( tag => $content);
9528
$new_elt->paste( $position => $existing_elt);
9532
my $t= XML::Twig->new->parse( 'doc.xml')
9533
my $toc= $t->root->new( 'toc');
9534
$toc->paste( $t->root); # $toc is pasted as first child of the root
9535
foreach my $title ($t->findnodes( '/doc/section/title'))
9536
{ my $title_toc= $title->copy;
9537
# paste $title_toc as the last child of toc
9538
$title_toc->paste( last_child => $toc)
9545
=item first_child (default)
9547
The element is pasted as the first child of C<$ref>
9551
The element is pasted as the last child of C<$ref>
9555
The element is pasted before C<$ref>, as its previous sibling.
9559
The element is pasted after C<$ref>, as its next sibling.
9563
In this case an extra argument, C<$offset>, should be supplied. The element
9564
will be pasted in the reference element (or in its first text child) at the
9565
given offset. To achieve this the reference element will be split at the
9570
Note that you can call directly the underlying method:
9578
=item paste_first_child
9580
=item paste_last_child
9586
=item move ($optional_position, $ref)
9588
Move an element in the tree.
9589
This is just a C<cut> then a C<paste>. The syntax is the same as C<paste>.
9591
=item replace ($ref)
9593
Replaces an element in the tree. Sometimes it is just not possible toC<cut>
9594
an element then C<paste> another in its place, so C<replace> comes in handy.
9595
The calling element replaces C<$ref>.
9597
=item replace_with (@elts)
9599
Replaces the calling element with one or more elements
9603
Cut the element and frees the memory.
9605
=item prefix ($text, $optional_option)
9607
Add a prefix to an element. If the element is a C<PCDATA> element the text
9608
is added to the pcdata, if the elements first child is a C<PCDATA> then the
9609
text is added to it's pcdata, otherwise a new C<PCDATA> element is created
9610
and pasted as the first child of the element.
9612
If the option is C<asis> then the prefix is added asis: it is created in
9613
a separate C<PCDATA> element with an C<asis> property. You can then write:
9615
$elt1->prefix( '<b>', 'asis');
9617
to create a C<< <b> >> in the output of C<print>.
9619
=item suffix ($text, $optional_option)
9621
Add a suffix to an element. If the element is a C<PCDATA> element the text
9622
is added to the pcdata, if the elements last child is a C<PCDATA> then the
9623
text is added to it's pcdata, otherwise a new PCDATA element is created
9624
and pasted as the last child of the element.
9626
If the option is C<asis> then the suffix is added asis: it is created in
9627
a separate C<PCDATA> element with an C<asis> property. You can then write:
9629
$elt2->suffix( '</b>', 'asis');
9631
=item simplify (%options)
9633
Return a data structure suspiciously similar to XML::Simple's. Options are
9634
identical to XMLin options, see XML::Simple doc for more details (or use
9635
DATA::dumper or YAML to dump the data structure)
9647
=item normalize_space
9651
=item variables (%var_hash)
9653
%var_hash is a hash { name => value }
9655
This option allows variables in the XML to be expanded when the file is read. (there is no facility for putting the variable names back if you regenerate XML using XMLout).
9657
A 'variable' is any text of the form ${name} (or $name) which occurs in an attribute value or in the text content of an element. If 'name' matches a key in the supplied hashref, ${name} will be replaced with the corresponding value from the hashref. If no matching key is found, the variable will not be replaced.
9659
=item var_att ($attribute_name)
9661
This option gives the name of an attribute that will be used to create
9662
variables in the XML:
9665
<dir name="prefix">/usr/local</dir>
9666
<dir name="exec_prefix">$prefix/bin</dir>
9669
use C<< var => 'name' >> to get $prefix replaced by /usr/local in the
9670
generated data structure
9672
By default variables are captured by the following regexp: /$(\w+)/
9674
=item var_regexp (regexp)
9676
This option changes the regexp used to capture variables. The variable
9677
name should be in $1
9679
=item group_tags ([<tag1>, <tag2>...])
9681
Option used to simplify the structure: elements listed will not be used.
9682
Their children will be, they will be considered children of the element
9687
<config host="laptop.xmltwig.com">
9688
<server>localhost</server>
9690
<dir name="base">/home/mrodrigu/standards</dir>
9691
<dir name="tools">$base/tools</dir>
9694
<template name="std_def">std_def.templ</template>
9695
<template name="dummy">dummy</template>
9699
Then callin simplify with C<< erase => [ 'dirs', 'templates'] >> makes the
9700
data structure be exactly as if the start and end tags for C<dirs> and
9701
C<templates> were not there.
9703
A YAML dump of the structure
9705
base: '/home/mrodrigu/standards'
9706
host: laptop.xmltwig.com
9711
tools: '$base/tools'
9716
=item split_at ($offset)
9718
Split a text (C<PCDATA> or C<CDATA>) element in 2 at C<$offset>, the original
9719
element now holds the first part of the string and a new element holds the
9720
right part. The new element is returned
9722
If the element is not a text element then the first text child of the element
9725
=item split ( $optional_regexp, $optional_tag, $optional_attribute_ref)
9727
Split the text descendants of an element in place, the text is split using
9728
the regexp, if the regexp includes () then the matched separators will be
9729
wrapped in C<$optional_tag>, with C<$optional_attribute_ref> attributes
9731
if $elt is C<< <p>tati tata <b>tutu tati titi</b> tata tati tata</p> >>
9733
$elt->split( qr/(ta)ti/, 'foo', {type => 'toto'} )
9737
<p><foo type="toto">ta</foo> tata <b>tutu <foo type="toto">ta</foo>
9738
titi</b> tata <foo type="toto">ta</foo> tata</p>
9740
The regexp can be passed either as a string or as C<qr//> (perl 5.005 and
9741
later), it defaults to \s+ just as the C<split> built-in (but this would be
9742
quite a useless behaviour without the C<$optional_tag> parameter)
9744
C<$optional_tag> defaults to PCDATA or CDATA, depending on the initial element
9747
The list of descendants is returned (including un-touched original elements
9748
and newly created ones)
9750
=item mark ( $regexp, $optional_tag, $optional_attribute_ref)
9752
This method behaves exactly as L<split|split>, except only the newly created
9753
elements are returned
9755
=item wrap_children ( $regexp_string, $tag, $optional_att, $optional_value)
9757
Wrap the children of the element that match the regexp in an element C<$tag>.
9758
If C<$optional_att> and C<$optional_value> are passed then the new element will
9759
have an attribute C<$optional_att> with a value C<$optional_value>.
9761
Note that elements might get extra C<id> attributes in the process. See L<add_id>.
9762
Use L<strip_att> to remove unwanted id's.
9766
If the element C<$elt> has the following content:
9770
<l_l1_1>list 1 item 1 para 1</l_l1_1>
9771
<l_l1>list 1 item 1 para 2</l_l1>
9772
<l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n>
9773
<l_l1_n>list 1 item 3 para 1</l_l1_n>
9774
<l_l1>list 1 item 3 para 2</l_l1>
9775
<l_l1>list 1 item 3 para 3</l_l1>
9776
<l_l1_1>list 2 item 1 para 1</l_l1_1>
9777
<l_l1>list 2 item 1 para 2</l_l1>
9778
<l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n>
9779
<l_l1_n>list 2 item 3 para 1</l_l1_n>
9780
<l_l1>list 2 item 3 para 2</l_l1>
9781
<l_l1>list 2 item 3 para 3</l_l1>
9786
$elt->wrap_children( q{<l_l1_1><l_l1>*} , li => { type => "ul1" });
9787
$elt->wrap_children( q{<l_l1_n><l_l1>*} , li => { type => "ul" });
9789
$elt->wrap_children( q{<li type="ul1"><li type="ul">+}, "ul");
9790
$elt->strip_att( 'id');
9791
$elt->strip_att( 'type');
9800
<l_l1_1>list 1 item 1 para 1</l_l1_1>
9801
<l_l1>list 1 item 1 para 2</l_l1>
9804
<l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n>
9807
<l_l1_n>list 1 item 3 para 1</l_l1_n>
9808
<l_l1>list 1 item 3 para 2</l_l1>
9809
<l_l1>list 1 item 3 para 3</l_l1>
9814
<l_l1_1>list 2 item 1 para 1</l_l1_1>
9815
<l_l1>list 2 item 1 para 2</l_l1>
9818
<l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n>
9821
<l_l1_n>list 2 item 3 para 1</l_l1_n>
9822
<l_l1>list 2 item 3 para 2</l_l1>
9823
<l_l1>list 2 item 3 para 3</l_l1>
9828
=item subs_text ($regexp, $replace)
9830
subs_text does text substitution, similar to perl's C< s///> operator.
9832
C<$regexp> must be a perl regexp, created with the C<qr> operatot.
9834
C<$replace> can include C<$1, $2>... from the C<$regexp>. It can also be
9835
used to create element and entities, by using
9836
C<< &elt( tag => { att => val }, text) >> (similar syntax as C<L<new>>) and
9839
Here is a rather complex example:
9841
$elt->subs_text( qr{(?<!do not )link to (http://([^\s,]*))},
9842
'see &elt( a =>{ href => $1 }, $2)'
9845
This will replace text like I<link to http://www.xmltwig.com> by
9846
I<< see <a href="www.xmltwig.com">www.xmltwig.com</a> >>, but not
9847
I<do not link to...>
9849
Generating entities (here replacing spaces with ):
9851
$elt->subs_text( qr{ }, '&ent( " ")');
9853
or, using a variable:
9856
$elt->subs_text( qr{ }, "&ent( '$ent')");
9858
Note that the substitution is always global, as in using the C<g> modifier
9859
in a perl substitution, and that it is performed on all text descendants
9864
Add an id to the element.
9866
The id is an attribute (C<id> by default, see the C<id> option for XML::Twig
9867
C<new> to change it. Use an id starting with C<#> to get an id that's not
9868
output by L<print>, L<flush> or L<sprint>) that allows you to use the
9869
L<elt_id> method to get the element easily.
9871
=item set_id_seed ($prefix)
9873
by default the id generated by C<L<add_id>> is C<< twig_id_<nnnn> >>,
9874
C<set_id_seed> changes the prefix to C<$prefix> and resets the number
9877
=item strip_att ($att)
9879
Remove the attribute C<$att> from all descendants of the element (including
9882
=item change_att_name ($old_name, $new_name)
9884
Change the name of the attribute from C<$old_name> to C<$new_name>. If there is no
9885
attribute C<$old_name> nothing happens.
9887
=item sort_children_on_value( %options)
9889
Sort the children of the element in place according to their text.
9890
All children are sorted.
9892
Return the element, with its children sorted.
9897
type : numeric | alpha (default: alpha)
9898
order : normal | reverse (default: normal)
9900
Return the element, with its children sorted
9903
=item sort_children_on_att ($att, %options)
9905
Sort the children of the element in place according to attribute C<$att>.
9906
C<%options> are the same as for L<C<sort_children_on_value>>
9911
=item sort_children_on_field ($gi, %options)
9913
Sort the children of the element in place, according to the field C<$gi> (the
9914
text of the first child of the child with this gi). C<%options> are the same
9915
as for L<C<sort_children_on_value>>.
9917
Return the element, with its children sorted
9920
=item sort_children( $get_key, %options)
9922
Sort the children of the element in place. The C<$get_key> argument is
9923
a reference to a function that returns the sort key when passed an element.
9927
$elt->sort_children( sub { $_[0]->att( "nb") + $_[0]->text },
9928
type => 'numeric', order => 'reverse'
9931
=item field_to_att ($cond, $att)
9933
Turn the text of the first sub-element matched by C<$cond> into the value of
9934
attribute C<$att> of the element. If C<$att> is ommited then C<$cond> is used
9935
as the name of the attribute, which makes sense only if C<$cond> is a valid
9936
element (and attribute) name.
9938
The sub-element is then cut.
9940
=item att_to_field ($att, $gi)
9942
Take the value of attribute C<$att> and create a sub-element C<$gi> as first
9943
child of the element. If C<$gi> is ommited then C<$att> is used as the name of
9947
=item get_xpath ($xpath, $optional_offset)
9949
Return a list of elements satisfying the C<$xpath>. C<$xpath> is an XPATH-like
9952
A subset of the XPATH abbreviated syntax is covered:
9955
gi[1] (or any other positive number)
9957
gi[@att] (the attribute exists for the element)
9960
gi[att1="val1" and att2="val2"]
9961
gi[att1="val1" or att2="val2"]
9962
gi[string()="toto"] (returns gi elements which text (as per the text method)
9964
gi[string()=~/regexp/] (returns gi elements which text (as per the text
9965
method) matches regexp)
9966
expressions can start with / (search starts at the document root)
9967
expressions can start with . (search starts at the current element)
9968
// can be used to get all descendants instead of just direct children
9971
So the following examples from the
9972
F<XPath recommendationL<http://www.w3.org/TR/xpath.html#path-abbrev>> work:
9974
para selects the para element children of the context node
9975
* selects all element children of the context node
9976
para[1] selects the first para child of the context node
9977
para[last()] selects the last para child of the context node
9978
*/para selects all para grandchildren of the context node
9979
/doc/chapter[5]/section[2] selects the second section of the fifth chapter
9981
chapter//para selects the para element descendants of the chapter element
9982
children of the context node
9983
//para selects all the para descendants of the document root and thus selects
9984
all para elements in the same document as the context node
9985
//olist/item selects all the item elements in the same document as the
9986
context node that have an olist parent
9987
.//para selects the para element descendants of the context node
9988
.. selects the parent of the context node
9989
para[@type="warning"] selects all para children of the context node that have
9990
a type attribute with value warning
9991
employee[@secretary and @assistant] selects all the employee children of the
9992
context node that have both a secretary attribute and an assistant
9996
The elements will be returned in the document order.
9998
If C<$optional_offset> is used then only one element will be returned, the one
9999
with the appropriate offset in the list, starting at 0
10001
Quoting and interpolating variables can be a pain when the Perl syntax and the
10002
XPATH syntax collide, so here are some more examples to get you started:
10006
my @res= $t->get_xpath( "p[string( '$p1') or string( '$p2')]");
10009
my @res= $t->get_xpath( "//*[@att=\"$a\"]);
10012
my $exp= "//p[ \@att='$val']"; # you need to use \@ or you will get a warning
10013
my @res= $t->get_xpath( $exp);
10015
Note that the only supported regexps delimiters are / and that you must
10016
backslash all / in regexps AND in regular strings.
10018
XML::Twig does not provide natively full XPATH support, but you can use
10019
XML::Twig does not provide natively full XPATH support, but you can use
10020
C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the
10021
XPath engine, with full coverage of the spec.
10023
C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the
10024
XPath engine, with full coverage of the spec.
10028
same asC<get_xpath>
10032
same asC<get_xpath>
10037
Return a string consisting of all the C<PCDATA> and C<CDATA> in an element,
10038
without any tags. The text is not XML-escaped: base entities such as C<&>
10039
and C<< < >> are not escaped.
10043
Same as C<text> except that the text is trimmed: leading and trailing spaces
10044
are discarded, consecutive spaces are collapsed
10046
=item set_text ($string)
10048
Set the text for the element: if the element is a C<PCDATA>, just set its
10049
text, otherwise cut all the children of the element and create a single
10050
C<PCDATA> child for it, which holds the text.
10052
=item insert ($gi1, [$optional_atts1], $gi2, [$optional_atts2],...)
10054
For each gi in the list inserts an element C<$gi> as the only child of the
10055
element. The element gets the optional attributes inC<< $optional_atts<n>. >>
10056
All children of the element are set as children of the new element.
10057
The upper level element is returned.
10059
$p->insert( table => { border=> 1}, 'tr', 'td')
10061
put C<$p> in a table with a visible border, a single C<tr> and a single C<td>
10062
and return the C<table> element:
10064
<p><table border="1"><tr><td>original content of p</td></tr></table></p>
10066
=item wrap_in (@gi)
10068
Wrap elements C<$gi> as the successive ancestors of the element, returns the
10070
$elt->wrap_in( 'td', 'tr', 'table') wraps the element as a single cell in a
10073
=item insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
10075
Combines a C<L<new|new> > and a C<L<paste|paste> >: creates a new element using
10076
C<$gi>, C<$opt_atts_hashref >and C<@opt_content> which are arguments similar
10077
to those for C<new>, then paste it, using C<$opt_position> or C<'first_child'>,
10078
relative to C<$elt>.
10080
Return the newly created element
10084
Erase the element: the element is deleted and all of its children are
10085
pasted in its place.
10087
=item set_content ( $optional_atts, @list_of_elt_and_strings)
10088
( $optional_atts, '#EMPTY')
10090
Set the content for the element, from a list of strings and
10091
elements. Cuts all the element children, then pastes the list
10092
elements as the children. This method will create a C<PCDATA> element
10093
for any strings in the list.
10095
The C<$optional_atts> argument is the ref of a hash of attributes. If this
10096
argument is used then the previous attributes are deleted, otherwise they
10097
are left untouched.
10099
B<WARNING>: if you rely on ID's then you will have to set the id yourself. At
10100
this point the element does not belong to a twig yet, so the ID attribute
10101
is not known so it won't be strored in the ID list.
10103
A content of 'C<#EMPTY>' creates an empty element;
10105
=item namespace ($optional_prefix)
10107
Return the URI of the namespace that C<$optional_prefix> or the element name
10108
belongs to. If the name doesn't belong to any namespace, C<undef> is returned.
10112
Return the local name (without the prefix) for the element
10116
Return the namespace prefix for the element
10118
=item current_ns_prefixes
10120
Returna list of namespace prefixes valid for the element. The order of the
10121
prefixes in the list has no meaning. If the default namespace is currently
10122
bound, '' appears in the list.
10125
=item inherit_att ($att, @optional_gi_list)
10127
Return the value of an attribute inherited from parent tags. The value
10128
returned is found by looking for the attribute in the element then in turn
10129
in each of its ancestors. If the C<@optional_gi_list> is supplied only those
10130
ancestors whose gi is in the list will be checked.
10132
=item all_children_are ($optional_condition)
10134
return 1 if all children of the element pass the C<$optional_condition>,
10137
=item level ($optional_condition)
10139
Return the depth of the element in the twig (root is 0).
10140
If C<$optional_condition> is given then only ancestors that match the condition are
10143
B<WARNING>: in a tree created using the C<twig_roots> option this will not return
10144
the level in the document tree, level 0 will be the document root, level 1
10145
will be the C<twig_roots> elements. During the parsing (in a C<twig_handler>)
10146
you can use the C<depth> method on the twig object to get the real parsing depth.
10148
=item in ($potential_parent)
10150
Return true if the element is in the potential_parent (C<$potential_parent> is
10153
=item in_context ($cond, $optional_level)
10155
Return true if the element is included in an element which passes C<$cond>
10156
optionally within C<$optional_level> levels. The returned value is the
10161
Return the text of a C<PCDATA> element or C<undef> if the element is not
10164
=item pcdata_xml_string
10166
Return the text of a PCDATA element or undef if the element is not PCDATA.
10167
The text is "XML-escaped" ('&' and '<' are replaced by '&' and '<')
10169
=item set_pcdata ($text)
10171
Set the text of a C<PCDATA> element.
10173
=item append_pcdata ($text)
10175
Add the text at the end of a C<PCDATA> element.
10179
Return 1 if the element is a C<CDATA> element, returns 0 otherwise.
10183
Return 1 if the element is a C<CDATA> or C<PCDATA> element, returns 0 otherwise.
10187
Return the text of a C<CDATA> element or C<undef> if the element is not
10192
Return the XML string of a C<CDATA> element, including the opening and
10195
=item set_cdata ($text)
10197
Set the text of a C<CDATA> element.
10199
=item append_cdata ($text)
10201
Add the text at the end of a C<CDATA> element.
10205
Turns all C<CDATA> sections in the element into regular C<PCDATA> elements. This is useful
10206
when converting XML to HTML, as browsers do not support CDATA sections.
10210
Return the extra_data (comments and PI's) attached to an element
10212
=item set_extra_data ($extra_data)
10214
Set the extra_data (comments and PI's) attached to an element
10216
=item append_extra_data ($extra_data)
10218
Append extra_data to the existing extra_data before the element (if no
10219
previous extra_data exists then it is created)
10223
Set a property of the element that causes it to be output without being XML
10224
escaped by the print functions: if it contains C<< a < b >> it will be output
10225
as such and not as C<< a < b >>. This can be useful to create text elements
10226
that will be output as markup. Note that all C<PCDATA> descendants of the
10227
element are also marked as having the property (they are the ones taht are
10228
actually impacted by the change).
10230
If the element is a C<CDATA> element it will also be output asis, without the
10231
C<CDATA> markers. The same goes for any C<CDATA> descendant of the element
10235
Unsets the C<asis> property for the element and its text descendants.
10239
Return the C<asis> property status of the element ( 1 or C<undef>)
10243
Return true if the element has been closed. Might be usefull if you are
10244
somewhere in the tree, during the parse, and have no idea whether a parent
10245
element is completely loaded or not.
10249
Return the type of the element: 'C<#ELT>' for "real" elements, or 'C<#PCDATA>',
10250
'C<#CDATA>', 'C<#COMMENT>', 'C<#ENT>', 'C<#PI>'
10254
Return the gi if the element is a "real" element, or 0 if it is C<PCDATA>,
10257
=item contains_only_text
10259
Return 1 if the element does not contain any other "real" element
10261
=item contains_only ($exp)
10263
Return the list of children if all children of the element match
10264
the expression C<$exp>
10266
if( $para->contains_only( 'tt')) { ... }
10268
=item contains_a_single ($exp)
10270
If the element contains a single child that matches the expression C<$exp>
10271
returns that element. Otherwise returns 0.
10275
same as C<contains_only_text>
10279
Return 1 if the element is a C<PCDATA> element, returns 0 otherwise.
10283
Return 1 if the element is an entity (an unexpanded entity) element,
10284
return 0 otherwise.
10288
Return 1 if the element is empty, 0 otherwise
10292
Flags the element as empty. No further check is made, so if the element
10293
is actually not empty the output will be messed. The only effect of this
10294
method is that the output will be C<< <gi att="value""/> >>.
10296
=item set_not_empty
10298
Flags the element as not empty. if it is actually empty then the element will
10299
be output as C<< <gi att="value""></gi> >>
10303
Return 1 if the element is a processing instruction (C<#PI>) element,
10304
return 0 otherwise.
10308
Return the target of a processing instruction
10310
=item set_target ($target)
10312
Set the target of a processing instruction
10316
Return the data part of a processing instruction
10318
=item set_data ($data)
10320
Set the data of a processing instruction
10322
=item set_pi ($target, $data)
10324
Set the target and data of a processing instruction
10328
Return the string form of a processing instruction
10329
(C<< <?target data?> >>)
10333
Return 1 if the element is a comment (C<#COMMENT>) element,
10334
return 0 otherwise.
10336
=item set_comment ($comment_text)
10338
Set the text for a comment
10342
Return the content of a comment (just the text, not the C<< <!-- >>
10345
=item comment_string
10347
Return the XML string for a comment (C<< <!-- comment --> >>)
10349
=item set_ent ($entity)
10351
Set an (non-expanded) entity (C<#ENT>). C<$entity>) is the entity
10356
Return the entity for an entity (C<#ENT>) element (C<&ent;>)
10360
Return the entity name for an entity (C<#ENT>) element (C<ent>)
10364
Return the entity, either expanded if the expanded version is available,
10365
or non-expanded (C<&ent;>) otherwise
10367
=item child ($offset, $optional_condition)
10369
Return the C<$offset>-th child of the element, optionally the C<$offset>-th
10370
child that matches C<$optional_condition>. The children are treated as a list, so
10371
C<< $elt->child( 0) >> is the first child, while C<< $elt->child( -1) >> is
10374
=item child_text ($offset, $optional_condition)
10376
Return the text of a child or C<undef> if the sibling does not exist. Arguments
10377
are the same as child.
10379
=item last_child ($optional_condition)
10381
Return the last child of the element, or the last child matching
10382
C<$optional_condition> (ie the last of the element children matching
10385
=item last_child_text ($optional_condition)
10387
Same as C<first_child_text> but for the last child.
10389
=item sibling ($offset, $optional_condition)
10391
Return the next or previous C<$offset>-th sibling of the element, or the
10392
C<$offset>-th one matching C<$optional_condition>. If C<$offset> is negative then a
10393
previous sibling is returned, if $offset is positive then a next sibling is
10394
returned. C<$offset=0> returns the element if there is no condition or
10395
if the element matches the condition>, C<undef> otherwise.
10397
=item sibling_text ($offset, $optional_condition)
10399
Return the text of a sibling or C<undef> if the sibling does not exist.
10400
Arguments are the same as C<sibling>.
10402
=item prev_siblings ($optional_condition)
10404
Return the list of previous siblings (optionaly matching C<$optional_condition>)
10405
for the element. The elements are ordered in document order.
10407
=item next_siblings ($optional_condition)
10409
Return the list of siblings (optionaly matching C<$optional_condition>)
10410
following the element. The elements are ordered in document order.
10412
=item pos ($optional_condition)
10414
Return the position of the element in the children list. The first child has a
10415
position of 1 (as in XPath).
10417
If the C<$optional_condition> is given then only siblings that match the condition
10418
are counted. If the element itself does not match the condition then
10423
Return a hash ref containing the element attributes
10425
=item set_atts ({att1=>$att1_val, att2=> $att2_val... })
10427
Set the element attributes with the hash ref supplied as the argument
10431
Deletes all the element attributes.
10435
Return the number of attributes for the element
10439
Return true if the element has attributes (in fact return the number of
10440
attributes, thus being an alias to C<L<att_nb>>
10444
Return true if the element has no attributes, false (0) otherwise
10448
return a list of the attribute names for the element
10450
=item att_xml_string ($att, $optional_quote)
10452
Return the attribute value, where '&', '<' and $quote (" by default)
10455
if C<$optional_quote> is passed then it is used as the quote.
10459
Set the C<id> attribute of the element to the value.
10460
See C<L<elt_id|elt_id> > to change the id attribute name
10464
Gets the id attribute value
10468
Deletes the C<id> attribute of the element and remove it from the id list
10473
Return the C<class> attribute for the element (methods on the C<class>
10474
attribute are quite convenient when dealing with XHTML, or plain XML that
10475
will eventually be displayed using CSS)
10477
=item set_class ($class)
10479
Set the C<class> attribute for the element to C<$class>
10481
=item add_to_class ($class)
10483
Add C<$class> to the element C<class> attribute: the new class is added
10484
only if it is not already present. Note that classes are sorted alphabetically,
10485
so the C<class> attribute can be changed even if the class is already there
10487
=item att_to_class ($att)
10489
Set the C<class> attribute to the value of attribute C<$att>
10491
=item add_att_to_class ($att)
10493
Add the value of attribute C<$att> to the C<class> attribute of the element
10495
=item move_att_to_class ($att)
10497
Add the value of attribute C<$att> to the C<class> attribute of the element
10498
and delete the attribute
10502
Set the C<class> attribute of the element to the element tag
10504
=item add_tag_to_class
10506
Add the element tag to its C<class> attribute
10508
=item set_tag_class ($new_tag)
10510
Add the element tag to its C<class> attribute and sets the tag to C<$new_tag>
10512
=item in_class ($class)
10514
Return true (C<1>) if the element is in the class C<$class> (if C<$class> is
10515
one of the tokens in the element C<class> attribute)
10519
Frees the element from memory.
10523
Return the string for the start tag for the element, including
10524
the C<< /> >> at the end of an empty element tag
10528
Return the string for the end tag of an element. For an empty
10529
element, this returns the empty string ('').
10533
Equivalent to C<< $elt->sprint( 1) >>, returns the string for the entire
10534
element, excluding the element's tags (but nested element tags are present)
10538
Return the text of the element, encoded (and processed by the current
10539
C<L<output_filter>> or C<L<output_encoding>> options, without any tag.
10541
=item set_pretty_print ($style)
10543
Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
10544
'C<nice>', 'C<indented>', 'C<record>' and 'C<record_c>'
10546
pretty_print styles:
10552
the default, no C<\n> is used
10556
nsgmls style, with C<\n> added within tags
10560
adds C<\n> wherever possible (NOT SAFE, can lead to invalid XML)
10564
same as C<nice> plus indents elements (NOT SAFE, can lead to invalid XML)
10568
table-oriented pretty print, one field per line
10572
table-oriented pretty print, more compact than C<record>, one record per line
10576
=item set_empty_tag_style ($style)
10578
Set the method to output empty tags, amongst 'C<normal>' (default), 'C<html>',
10581
=item set_remove_cdata ($flag)
10583
set (or unset) the flag that forces the twig to output CDATA sections as
10584
regular (escaped) PCDATA
10587
=item set_indent ($string)
10589
Set the indentation for the indented pretty print style (default is 2 spaces)
10591
=item set_quote ($quote)
10593
Set the quotes used for attributes. can be 'C<double>' (default) or 'C<single>'
10597
Compare the order of the 2 elements in a twig.
10599
C<$a> is the <A>..</A> element, C<$b> is the <B>...</B> element
10601
document $a->cmp( $b)
10602
<A> ... </A> ... <B> ... </B> -1
10603
<A> ... <B> ... </B> ... </A> -1
10604
<B> ... </B> ... <A> ... </A> 1
10605
<B> ... <A> ... </A> ... </B> 1
10607
$a and $b not in the same tree undef
10609
=item before ($elt)
10611
Return 1 if C<$elt> starts before the element, 0 otherwise. If the 2 elements
10612
are not in the same twig then return C<undef>.
10614
if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
10618
Return 1 if $elt starts after the element, 0 otherwise. If the 2 elements
10619
are not in the same twig then return C<undef>.
10621
if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
10623
=item other comparison methods
10639
Return the element context in a form similar to XPath's short
10640
form: 'C</root/gi1/../gi>'
10644
Return a unique XPath expression that can be used to find the element
10647
It looks like C</doc/sect[3]/title>: unique elements do not have an index,
10650
=item private methods
10652
Low-level methods on the twig:
10656
=item set_parent ($parent)
10658
=item set_first_child ($first_child)
10660
=item set_last_child ($last_child)
10662
=item set_prev_sibling ($prev_sibling)
10664
=item set_next_sibling ($next_sibling)
10666
=item set_twig_current
10668
=item del_twig_current
10674
This method should NOT be used, always flush the twig, not an element.
10676
=item contains_text
10680
Those methods should not be used, unless of course you find some creative
10681
and interesting, not to mention useful, ways to do it.
10687
Most of the navigation functions accept a condition as an optional argument
10688
The first element (or all elements for C<L<children|children> > or
10689
C<L<ancestors|ancestors> >) that passes the condition is returned.
10691
The condition is a single step of an XPath expression using the XPath subset
10692
defined by C<L<get_xpath>>. Additional conditions are:
10694
The condition can be
10700
return a "real" element (not a PCDATA, CDATA, comment or pi element)
10704
return a PCDATA or CDATA element
10706
=item regular expression
10708
return an element whose gi matches the regexp. The regexp has to be created
10709
with C<qr//> (hence this is available only on perl 5.005 and above)
10711
=item code reference
10713
applies the code, passing the current element as argument, if the code returns
10714
true then the element is returned, if it returns false then the code is applied
10715
to the next candidate.
10719
=head2 XML::Twig::XPath
10721
XML::Twig implements a subset of XPath through the C<L<get_xpath>> method.
10723
If you want to use the whole XPath power, then you can use C<XML::Twig::XPath>
10724
instead. In this case C<XML::Twig> uses C<XML::XPath> to execute XPath queries.
10725
You will of course need C<XML::XPath> installed to be able to use C<XML::Twig::XPath>.
10727
See L<XML::XPath> for more information.
10729
The methods you can use are:
10733
=item findnodes ($path)
10735
return a list of nodes found by C<$path>.
10737
=item findnodes_as_string ($path)
10739
return the nodes found reproduced as XML. The result is not guaranteed
10740
to be valid XML though.
10742
=item findvalue ($path)
10744
return the concatenation of the text content of the result nodes
10748
In order for C<XML::XPath> to be used as the XPath engine the following methods
10749
are included in C<XML::Twig>:
10757
=item getParentNode
10759
=item getChildNodes
10775
=item getNextSibling
10777
=item getPreviousSibling
10779
=item isElementNode
10787
=item isProcessingInstructionNode
10791
=item isCommentNode
10795
=item getChildNodes
10797
=item getElementById
10801
=head2 XML::Twig::XPath::Elt
10803
The methods you can use are the same as on C<XML::Twig::XPath> elements:
10807
=item findnodes ($path)
10809
return a list of nodes found by C<$path>.
10811
=item findnodes_as_string ($path)
10813
return the nodes found reproduced as XML. The result is not guaranteed
10814
to be valid XML though.
10816
=item findvalue ($path)
10818
return the concatenation of the text content of the result nodes
10823
=head2 XML::Twig::Entity_list
10829
Create an entity list.
10833
Add an entity to an entity list.
10835
=item add_new_ent ($name, $val, $sysid, $pubid, $ndata)
10837
Create a new entity and add it to the entity list
10839
=item delete ($ent or $gi).
10841
Delete an entity (defined by its name or by the Entity object)
10844
=item print ($optional_filehandle)
10846
Print the entity list.
10850
Return the list as an array
10855
=head2 XML::Twig::Entity
10859
=item new ($name, $val, $sysid, $pubid, $ndata)
10861
Same arguments as the Entity handler for XML::Parser.
10863
=item print ($optional_filehandle)
10865
Print an entity declaration.
10869
Return the name of the entity
10873
Return the value of the entity
10877
Return the system id for the entity (for NDATA entities)
10881
Return the public id for the entity (for NDATA entities)
10885
Return true if the entity is an NDATA entity
10889
Return the entity declaration text.
10896
Additional examples (and a complete tutorial) can be found on the
10897
F<XML::Twig PageL<http://www.xmltwig.com/xmltwig/>>
10899
To figure out what flush does call the following script with an
10900
XML file and an element name as arguments
10904
my ($file, $elt)= @ARGV;
10905
my $t= XML::Twig->new( twig_handlers =>
10906
{ $elt => sub {$_[0]->flush; print "\n[flushed here]\n";} });
10907
$t->parsefile( $file, ErrorContext => 2);
10914
=head2 Subclassing XML::Twig
10922
In order to subclass C<XML::Twig> you will probably need to subclass also
10923
C<L<XML::Twig::Elt>>. Use the C<elt_class> option when you create the
10924
C<XML::Twig> object to get the elements created in a different class
10925
(which should be a subclass of C<XML::Twig::Elt>.
10929
If you inherit C<XML::Twig> new method but want to add more options to it
10930
you can use this method to prevent XML::Twig to issue warnings for those
10931
additional options.
10935
=head2 DTD Handling
10937
There are 3 possibilities here. They are:
10943
No doctype, no DTD information, no entity information, the world is simple...
10947
The XML document includes an internal DTD, and maybe entity declarations.
10949
If you use the load_DTD option when creating the twig the DTD information and
10950
the entity declarations can be accessed.
10952
The DTD and the entity declarations will be C<flush>'ed (or C<print>'ed) either
10953
as is (if they have not been modified) or as reconstructed (poorly, comments
10954
are lost, order is not kept, due to it's content this DTD should not be viewed
10955
by anyone) if they have been modified. You can also modify them directly by
10956
changing the C<< $twig->{twig_doctype}->{internal} >> field (straight from
10957
XML::Parser, see the C<Doctype> handler doc)
10961
The XML document includes a reference to an external DTD, and maybe entity
10964
If you use the C<load_DTD> when creating the twig the DTD information and the
10965
entity declarations can be accessed. The entity declarations will be
10966
C<flush>'ed (or C<print>'ed) either as is (if they have not been modified) or
10967
as reconstructed (badly, comments are lost, order is not kept).
10969
You can change the doctype through the C<< $twig->set_doctype >> method and
10970
print the dtd through the C<< $twig->dtd_text >> or C<< $twig->dtd_print >>
10973
If you need to modify the entity list this is probably the easiest way to do it.
10980
If you set handlers and use C<flush>, do not forget to flush the twig one
10981
last time AFTER the parsing, or you might be missing the end of the document.
10983
Remember that element handlers are called when the element is CLOSED, so
10984
if you have handlers for nested elements the inner handlers will be called
10985
first. It makes it for example trickier than it would seem to number nested
10994
=item entity handling
10996
Due to XML::Parser behaviour, non-base entities in attribute values disappear:
10997
C<att="val&ent;"> will be turned into C<< att => val >>, unless you use the
10998
C<keep_encoding> argument to C<< XML::Twig->new >>
11002
Basically the DTD handling methods are competely bugged. No one uses them and
11003
it seems very difficult to get them to work in all cases, including with
11004
several slightly incompatible versions of XML::Parser and of libexpat.
11006
So use XML::Twig with standalone documents, or with documents refering to an
11007
external DTD, but don't expect it to properly parse and even output back the
11012
If you use a lot of twigs you might find that you leak quite a lot of memory
11013
(about 2Ks per twig). You can use the C<L<dispose|dispose> > method to free
11014
that memory after you are done.
11016
If you create elements the same thing might happen, use the C<L<delete|delete>>
11017
method to get rid of them.
11019
Alternatively installing the C<Scalar::Util> (or C<WeakRef>) module on a version
11020
of Perl that supports it (>5.6.0) will get rid of the memory leaks automagically.
11024
The ID list is NOT updated when ID's are modified or elements cut or
11029
This method will not function properly if you do:
11031
$twig->change_gi( $old1, $new);
11032
$twig->change_gi( $old2, $new);
11033
$twig->change_gi( $new, $even_newer);
11035
=item sanity check on XML::Parser method calls
11037
XML::Twig should really prevent calls to some XML::Parser methods, especially
11038
the C<setHandlers> method.
11040
=item pretty printing
11042
Pretty printing (at least using the 'C<indented>' style) is hard to get right!
11043
Only elements that belong to the document will be properly indented. Printing
11044
elements that do not belong to the twig makes it impossible for XML::Twig to
11045
figure out their depth, and thus their indentation level.
11047
Also there is an anavoidable bug when using C<flush> and pretty printing for
11048
elements with mixed content that start with an embedded element:
11050
<elt><b>b</b>toto<b>bold</b></elt>
11055
<b>b</b>toto<b>bold</b></elt>
11057
if you flush the twig when you find the C<< <b> >> element
11064
These are the things that can mess up calling code, especially if threaded.
11065
They might also cause problem under mod_perl.
11069
=item Exported constants
11071
Whether you want them or not you get them! These are subroutines to use
11072
as constant when creating or testing elements
11074
PCDATA return '#PCDATA'
11075
CDATA return '#CDATA'
11076
PI return '#PI', I had the choice between PROC and PI :--(
11078
=item Module scoped values: constants
11080
these should cause no trouble:
11082
%base_ent= ( '>' => '>',
11088
CDATA_START = "<![CDATA[";
11092
COMMENT_START = "<!--";
11093
COMMENT_END = "-->";
11095
pretty print styles
11097
( $NSGMLS, $NICE, $INDENTED, $RECORD1, $RECORD2)= (1..5);
11099
empty tag output style
11101
( $HTML, $EXPAND)= (1..2);
11103
=item Module scoped values: might be changed
11105
Most of these deal with pretty printing, so the worst that can
11106
happen is probably that XML output does not look right, but is
11107
still valid and processed identically by XML processors.
11109
C<$empty_tag_style> can mess up HTML bowsers though and changing C<$ID>
11110
would most likely create problems.
11112
$pretty=0; # pretty print style
11113
$quote='"'; # quote for attributes
11114
$INDENT= ' '; # indent for indented pretty print
11115
$empty_tag_style= 0; # how to display empty tags
11116
$ID # attribute used as a gi ('id' by default)
11118
=item Module scoped values: definitely changed
11120
These 2 variables are used to replace gi's by an index, thus
11121
saving some space when creating a twig. If they really cause
11122
you too much trouble, let me know, it is probably possible to
11123
create either a switch or at least a version of XML::Twig that
11124
does not perform this optimisation.
11126
%gi2index; # gi => index
11127
@index2gi; # list of gi's
11131
If you need to manipulate all those values, you can use the following methods on the
11138
Return a hasref with all the global variables used by XML::Twig
11140
The hash has the following fields: C<pretty>, C<quote>, C<indent>,
11141
C<empty_tag_style>, C<keep_encoding>, C<expand_external_entities>,
11142
C<output_filter>, C<output_text_filter>, C<keep_atts_order>
11144
=item set_global_state ($state)
11146
Set the global state, C<$state> is a hashref
11148
=item save_global_state
11150
Save the current global state
11152
=item restore_global_state
11154
Restore the previously saved (using C<Lsave_global_state>> state
11164
Allowing XML::Twig to work on top of any SAX parser
11166
=item multiple twigs are not well supported
11168
A number of twig features are just global at the moment. These include
11169
the ID list and the "gi pool" (if you use C<change_gi> then you change the gi
11172
A future version will try to support this while trying not to be to
11173
hard on performance (at least when a single twig is used!).
11181
Michel Rodriguez <mirod@xmltwig.com>
11185
This library is free software; you can redistribute it and/or modify
11186
it under the same terms as Perl itself.
11188
Bug reports should be sent using:
11189
F<RTL<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-Twig>>
11191
Comments can be sent to mirod@xmltwig.com
11193
The XML::Twig page is at L<http://www.xmltwig.com/xmltwig/>
11194
It includes the development version of the module, a slightly better version
11195
of the documentation, examples, a tutorial and a:
11196
F<Processing XML efficiently with Perl and XML::Twig:
11197
L<http://www.xmltwig.com/xmltwig/tutorial/index.html>>
11201
Complete docs, including a tutorial, examples, an easier to use HTML version,
11202
a quick reference card and a FAQ are available at http://www.xmltwig.com/xmltwig/
11204
XML::Parser,XML::Parser::Expat, Encode, Text::Iconv, Scalar::Utils