~ubuntu-branches/debian/wheezy/libxml-twig-perl/wheezy

Viewing changes to Twig.pm

Committer: Package Import Robot
Author(s): Bart Martens
Date: 2011-10-01 17:41:39 UTC
mfrom: (1.3.4 upstream)
Revision ID: package-import@ubuntu.com-20111001174139-namjm3aptn9m6jou

Tags: 1:3.39-1

* New upstream release.
* Switch to dpkg-source 3.0 (quilt) format.
* No longer using cdbs. Fixes debian-rules-uses-deprecated-makefile.

files added:
debian/docs

debian/libxml-twig-perl.install

debian/source

debian/source/format

debian/xml-twig-tools.docs

debian/xml-twig-tools.install

files modified:
Changes

META.yml

Twig.pm

Twig_pm.slow

debian/changelog

debian/compat

debian/control

debian/copyright

debian/rules

t/test_3_36.t

t/test_errors.t

t/tools.pm

tools/xml_grep/xml_grep

tools/xml_pp/xml_pp

Show diffs side-by-side

added added

removed removed

Twig.pm

# xml name (leading # allowed)

# first line is for perl 5.005, second line for modern perl, that accept character classes

my $REG_NAME = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*)}; # does not work for leading non-ascii letters

$REG_NAME = q{(?:(?:[[:alpha:]:#])(?:[\w.-]*:)?[\w.-]*)}; # > perl 5.5

my $REG_NAME = q{(?:(?:[^\W\d]|[:#_])(?:[\w.-]*:)?[\w.-]*)}; # does not work for leading non-ascii letters

$REG_NAME = q{(?:(?:[[:alpha:]:#_])(?:[\w.-]*:)?[\w.-]*)}; # > perl 5.5

# name or wildcard (* or '') (leading # allowed)

my $REG_NAME_W = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # does not work for leading non-ascii letters

$REG_NAME_W = q{(?:(?:[[:alpha:]:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # > perl 5.5

my $REG_NAME_W = q{(?:(?:[^\W\d]|[:#_])(?:[\w.-]*:)?[\w.-]*|\*)}; # does not work for leading non-ascii letters

$REG_NAME_W = q{(?:(?:[[:alpha:]:#_])(?:[\w.-]*:)?[\w.-]*|\*)}; # > perl 5.5

# name or wildcard (* or '') (leading # allowed) with optional class

my $REG_NAME_WC = q{(?(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*|\*)(?:\.[\w-]+)?|(?:\.[\w-]+))}; # does not work for leading non-ascii letters

$REG_NAME_WC = q{(?:(?:(?:[[:alpha:]:#])(?:[\w.-]*:)?[\w.-]*|\*)(?:\.[\w-]+)?|(?:\.[\w-]+))}; # > perl 5.5

my $REG_NAME_WC = q{(?(?:(?:[^\W\d]|[:#_])(?:[\w.-]*:)?[\w.-]*|\*)(?:\.[\w-]+)?|(?:\.[\w-]+))}; # does not work for leading non-ascii letters

$REG_NAME_WC = q{(?:(?:(?:[[:alpha:]:#_])(?:[\w.-]*:)?[\w.-]*|\*)(?:\.[\w-]+)?|(?:\.[\w-]+))}; # > perl 5.5

my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp

my $REG_REGEXP_EXP = q{(?:(?:[^\\/]|\\.)*)}; # content of a regexp

my $REG_REGEXP_MOD = q{(?:[eimso]*)}; # regexp modifiers

my $REG_MATCH = q{[!=]~}; # match (or not)

my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted)

my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number

BEGIN

{

100

$VERSION = '3.37';

$VERSION = '3.39';

101

102

100

use XML::Parser;

103

101

my $needVersion = '2.23';

783

781

return $@ ? $t->_reset_twig_after_error : $t;

784

782

}

785

783

784

sub parseurl_html

785

{ my $t= shift;

786

_use( 'LWP::Simple') or croak "missing LWP::Simple";

787

$t->parse_html( LWP::Simple::get( shift()), @_);

788

}

789

790

786

791

# uses eval to catch the parser's death

787

792

sub safe_parse_html

788

793

{ my $t= shift;

847

852

if( _is_well_formed_xml( $_[0]))

848

853

{ $t->parse( @_) }

849

854

else

850

{ my $html= _html2xml( $_[0]);

855

{ my $html= $t->{use_tidy} ? _tidy_html( $_[0]) : _html2xml( $_[0]);

851

856

if( _is_well_formed_xml( $html))

852

857

{ $t->parse( $html); }

853

858

else

913

918

wrap => 0,

914

919

break_before_br => 0,

915

920

};

916

921

$options ||= {};

917

922

my $tidy_options= { %$TIDY_DEFAULTS, %$options};

918

923

my $tidy = HTML::Tidy->new( $tidy_options);

919

924

$tidy->ignore( type => 1, type => 2 ); # 1 is TIDY_WARNING, 2 is TIDY_ERROR, not clean

1366

1371

sub _set_regexp_handler

1367

1372

{ my( $handlers, $path, $handler, $prev_handler)= @_;

1368

1373

# if the expression was a regexp it is now a string (it was stringified when it became a hash key)

1369

if( $path=~ m{^$\?([xism]*)(?:-[xism]*)?:(.*)$$})

1374

if( $path=~ m{^$\?([\^xism]*)(?:-[\^xism]*)?:(.*)$$})

1370

1375

{ my $regexp= qr/(?$1:$2)/; # convert it back into a regexp

1371

1376

my $sub= sub { my( $stack)= @_; return( $stack->[-1]->{_tag} =~ $regexp ) };

1372

1377

my $handler_data= { tag=> '*', score => { type => $REGEXP_TRIGGER} , trigger => $sub,

1763

1768

sub _add_or_discard_stored_spaces

1764

1769

{ my $t= shift;

1765

1770

1771

$t->{twig_right_after_root}=0; #XX

1772

1766

1773

my $current= $t->{twig_current} or return; # ugly hack, with ignore on, twig_current can disappear

1767

1774

if( $t->{twig_stored_spaces} || $t->{twig_preserve_space})

1768

1775

{ if( (exists $current->{'pcdata'}))

2077

2084

foreach my $handler ( @handlers)

2078

2085

{ $handler->($t, $elt) || last; }

2079

2086

# call _all_ handler if needed

2080

if( my $all= $t->{twig_handlers}->{handlers}->{$ALL})

2087

my $all= $t->{twig_handlers}->{handlers}->{$ALL};

2088

if( $all)

2081

2089

{ $all->($t, $elt); }

2090

if( @handlers || $all) { $t->{twig_right_after_root}=0; }

2082

2091

}

2083

2092

}

2084

2093

2087

2096

{ if( $t->{twig_default_print})

2088

2097

{ # select the proper fh (and store the currently selected one)

2089

2098

$t->_set_fh_to_twig_output_fh();

2099

if( !$p->depth==1) { $t->{twig_right_after_root}=1; } #XX

2090

2100

if( $t->{twig_keep_encoding})

2091

2101

{ $p->setHandlers( %twig_handlers_roots_print_original); }

2092

2102

else

2207

2217

$elt->{pcdata}.= $string;

2208

2218

}

2209

2219

else

2210

{ # text is just space, which might be discarded later

2220

{

2221

# text is just space, which might be discarded later

2211

2222

if( $string=~/\A\s*\Z/s)

2212

2223

{

2213

2224

if( $t->{extra_data})

3652

3663

$p->{twig}->{expat_1_95_2_seen_bracket}=1;

3653

3664

}

3654

3665

else

3655

{ print $p->recognized_string(); }

3666

{ if( $p->{twig}->{twig_right_after_root})

3667

{ my $s= $p->recognized_string(); print $s if $s=~ m{\S}; }

3668

else

3669

{ print $p->recognized_string(); }

3670

}

3656

3671

return;

3657

3672

}

3658

3673

# recognized_string does not seem to work for entities, go figure!

5522

5537

}

5523

5538

5524

5539

# methods dealing with the class attribute, convenient if you work with xhtml

5525

sub class

5540

sub class { $_[0]->{att}->{class}; }

5541

# lvalue version of class. separate from class to avoid problem like RT#

5542

sub lclass

5526

5543

:lvalue # > perl 5.5

5527

{ my( $elt)= @_; $elt->{'att'}->{'class'}; }

5544

{ $_[0]->{att}->{class}; }

5545

5528

5546

sub set_class { my( $elt, $class)= @_; $elt->set_att( class => $class); }

5529

5547

5530

5548

# adds a class to an element

5611

5629

return $elt;

5612

5630

}

5613

5631

5614

sub att

5632

sub att { $_[0]->{att}->{$_[1]}; }

5633

# lvalue version of att. separate from class to avoid problem like RT#

5634

sub latt

5615

5635

:lvalue # > perl 5.5

5616

{ $_[0]->{att}->{$_[1]}; }

5636

{ $_[0]->{att}->{$_[1]}; }

5637

5617

5638

sub del_att

5618

5639

{ my $elt= shift;

5619

5640

while( @_) { delete $elt->{'att'}->{shift()}; }

7327

7348

my $replacement_string;

7328

7349

my $is_string= _is_string( $replace);

7329

7350

foreach my $text_elt ($elt->descendants_or_self( $TEXT))

7330

{ if( $is_string)

7351

{

7352

if( $is_string)

7331

7353

{ my $text= $text_elt->text;

7332

7354

$text=~ s{$regexp}{ _replace_var( $replace, $1, $2, $3, $4, $5, $6, $7, $8, $9)}egx;

7333

7355

$text_elt->set_text( $text);

7334

7356

}

7335

7357

else

7336

{

7358

{

7337

7359

no utf8; # = perl 5.6

7338

7360

my $replace_sub= ( $replace_sub{$replace} ||= _install_replace_sub( $replace));

7339

7361

my $text= $text_elt->text;

7340

7362

my $pos=0; # used to skip text that was previously matched

7363

my $found_hit;

7341

7364

while( my( $pre_match_string, $match_string, @var)= ($text=~ m{(.*?)($regexp)}sg))

7342

{ my $match_start = length( $pre_match_string);

7343

my $match = $text_elt->split_at( $match_start + $pos);

7365

{ $found_hit=1;

7366

my $match_start = length( $pre_match_string);

7367

my $match = $match_start ? $text_elt->split_at( $match_start + $pos) : $text_elt;

7344

7368

my $match_length = length( $match_string);

7345

my $post_match = $match->split_at( $match_length);

7369

my $post_match = $match->split_at( $match_length);

7346

7370

$replace_sub->( $match, @var);

7347

7371

# merge previous text with current one

7348

7372

my $next_sibling;

7357

7381

# go to next

7358

7382

$text_elt= $post_match;

7359

7383

$text= $post_match->text;

7360

# merge last text element with next one if needed,

7361

# the match will be against the non-matched text,

7362

# so $pos is used to skip the merged part

7363

my $prev_sibling;

7364

if( ($prev_sibling= $post_match->{prev_sibling})

7365

&& ($XML::Twig::index2gi[$post_match->{'gi'}] eq $XML::Twig::index2gi[$prev_sibling->{'gi'}])

7366

)

7367

{ $pos= length( $prev_sibling->text);

7368

$prev_sibling->merge_text( $post_match);

7369

}

7370

7384

7371

7385

# if the match is at the end of the text an empty #PCDATA is left: remove it

7372

7386

if( !$text_elt->text) { $text_elt->delete; }

7373

7387

7374

7388

}

7389

if( $found_hit) { $text_elt->normalize; } # in case consecutive #PCDATA have been created

7375

7390

7376

7391

}

7377

7392

}

7392

7407

sub _install_replace_sub

7393

7408

{ my $replace_exp= shift;

7394

7409

my @item= split m{(&e[ln]t\s*$[^)]*$)}, $replace_exp;

7395

my $sub= q{ my( $match, @var)= @_; unshift @var, undef; my $new; };

7410

my $sub= q{ my( $match, @var)= @_; my $new; my $last_inserted=$match;};

7396

7411

my( $gi, $exp);

7397

7412

foreach my $item (@item)

7398

{ if( $item=~ m{^&elt\s*$([^)]*)$})

7399

{ $exp= $1;

7400

}

7413

{ next if ! length $item;

7414

if( $item=~ m{^&elt\s*$([^)]*)$})

7415

{ $exp= $1; }

7401

7416

elsif( $item=~ m{^&ent\s*$\s*([^\s)]*)\s*$})

7402

7417

{ $exp= " '#ENT' => $1"; }

7403

7418

else

7404

7419

{ $exp= qq{ '#PCDATA' => "$item"}; }

7405

$exp=~ s{\$(\d)}{\$var[$1]}g; # replace references to matches

7420

$exp=~ s{\$(\d)}{my $i= $1-1; "\$var[$i]"}eg; # replace references to matches

7406

7421

$sub.= qq{ \$new= \$match->new( $exp); };

7407

$sub .= q{ $new->paste( before => $match); };

7422

$sub .= q{ $new->paste( after => $last_inserted); $last_inserted=$new;};

7408

7423

}

7409

7424

$sub .= q{ $match->delete; };

7410

#$sub=~ s/;/;\n/g;

7425

#$sub=~ s/;/;\n/g; warn "subs: $sub";

7411

7426

my $coderef= eval "sub { $NO_WARNINGS; $sub }";

7412

7427

if( $@) { croak( "invalid replacement expression $replace_exp: ",$@); }

7413

7428

return $coderef;

8440

8455

8441

8456

} # end of block containing package globals ($pretty_print, $quotes, keep_encoding...)

8442

8457

8458

# merges consecutive #PCDATAs in am element

8443

8459

sub normalize

8444

8460

{ my( $elt)= @_;

8445

8461

my @descendants= $elt->descendants( $PCDATA);

10486

10502

get get an extra CDATA section inside (  becomes

10487

10503

10488

10504

10489

=item parsefile_html

10505

=item parsefile_html ($file)

10490

10506

10491

10507

parse an HTML file (by converting it to XML using HTML::TreeBuilder, which

10492

needs to be available). The file is loaded completely in memory and converted

10493

to XML before being parsed.

10494

10495

B<Alpha>: implementation, and thus generated XML could change.

10508

needs to be available, or HTML::Tidy if the C<use_tidy> option was used).

10509

The file is loaded completely in memory and converted to XML before being parsed.

10510

10511

=item parseurl_html ($url $optional_user_agent)

10512

10513

parse an URL as html the same way C<L<parse_html>> does

10496

10514

10497

10515

=item safe_parseurl_html ($url $optional_user_agent)

10498

10516

11546

11564

11547

11565

Return the value of attribute C<$att> or C<undef>

11548

11566

11549

this method is an lvalue, so you can do C<< $elt->{'att'}->{'foo'}= 'bar' >>

11567

=item latt ($att)

11568

11569

Return the value of attribute C<$att> or C<undef>

11570

11571

this method is an lvalue, so you can do C<< $elt->latt( 'foo')= 'bar' >> or C<< $elt->latt( 'foo')++; >>

11550

11572

11551

11573

=item set_att ($att, $att_value)

11552

11574

12665

12687

attribute are quite convenient when dealing with XHTML, or plain XML that

12666

12688

will eventually be displayed using CSS)

12667

12689

12668

this method is an lvalue, so you can do C<< $elt->class= "foo" >>

12690

=item lclass

12691

12692

same as class, except that

12693

this method is an lvalue, so you can do C<< $elt->lclass= "foo" >>

12669

12694

12670

12695

=item set_class ($class)

12671

12696

13420

13445

13421

13446

=head1 AUTHOR

13422

13447

13423

Michel Rodriguez <mirod@xmltwig.com>

13448

Michel Rodriguez <mirod@cpan.org>

13424

13449

13425

13450

=head1 LICENSE

13426

13451

13430

13455

Bug reports should be sent using:

13431

13456

F<RT L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-Twig>>

13432

13457

13433

Comments can be sent to mirod@xmltwig.com

13458

Comments can be sent to mirod@cpan.org

13434

13459

13435

13460

The XML::Twig page is at L<http://www.xmltwig.com/xmltwig/>

13436

13461

It includes the development version of the module, a slightly better version

Older »