1229 lines
35 KiB
Perl
1229 lines
35 KiB
Perl
=head1 NAME
|
|
|
|
XML::TreePP -- Pure Perl implementation for parsing/writing xml files
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
parse xml file into hash tree
|
|
|
|
use XML::TreePP;
|
|
my $tpp = XML::TreePP->new();
|
|
my $tree = $tpp->parsefile( "index.rdf" );
|
|
print "Title: ", $tree->{"rdf:RDF"}->{item}->[0]->{title}, "\n";
|
|
print "URL: ", $tree->{"rdf:RDF"}->{item}->[0]->{link}, "\n";
|
|
|
|
write xml as string from hash tree
|
|
|
|
use XML::TreePP;
|
|
my $tpp = XML::TreePP->new();
|
|
my $tree = { rss => { channel => { item => [ {
|
|
title => "The Perl Directory",
|
|
link => "http://www.perl.org/",
|
|
}, {
|
|
title => "The Comprehensive Perl Archive Network",
|
|
link => "http://cpan.perl.org/",
|
|
} ] } } };
|
|
my $xml = $tpp->write( $tree );
|
|
print $xml;
|
|
|
|
get remote xml file with HTTP-GET and parse it into hash tree
|
|
|
|
use XML::TreePP;
|
|
my $tpp = XML::TreePP->new();
|
|
my $tree = $tpp->parsehttp( GET => "http://use.perl.org/index.rss" );
|
|
print "Title: ", $tree->{"rdf:RDF"}->{channel}->{title}, "\n";
|
|
print "URL: ", $tree->{"rdf:RDF"}->{channel}->{link}, "\n";
|
|
|
|
get remote xml file with HTTP-POST and parse it into hash tree
|
|
|
|
use XML::TreePP;
|
|
my $tpp = XML::TreePP->new( force_array => [qw( item )] );
|
|
my $cgiurl = "http://search.hatena.ne.jp/keyword";
|
|
my $keyword = "ajax";
|
|
my $cgiquery = "mode=rss2&word=".$keyword;
|
|
my $tree = $tpp->parsehttp( POST => $cgiurl, $cgiquery );
|
|
print "Link: ", $tree->{rss}->{channel}->{item}->[0]->{link}, "\n";
|
|
print "Desc: ", $tree->{rss}->{channel}->{item}->[0]->{description}, "\n";
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
XML::TreePP module parses XML file and expands it for a hash tree.
|
|
And also generate XML file from a hash tree.
|
|
This is a pure Perl implementation.
|
|
You can also download XML from remote web server
|
|
like XMLHttpRequest object at JavaScript language.
|
|
|
|
=head1 EXAMPLES
|
|
|
|
=head2 Parse XML file
|
|
|
|
Sample XML source:
|
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
<family name="Kawasaki">
|
|
<father>Yasuhisa</father>
|
|
<mother>Chizuko</mother>
|
|
<children>
|
|
<girl>Shiori</girl>
|
|
<boy>Yusuke</boy>
|
|
<boy>Kairi</boy>
|
|
</children>
|
|
</family>
|
|
|
|
Sample program to read a xml file and dump it:
|
|
|
|
use XML::TreePP;
|
|
use Data::Dumper;
|
|
my $tpp = XML::TreePP->new();
|
|
my $tree = $tpp->parsefile( "family.xml" );
|
|
my $text = Dumper( $tree );
|
|
print $text;
|
|
|
|
Result dumped:
|
|
|
|
$VAR1 = {
|
|
'family' => {
|
|
'-name' => 'Kawasaki',
|
|
'father' => 'Yasuhisa',
|
|
'mother' => 'Chizuko',
|
|
'children' => {
|
|
'girl' => 'Shiori'
|
|
'boy' => [
|
|
'Yusuke',
|
|
'Kairi'
|
|
],
|
|
}
|
|
}
|
|
};
|
|
|
|
Details:
|
|
|
|
print $tree->{family}->{father}; # the father's given name.
|
|
|
|
The prefix '-' is added on every attribute's name.
|
|
|
|
print $tree->{family}->{"-name"}; # the family name of the family
|
|
|
|
The array is used because the family has two boys.
|
|
|
|
print $tree->{family}->{children}->{boy}->[1]; # The second boy's name
|
|
print $tree->{family}->{children}->{girl}; # The girl's name
|
|
|
|
=head2 Text node and attributes:
|
|
|
|
If a element has both of a text node and attributes
|
|
or both of a text node and other child nodes,
|
|
value of a text node is moved to C<#text> like child nodes.
|
|
|
|
use XML::TreePP;
|
|
use Data::Dumper;
|
|
my $tpp = XML::TreePP->new();
|
|
my $source = '<span class="author">Kawasaki Yusuke</span>';
|
|
my $tree = $tpp->parse( $source );
|
|
my $text = Dumper( $tree );
|
|
print $text;
|
|
|
|
The result dumped is following:
|
|
|
|
$VAR1 = {
|
|
'span' => {
|
|
'-class' => 'author',
|
|
'#text' => 'Kawasaki Yusuke'
|
|
}
|
|
};
|
|
|
|
The special node name of C<#text> is used because this elements
|
|
has attribute(s) in addition to the text node.
|
|
See also L</text_node_key> option.
|
|
|
|
=head1 METHODS
|
|
|
|
=head2 new
|
|
|
|
This constructor method returns a new XML::TreePP object with C<%options>.
|
|
|
|
$tpp = XML::TreePP->new( %options );
|
|
|
|
=head2 set
|
|
|
|
This method sets a option value for C<option_name>.
|
|
If C<$option_value> is not defined, its option is deleted.
|
|
|
|
$tpp->set( option_name => $option_value );
|
|
|
|
See OPTIONS section below for details.
|
|
|
|
=head2 get
|
|
|
|
This method returns a current option value for C<option_name>.
|
|
|
|
$tpp->get( 'option_name' );
|
|
|
|
=head2 parse
|
|
|
|
This method reads XML source and returns a hash tree converted.
|
|
The first argument is a scalar or a reference to a scalar.
|
|
|
|
$tree = $tpp->parse( $source );
|
|
|
|
=head2 parsefile
|
|
|
|
This method reads a XML file and returns a hash tree converted.
|
|
The first argument is a filename.
|
|
|
|
$tree = $tpp->parsefile( $file );
|
|
|
|
=head2 parsehttp
|
|
|
|
This method receives a XML file from a remote server via HTTP and
|
|
returns a hash tree converted.
|
|
|
|
$tree = $tpp->parsehttp( $method, $url, $body, $head );
|
|
|
|
C<$method> is a method of HTTP connection: GET/POST/PUT/DELETE
|
|
C<$url> is an URI of a XML file.
|
|
C<$body> is a request body when you use POST method.
|
|
C<$head> is a request headers as a hash ref.
|
|
L<LWP::UserAgent> module or L<HTTP::Lite> module is required to fetch a file.
|
|
|
|
( $tree, $xml, $code ) = $tpp->parsehttp( $method, $url, $body, $head );
|
|
|
|
In array context, This method returns also raw XML source received
|
|
and HTTP response's status code.
|
|
|
|
=head2 write
|
|
|
|
This method parses a hash tree and returns a XML source generated.
|
|
|
|
$source = $tpp->write( $tree, $encode );
|
|
|
|
C<$tree> is a reference to a hash tree.
|
|
|
|
=head2 writefile
|
|
|
|
This method parses a hash tree and writes a XML source into a file.
|
|
|
|
$tpp->writefile( $file, $tree, $encode );
|
|
|
|
C<$file> is a filename to create.
|
|
C<$tree> is a reference to a hash tree.
|
|
|
|
=head1 OPTIONS FOR PARSING XML
|
|
|
|
This module accepts option parameters following:
|
|
|
|
=head2 force_array
|
|
|
|
This option allows you to specify a list of element names which
|
|
should always be forced into an array representation.
|
|
|
|
$tpp->set( force_array => [ 'rdf:li', 'item', '-xmlns' ] );
|
|
|
|
The default value is null, it means that context of the elements
|
|
will determine to make array or to keep it scalar or hash.
|
|
Note that the special wildcard name C<'*'> means all elements.
|
|
|
|
=head2 force_hash
|
|
|
|
This option allows you to specify a list of element names which
|
|
should always be forced into an hash representation.
|
|
|
|
$tpp->set( force_hash => [ 'item', 'image' ] );
|
|
|
|
The default value is null, it means that context of the elements
|
|
will determine to make hash or to keep it scalar as a text node.
|
|
See also L</text_node_key> option below.
|
|
Note that the special wildcard name C<'*'> means all elements.
|
|
|
|
=head2 cdata_scalar_ref
|
|
|
|
This option allows you to convert a cdata section into a reference
|
|
for scalar on parsing XML source.
|
|
|
|
$tpp->set( cdata_scalar_ref => 1 );
|
|
|
|
The default value is false, it means that each cdata section is converted into a scalar.
|
|
|
|
=head2 user_agent
|
|
|
|
This option allows you to specify a HTTP_USER_AGENT string which
|
|
is used by parsehttp() method.
|
|
|
|
$tpp->set( user_agent => 'Mozilla/4.0 (compatible; ...)' );
|
|
|
|
The default string is C<'XML-TreePP/#.##'>, where C<'#.##'> is
|
|
substituted with the version number of this library.
|
|
|
|
=head2 http_lite
|
|
|
|
This option forces pasrsehttp() method to use a L<HTTP::Lite> instance.
|
|
|
|
my $http = HTTP::Lite->new();
|
|
$tpp->set( http_lite => $http );
|
|
|
|
=head2 lwp_useragent
|
|
|
|
This option forces pasrsehttp() method to use a L<LWP::UserAgent> instance.
|
|
|
|
my $ua = LWP::UserAgent->new();
|
|
$ua->timeout( 60 );
|
|
$ua->env_proxy;
|
|
$tpp->set( lwp_useragent => $ua );
|
|
|
|
You may use this with L<LWP::UserAgent::WithCache>.
|
|
|
|
=head2 base_class
|
|
|
|
This blesses class name for each element's hashref.
|
|
Each class is named straight as a child class of it parent class.
|
|
|
|
$tpp->set( base_class => 'MyElement' );
|
|
my $xml = '<root><parent><child key="val">text</child></parent></root>';
|
|
my $tree = $tpp->parse( $xml );
|
|
print ref $tree->{root}->{parent}->{child}, "\n";
|
|
|
|
A hash for <child> element above is blessed to C<MyElement::root::parent::child>
|
|
class. You may use this with L<Class::Accessor>.
|
|
|
|
=head2 elem_class
|
|
|
|
This blesses class name for each element's hashref.
|
|
Each class is named horizontally under the direct child of C<MyElement>.
|
|
|
|
$tpp->set( base_class => 'MyElement' );
|
|
my $xml = '<root><parent><child key="val">text</child></parent></root>';
|
|
my $tree = $tpp->parse( $xml );
|
|
print ref $tree->{root}->{parent}->{child}, "\n";
|
|
|
|
A hash for <child> element above is blessed to C<MyElement::child> class.
|
|
|
|
=head1 OPTIONS FOR WRITING XML
|
|
|
|
=head2 first_out
|
|
|
|
This option allows you to specify a list of element/attribute
|
|
names which should always appears at first on output XML code.
|
|
|
|
$tpp->set( first_out => [ 'link', 'title', '-type' ] );
|
|
|
|
The default value is null, it means alphabetical order is used.
|
|
|
|
=head2 last_out
|
|
|
|
This option allows you to specify a list of element/attribute
|
|
names which should always appears at last on output XML code.
|
|
|
|
$tpp->set( last_out => [ 'items', 'item', 'entry' ] );
|
|
|
|
=head2 indent
|
|
|
|
This makes the output more human readable by indenting appropriately.
|
|
|
|
$tpp->set( indent => 2 );
|
|
|
|
This doesn't strictly follow the XML Document Spec but does looks nice.
|
|
|
|
=head2 xml_decl
|
|
|
|
This module generates an XML declaration on writing an XML code per default.
|
|
This option forces to change or leave it.
|
|
|
|
$tpp->set( xml_decl => '' );
|
|
|
|
=head2 output_encoding
|
|
|
|
This option allows you to specify a encoding of xml file generated
|
|
by write/writefile methods.
|
|
|
|
$tpp->set( output_encoding => 'UTF-8' );
|
|
|
|
On Perl 5.8.0 and later, you can select it from every
|
|
encodings supported by Encode.pm. On Perl 5.6.x and before with
|
|
Jcode.pm, you can use C<Shift_JIS>, C<EUC-JP>, C<ISO-2022-JP> and
|
|
C<UTF-8>. The default value is C<UTF-8> which is recommended encoding.
|
|
|
|
=head1 OPTIONS FOR BOTH
|
|
|
|
=head2 utf8_flag
|
|
|
|
This makes utf8 flag on for every element's value parsed
|
|
and makes it on for an XML code generated as well.
|
|
|
|
$tpp->set( utf8_flag => 1 );
|
|
|
|
Perl 5.8.1 or later is required to use this.
|
|
|
|
=head2 attr_prefix
|
|
|
|
This option allows you to specify a prefix character(s) which
|
|
is inserted before each attribute names.
|
|
|
|
$tpp->set( attr_prefix => '@' );
|
|
|
|
The default character is C<'-'>.
|
|
Or set C<'@'> to access attribute values like E4X, ECMAScript for XML.
|
|
Zero-length prefix C<''> is available as well, it means no prefix is added.
|
|
|
|
=head2 text_node_key
|
|
|
|
This option allows you to specify a hash key for text nodes.
|
|
|
|
$tpp->set( text_node_key => '#text' );
|
|
|
|
The default key is C<#text>.
|
|
|
|
=head2 ignore_error
|
|
|
|
This module calls Carp::croak function on an error per default.
|
|
This option makes all errors ignored and just return.
|
|
|
|
$tpp->set( ignore_error => 1 );
|
|
|
|
=head2 use_ixhash
|
|
|
|
This option keeps the order for each element appeared in XML.
|
|
L<Tie::IxHash> module is required.
|
|
|
|
$tpp->set( use_ixhash => 1 );
|
|
|
|
This makes parsing performance slow.
|
|
(about 100% slower than default)
|
|
|
|
=head1 AUTHOR
|
|
|
|
Yusuke Kawasaki, http://www.kawa.net/
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
|
|
|
Copyright (c) 2006-2007 Yusuke Kawasaki. All rights reserved.
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself.
|
|
|
|
=cut
|
|
|
|
package XML::TreePP;
|
|
use strict;
|
|
use Carp;
|
|
use Symbol;
|
|
|
|
use vars qw( $VERSION );
|
|
$VERSION = '0.32';
|
|
|
|
my $XML_ENCODING = 'UTF-8';
|
|
my $INTERNAL_ENCODING = 'UTF-8';
|
|
my $USER_AGENT = 'XML-TreePP/'.$VERSION.' ';
|
|
my $ATTR_PREFIX = '-';
|
|
my $TEXT_NODE_KEY = '#text';
|
|
|
|
sub new {
|
|
my $package = shift;
|
|
my $self = {@_};
|
|
bless $self, $package;
|
|
$self;
|
|
}
|
|
|
|
sub die {
|
|
my $self = shift;
|
|
my $mess = shift;
|
|
return if $self->{ignore_error};
|
|
Carp::croak $mess;
|
|
}
|
|
|
|
sub warn {
|
|
my $self = shift;
|
|
my $mess = shift;
|
|
return if $self->{ignore_error};
|
|
Carp::carp $mess;
|
|
}
|
|
|
|
sub set {
|
|
my $self = shift;
|
|
my $key = shift;
|
|
my $val = shift;
|
|
if ( defined $val ) {
|
|
$self->{$key} = $val;
|
|
}
|
|
else {
|
|
delete $self->{$key};
|
|
}
|
|
}
|
|
|
|
sub get {
|
|
my $self = shift;
|
|
my $key = shift;
|
|
$self->{$key} if exists $self->{$key};
|
|
}
|
|
|
|
sub writefile {
|
|
my $self = shift;
|
|
my $file = shift;
|
|
my $tree = shift or return $self->die( 'Invalid tree' );
|
|
my $encode = shift;
|
|
return $self->die( 'Invalid filename' ) unless defined $file;
|
|
my $text = $self->write( $tree, $encode );
|
|
if ( $] >= 5.008001 && utf8::is_utf8( $text ) ) {
|
|
utf8::encode( $text );
|
|
}
|
|
$self->write_raw_xml( $file, $text );
|
|
}
|
|
|
|
sub write {
|
|
my $self = shift;
|
|
my $tree = shift or return $self->die( 'Invalid tree' );
|
|
my $from = $self->{internal_encoding} || $INTERNAL_ENCODING;
|
|
my $to = shift || $self->{output_encoding} || $XML_ENCODING;
|
|
my $decl = $self->{xml_decl};
|
|
$decl = '<?xml version="1.0" encoding="' . $to . '" ?>' unless defined $decl;
|
|
|
|
local $self->{__first_out};
|
|
if ( exists $self->{first_out} ) {
|
|
my $keys = $self->{first_out};
|
|
$keys = [$keys] unless ref $keys;
|
|
$self->{__first_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
|
|
}
|
|
|
|
local $self->{__last_out};
|
|
if ( exists $self->{last_out} ) {
|
|
my $keys = $self->{last_out};
|
|
$keys = [$keys] unless ref $keys;
|
|
$self->{__last_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
|
|
}
|
|
|
|
my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
|
|
$tnk = $TEXT_NODE_KEY unless defined $tnk;
|
|
local $self->{text_node_key} = $tnk;
|
|
|
|
my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
|
|
$apre = $ATTR_PREFIX unless defined $apre;
|
|
local $self->{__attr_prefix_len} = length($apre);
|
|
local $self->{__attr_prefix_rex} = defined $apre ? qr/^\Q$apre\E/s : undef;
|
|
|
|
local $self->{__indent};
|
|
if ( exists $self->{indent} && $self->{indent} ) {
|
|
$self->{__indent} = ' ' x $self->{indent};
|
|
}
|
|
|
|
my $text = $self->hash_to_xml( undef, $tree );
|
|
if ( $from && $to ) {
|
|
my $stat = $self->encode_from_to( \$text, $from, $to );
|
|
return $self->die( "Unsupported encoding: $to" ) unless $stat;
|
|
}
|
|
|
|
return $text if ( $decl eq '' );
|
|
join( "\n", $decl, $text );
|
|
}
|
|
|
|
sub parsehttp {
|
|
my $self = shift;
|
|
|
|
local $self->{__user_agent};
|
|
if ( exists $self->{user_agent} ) {
|
|
my $agent = $self->{user_agent};
|
|
$agent .= $USER_AGENT if ( $agent =~ /\s$/s );
|
|
$self->{__user_agent} = $agent if ( $agent ne '' );
|
|
} else {
|
|
$self->{__user_agent} = $USER_AGENT;
|
|
}
|
|
|
|
my $http = $self->{__http_module};
|
|
unless ( $http ) {
|
|
$http = $self->find_http_module(@_);
|
|
$self->{__http_module} = $http;
|
|
}
|
|
if ( $http eq 'LWP::UserAgent' ) {
|
|
return $self->parsehttp_lwp(@_);
|
|
}
|
|
elsif ( $http eq 'HTTP::Lite' ) {
|
|
return $self->parsehttp_lite(@_);
|
|
}
|
|
else {
|
|
return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
|
|
}
|
|
}
|
|
|
|
sub find_http_module {
|
|
my $self = shift || {};
|
|
|
|
if ( exists $self->{lwp_useragent} && ref $self->{lwp_useragent} ) {
|
|
return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
|
|
return 'LWP::UserAgent' if &load_lwp_useragent();
|
|
return $self->die( "LWP::UserAgent is required: $_[1]" );
|
|
}
|
|
|
|
if ( exists $self->{http_lite} && ref $self->{http_lite} ) {
|
|
return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
|
|
return 'HTTP::Lite' if &load_http_lite();
|
|
return $self->die( "HTTP::Lite is required: $_[1]" );
|
|
}
|
|
|
|
return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
|
|
return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
|
|
return 'LWP::UserAgent' if &load_lwp_useragent();
|
|
return 'HTTP::Lite' if &load_http_lite();
|
|
return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
|
|
}
|
|
|
|
sub load_lwp_useragent {
|
|
return $LWP::UserAgent::VERSION if defined $LWP::UserAgent::VERSION;
|
|
local $@;
|
|
eval { require LWP::UserAgent; };
|
|
$LWP::UserAgent::VERSION;
|
|
}
|
|
|
|
sub load_http_lite {
|
|
return $HTTP::Lite::VERSION if defined $HTTP::Lite::VERSION;
|
|
local $@;
|
|
eval { require HTTP::Lite; };
|
|
$HTTP::Lite::VERSION;
|
|
}
|
|
|
|
sub load_tie_ixhash {
|
|
return $Tie::IxHash::VERSION if defined $Tie::IxHash::VERSION;
|
|
local $@;
|
|
eval { require Tie::IxHash; };
|
|
$Tie::IxHash::VERSION;
|
|
}
|
|
|
|
sub parsehttp_lwp {
|
|
my $self = shift;
|
|
my $method = shift or return $self->die( 'Invalid HTTP method' );
|
|
my $url = shift or return $self->die( 'Invalid URL' );
|
|
my $body = shift;
|
|
my $header = shift;
|
|
|
|
my $ua = $self->{lwp_useragent} if exists $self->{lwp_useragent};
|
|
if ( ! ref $ua ) {
|
|
$ua = LWP::UserAgent->new();
|
|
$ua->timeout(10);
|
|
$ua->env_proxy();
|
|
$ua->agent( $self->{__user_agent} ) if defined $self->{__user_agent};
|
|
} else {
|
|
$ua->agent( $self->{__user_agent} ) if exists $self->{user_agent};
|
|
}
|
|
|
|
my $req = HTTP::Request->new( $method, $url );
|
|
my $ct = 0;
|
|
if ( ref $header ) {
|
|
foreach my $field ( sort keys %$header ) {
|
|
my $value = $header->{$field};
|
|
$req->header( $field => $value );
|
|
$ct ++ if ( $field =~ /^Content-Type$/i );
|
|
}
|
|
}
|
|
if ( defined $body && ! $ct ) {
|
|
$req->header( 'Content-Type' => 'application/x-www-form-urlencoded' );
|
|
}
|
|
$req->content($body) if defined $body;
|
|
my $res = $ua->request($req);
|
|
my $code = $res->code();
|
|
my $text = $res->content();
|
|
my $tree = $self->parse( \$text ) if $res->is_success();
|
|
wantarray ? ( $tree, $text, $code ) : $tree;
|
|
}
|
|
|
|
sub parsehttp_lite {
|
|
my $self = shift;
|
|
my $method = shift or return $self->die( 'Invalid HTTP method' );
|
|
my $url = shift or return $self->die( 'Invalid URL' );
|
|
my $body = shift;
|
|
my $header = shift;
|
|
|
|
my $http = HTTP::Lite->new();
|
|
$http->method($method);
|
|
my $ua = 0;
|
|
if ( ref $header ) {
|
|
foreach my $field ( sort keys %$header ) {
|
|
my $value = $header->{$field};
|
|
$http->add_req_header( $field, $value );
|
|
$ua ++ if ( $field =~ /^User-Agent$/i );
|
|
}
|
|
}
|
|
if ( defined $self->{__user_agent} && ! $ua ) {
|
|
$http->add_req_header( 'User-Agent', $self->{__user_agent} );
|
|
}
|
|
$http->{content} = $body if defined $body;
|
|
my $code = $http->request($url) or return;
|
|
my $text = $http->body();
|
|
my $tree = $self->parse( \$text );
|
|
wantarray ? ( $tree, $text, $code ) : $tree;
|
|
}
|
|
|
|
sub parsefile {
|
|
my $self = shift;
|
|
my $file = shift;
|
|
return $self->die( 'Invalid filename' ) unless defined $file;
|
|
my $text = $self->read_raw_xml($file);
|
|
$self->parse( \$text );
|
|
}
|
|
|
|
sub parse {
|
|
my $self = shift;
|
|
my $text = ref $_[0] ? ${$_[0]} : $_[0];
|
|
return $self->die( 'Null XML source' ) unless defined $text;
|
|
|
|
my $from = &xml_decl_encoding(\$text) || $XML_ENCODING;
|
|
my $to = $self->{internal_encoding} || $INTERNAL_ENCODING;
|
|
if ( $from && $to ) {
|
|
my $stat = $self->encode_from_to( \$text, $from, $to );
|
|
return $self->die( "Unsupported encoding: $from" ) unless $stat;
|
|
}
|
|
|
|
local $self->{__force_array};
|
|
local $self->{__force_array_all};
|
|
if ( exists $self->{force_array} ) {
|
|
my $force = $self->{force_array};
|
|
$force = [$force] unless ref $force;
|
|
$self->{__force_array} = { map { $_ => 1 } @$force };
|
|
$self->{__force_array_all} = $self->{__force_array}->{'*'};
|
|
}
|
|
|
|
local $self->{__force_hash};
|
|
local $self->{__force_hash_all};
|
|
if ( exists $self->{force_hash} ) {
|
|
my $force = $self->{force_hash};
|
|
$force = [$force] unless ref $force;
|
|
$self->{__force_hash} = { map { $_ => 1 } @$force };
|
|
$self->{__force_hash_all} = $self->{__force_hash}->{'*'};
|
|
}
|
|
|
|
my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
|
|
$tnk = $TEXT_NODE_KEY unless defined $tnk;
|
|
local $self->{text_node_key} = $tnk;
|
|
|
|
my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
|
|
$apre = $ATTR_PREFIX unless defined $apre;
|
|
local $self->{attr_prefix} = $apre;
|
|
|
|
if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
|
|
return $self->die( "Tie::IxHash is required." ) unless &load_tie_ixhash();
|
|
}
|
|
|
|
my $flat = $self->xml_to_flat(\$text);
|
|
my $class = $self->{base_class} if exists $self->{base_class};
|
|
my $tree = $self->flat_to_tree( $flat, '', $class );
|
|
if ( ref $tree ) {
|
|
if ( defined $class ) {
|
|
bless( $tree, $class );
|
|
}
|
|
elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
|
|
bless( $tree, $self->{elem_class} );
|
|
}
|
|
}
|
|
wantarray ? ( $tree, $text ) : $tree;
|
|
}
|
|
|
|
sub xml_to_flat {
|
|
my $self = shift;
|
|
my $textref = shift; # reference
|
|
my $flat = [];
|
|
my $prefix = $self->{attr_prefix};
|
|
my $ixhash = ( exists $self->{use_ixhash} && $self->{use_ixhash} );
|
|
|
|
while ( $$textref =~ m{
|
|
([^<]*) <
|
|
((
|
|
\? ([^<>]*) \?
|
|
)|(
|
|
\!\[CDATA\[(.*?)\]\]
|
|
)|(
|
|
\!DOCTYPE\s+([^\[\]<>]*(?:\[.*?\]\s*)?)
|
|
)|(
|
|
\!--(.*?)--
|
|
)|(
|
|
([^\!\?\s<>](?:"[^"]*"|'[^']*'|[^"'<>])*)
|
|
))
|
|
> ([^<]*)
|
|
}sxg ) {
|
|
my (
|
|
$ahead, $match, $typePI, $contPI, $typeCDATA,
|
|
$contCDATA, $typeDocT, $contDocT, $typeCmnt, $contCmnt,
|
|
$typeElem, $contElem, $follow
|
|
)
|
|
= ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 );
|
|
if ( defined $ahead && $ahead =~ /\S/ ) {
|
|
$self->warn( "Invalid string: [$ahead] before <$match>" );
|
|
}
|
|
|
|
if ($typeElem) { # Element
|
|
my $node = {};
|
|
if ( $contElem =~ s#^/## ) {
|
|
$node->{endTag}++;
|
|
}
|
|
elsif ( $contElem =~ s#/$## ) {
|
|
$node->{emptyTag}++;
|
|
}
|
|
else {
|
|
$node->{startTag}++;
|
|
}
|
|
$node->{tagName} = $1 if ( $contElem =~ s#^(\S+)\s*## );
|
|
unless ( $node->{endTag} ) {
|
|
my $attr;
|
|
while ( $contElem =~ m{
|
|
([^\s\=\"\']+)=(?:(")(.*?)"|'(.*?)')
|
|
}sxg ) {
|
|
my $key = $1;
|
|
my $val = &xml_unescape( $2 ? $3 : $4 );
|
|
if ( ! ref $attr ) {
|
|
$attr = {};
|
|
tie( %$attr, 'Tie::IxHash' ) if $ixhash;
|
|
}
|
|
$attr->{$prefix.$key} = $val;
|
|
}
|
|
$node->{attributes} = $attr if ref $attr;
|
|
}
|
|
push( @$flat, $node );
|
|
}
|
|
elsif ($typeCDATA) { ## CDATASection
|
|
if ( exists $self->{cdata_scalar_ref} && $self->{cdata_scalar_ref} ) {
|
|
push( @$flat, \$contCDATA ); # as reference for scalar
|
|
}
|
|
else {
|
|
push( @$flat, $contCDATA ); # as scalar like text node
|
|
}
|
|
}
|
|
elsif ($typeCmnt) { # Comment (ignore)
|
|
}
|
|
elsif ($typeDocT) { # DocumentType (ignore)
|
|
}
|
|
elsif ($typePI) { # ProcessingInstruction (ignore)
|
|
}
|
|
else {
|
|
$self->warn( "Invalid Tag: <$match>" );
|
|
}
|
|
if ( $follow =~ /\S/ ) { # text node
|
|
my $val = &xml_unescape($follow);
|
|
push( @$flat, $val );
|
|
}
|
|
}
|
|
$flat;
|
|
}
|
|
|
|
sub flat_to_tree {
|
|
my $self = shift;
|
|
my $source = shift;
|
|
my $parent = shift;
|
|
my $class = shift;
|
|
my $tree = {};
|
|
my $text = [];
|
|
|
|
if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
|
|
tie( %$tree, 'Tie::IxHash' );
|
|
}
|
|
|
|
while ( scalar @$source ) {
|
|
my $node = shift @$source;
|
|
if ( !ref $node || UNIVERSAL::isa( $node, "SCALAR" ) ) {
|
|
push( @$text, $node ); # cdata or text node
|
|
next;
|
|
}
|
|
my $name = $node->{tagName};
|
|
if ( $node->{endTag} ) {
|
|
last if ( $parent eq $name );
|
|
return $self->die( "Invalid tag sequence: <$parent></$name>" );
|
|
}
|
|
my $elem = $node->{attributes};
|
|
my $forcehash = $self->{__force_hash_all} || $self->{__force_hash}->{$name};
|
|
my $subclass;
|
|
if ( defined $class ) {
|
|
my $escname = $name;
|
|
$escname =~ s/\W/_/sg;
|
|
$subclass = $class.'::'.$escname;
|
|
}
|
|
if ( $node->{startTag} ) { # recursive call
|
|
my $child = $self->flat_to_tree( $source, $name, $subclass );
|
|
next unless defined $child;
|
|
my $hasattr = scalar keys %$elem if ref $elem;
|
|
if ( UNIVERSAL::isa( $child, "HASH" ) ) {
|
|
if ( $hasattr ) {
|
|
# some attributes and some child nodes
|
|
%$elem = ( %$elem, %$child );
|
|
}
|
|
else {
|
|
# some child nodes without attributes
|
|
$elem = $child;
|
|
}
|
|
}
|
|
else {
|
|
if ( $hasattr ) {
|
|
# some attributes and text node
|
|
$elem->{$self->{text_node_key}} = $child;
|
|
}
|
|
elsif ( $forcehash ) {
|
|
# only text node without attributes
|
|
$elem = { $self->{text_node_key} => $child };
|
|
}
|
|
else {
|
|
# text node without attributes
|
|
$elem = $child;
|
|
}
|
|
}
|
|
}
|
|
elsif ( $forcehash && ! ref $elem ) {
|
|
$elem = {};
|
|
}
|
|
# bless to a class by base_class or elem_class
|
|
if ( ref $elem && UNIVERSAL::isa( $elem, "HASH" ) ) {
|
|
if ( defined $subclass ) {
|
|
bless( $elem, $subclass );
|
|
} elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
|
|
my $escname = $name;
|
|
$escname =~ s/\W/_/sg;
|
|
my $elmclass = $self->{elem_class}.'::'.$escname;
|
|
bless( $elem, $elmclass );
|
|
}
|
|
}
|
|
# next unless defined $elem;
|
|
$tree->{$name} ||= [];
|
|
push( @{ $tree->{$name} }, $elem );
|
|
}
|
|
if ( ! $self->{__force_array_all} ) {
|
|
foreach my $key ( keys %$tree ) {
|
|
next if $self->{__force_array}->{$key};
|
|
next if ( 1 < scalar @{ $tree->{$key} } );
|
|
$tree->{$key} = shift @{ $tree->{$key} };
|
|
}
|
|
}
|
|
my $haschild = scalar keys %$tree;
|
|
if ( scalar @$text ) {
|
|
if ( scalar @$text == 1 ) {
|
|
# one text node (normal)
|
|
$text = shift @$text;
|
|
}
|
|
elsif ( ! scalar grep {ref $_} @$text ) {
|
|
# some text node splitted
|
|
$text = join( '', @$text );
|
|
}
|
|
else {
|
|
# some cdata node
|
|
my $join = join( '', map {ref $_ ? $$_ : $_} @$text );
|
|
$text = \$join;
|
|
}
|
|
if ( $haschild ) {
|
|
# some child nodes and also text node
|
|
$tree->{$self->{text_node_key}} = $text;
|
|
}
|
|
else {
|
|
# only text node without child nodes
|
|
$tree = $text;
|
|
}
|
|
}
|
|
elsif ( ! $haschild ) {
|
|
# no child and no text
|
|
$tree = "";
|
|
}
|
|
$tree;
|
|
}
|
|
|
|
sub hash_to_xml {
|
|
my $self = shift;
|
|
my $name = shift;
|
|
my $hash = shift;
|
|
my $out = [];
|
|
my $attr = [];
|
|
my $allkeys = [ keys %$hash ];
|
|
my $fo = $self->{__first_out} if ref $self->{__first_out};
|
|
my $lo = $self->{__last_out} if ref $self->{__last_out};
|
|
my $firstkeys = [ sort { $fo->{$a} <=> $fo->{$b} } grep { exists $fo->{$_} } @$allkeys ] if ref $fo;
|
|
my $lastkeys = [ sort { $lo->{$a} <=> $lo->{$b} } grep { exists $lo->{$_} } @$allkeys ] if ref $lo;
|
|
$allkeys = [ grep { ! exists $fo->{$_} } @$allkeys ] if ref $fo;
|
|
$allkeys = [ grep { ! exists $lo->{$_} } @$allkeys ] if ref $lo;
|
|
unless ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
|
|
$allkeys = [ sort @$allkeys ];
|
|
}
|
|
my $prelen = $self->{__attr_prefix_len};
|
|
my $pregex = $self->{__attr_prefix_rex};
|
|
|
|
foreach my $keys ( $firstkeys, $allkeys, $lastkeys ) {
|
|
next unless ref $keys;
|
|
my $elemkey = $prelen ? [ grep { $_ !~ $pregex } @$keys ] : $keys;
|
|
my $attrkey = $prelen ? [ grep { $_ =~ $pregex } @$keys ] : [];
|
|
|
|
foreach my $key ( @$elemkey ) {
|
|
my $val = $hash->{$key};
|
|
if ( !defined $val ) {
|
|
push( @$out, "<$key />" );
|
|
}
|
|
elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
|
|
my $child = $self->array_to_xml( $key, $val );
|
|
push( @$out, $child );
|
|
}
|
|
elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
|
|
my $child = $self->scalaref_to_cdata( $key, $val );
|
|
push( @$out, $child );
|
|
}
|
|
elsif ( ref $val ) {
|
|
my $child = $self->hash_to_xml( $key, $val );
|
|
push( @$out, $child );
|
|
}
|
|
else {
|
|
my $child = $self->scalar_to_xml( $key, $val );
|
|
push( @$out, $child );
|
|
}
|
|
}
|
|
|
|
foreach my $key ( @$attrkey ) {
|
|
my $name = substr( $key, $prelen );
|
|
my $val = &xml_escape( $hash->{$key} );
|
|
push( @$attr, ' ' . $name . '="' . $val . '"' );
|
|
}
|
|
}
|
|
my $jattr = join( '', @$attr );
|
|
|
|
if ( defined $name && scalar @$out && ! grep { ! /^</s } @$out ) {
|
|
# Use human-friendly white spacing
|
|
if ( defined $self->{__indent} ) {
|
|
s/^(\s*<)/$self->{__indent}$1/mg foreach @$out;
|
|
}
|
|
unshift( @$out, "\n" );
|
|
}
|
|
|
|
my $text = join( '', @$out );
|
|
if ( defined $name ) {
|
|
if ( scalar @$out ) {
|
|
$text = "<$name$jattr>$text</$name>\n";
|
|
}
|
|
else {
|
|
$text = "<$name$jattr />\n";
|
|
}
|
|
}
|
|
$text;
|
|
}
|
|
|
|
sub array_to_xml {
|
|
my $self = shift;
|
|
my $name = shift;
|
|
my $array = shift;
|
|
my $out = [];
|
|
foreach my $val (@$array) {
|
|
if ( !defined $val ) {
|
|
push( @$out, "<$name />\n" );
|
|
}
|
|
elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
|
|
my $child = $self->array_to_xml( $name, $val );
|
|
push( @$out, $child );
|
|
}
|
|
elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
|
|
my $child = $self->scalaref_to_cdata( $name, $val );
|
|
push( @$out, $child );
|
|
}
|
|
elsif ( ref $val ) {
|
|
my $child = $self->hash_to_xml( $name, $val );
|
|
push( @$out, $child );
|
|
}
|
|
else {
|
|
my $child = $self->scalar_to_xml( $name, $val );
|
|
push( @$out, $child );
|
|
}
|
|
}
|
|
|
|
my $text = join( '', @$out );
|
|
$text;
|
|
}
|
|
|
|
sub scalaref_to_cdata {
|
|
my $self = shift;
|
|
my $name = shift;
|
|
my $ref = shift;
|
|
my $data = defined $$ref ? $$ref : '';
|
|
$data =~ s#(]])(>)#$1]]><![CDATA[$2#g;
|
|
#my $text = '<![CDATA[' . $data . ']]>';
|
|
my $text = $data;
|
|
$text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
|
|
$text;
|
|
}
|
|
|
|
sub scalar_to_xml {
|
|
my $self = shift;
|
|
my $name = shift;
|
|
my $scalar = shift;
|
|
my $copy = $scalar;
|
|
my $text = &xml_escape($copy);
|
|
$text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
|
|
$text;
|
|
}
|
|
|
|
sub write_raw_xml {
|
|
my $self = shift;
|
|
my $file = shift;
|
|
my $fh = Symbol::gensym();
|
|
open( $fh, ">$file" ) or return $self->die( "$! - $file" );
|
|
print $fh @_;
|
|
close($fh);
|
|
}
|
|
|
|
sub read_raw_xml {
|
|
my $self = shift;
|
|
my $file = shift;
|
|
my $fh = Symbol::gensym();
|
|
open( $fh, $file ) or return $self->die( "$! - $file" );
|
|
local $/ = undef;
|
|
my $text = <$fh>;
|
|
close($fh);
|
|
$text;
|
|
}
|
|
|
|
sub xml_decl_encoding {
|
|
my $textref = shift;
|
|
return unless defined $$textref;
|
|
my $args = ( $$textref =~ /^\s*<\?xml(\s+\S.*)\?>/s )[0] or return;
|
|
my $getcode = ( $args =~ /\s+encoding=(".*?"|'.*?')/ )[0] or return;
|
|
$getcode =~ s/^['"]//;
|
|
$getcode =~ s/['"]$//;
|
|
$getcode;
|
|
}
|
|
|
|
sub encode_from_to {
|
|
my $self = shift;
|
|
my $txtref = shift or return;
|
|
my $from = shift or return;
|
|
my $to = shift or return;
|
|
|
|
unless ( defined $Encode::EUCJPMS::VERSION ) {
|
|
$from = 'EUC-JP' if ( $from =~ /\beuc-?jp-?(win|ms)$/i );
|
|
$to = 'EUC-JP' if ( $to =~ /\beuc-?jp-?(win|ms)$/i );
|
|
}
|
|
|
|
my $setflag = $self->{utf8_flag} if exists $self->{utf8_flag};
|
|
if ( $] < 5.008001 && $setflag ) {
|
|
return $self->die( "Perl 5.8.1 is required for utf8_flag: $]" );
|
|
}
|
|
|
|
if ( $] >= 5.008 ) {
|
|
&load_encode();
|
|
my $check = ( $Encode::VERSION < 2.13 ) ? 0x400 : Encode::FB_XMLCREF();
|
|
if ( $] >= 5.008001 && utf8::is_utf8( $$txtref ) ) {
|
|
if ( $to =~ /^utf-?8$/i ) {
|
|
# skip
|
|
} else {
|
|
$$txtref = Encode::encode( $to, $$txtref, $check );
|
|
}
|
|
} else {
|
|
$$txtref = Encode::decode( $from, $$txtref );
|
|
if ( $to =~ /^utf-?8$/i && $setflag ) {
|
|
# skip
|
|
} else {
|
|
$$txtref = Encode::encode( $to, $$txtref, $check );
|
|
}
|
|
}
|
|
}
|
|
elsif ( ( uc($from) eq 'ISO-8859-1'
|
|
|| uc($from) eq 'US-ASCII'
|
|
|| uc($from) eq 'LATIN-1' ) && uc($to) eq 'UTF-8' ) {
|
|
&latin1_to_utf8($txtref);
|
|
}
|
|
else {
|
|
my $jfrom = &get_jcode_name($from);
|
|
my $jto = &get_jcode_name($to);
|
|
return $to if ( uc($jfrom) eq uc($jto) );
|
|
if ( $jfrom && $jto ) {
|
|
&load_jcode();
|
|
if ( defined $Jcode::VERSION ) {
|
|
Jcode::convert( $txtref, $jto, $jfrom );
|
|
}
|
|
else {
|
|
return $self->die( "Jcode.pm is required: $from to $to" );
|
|
}
|
|
}
|
|
else {
|
|
return $self->die( "Encode.pm is required: $from to $to" );
|
|
}
|
|
}
|
|
$to;
|
|
}
|
|
|
|
sub load_jcode {
|
|
return if defined $Jcode::VERSION;
|
|
local $@;
|
|
eval { require Jcode; };
|
|
}
|
|
|
|
sub load_encode {
|
|
return if defined $Encode::VERSION;
|
|
local $@;
|
|
eval { require Encode; };
|
|
}
|
|
|
|
sub latin1_to_utf8 {
|
|
my $strref = shift;
|
|
$$strref =~ s{
|
|
([\x80-\xFF])
|
|
}{
|
|
pack( 'C2' => 0xC0|(ord($1)>>6),0x80|(ord($1)&0x3F) )
|
|
}exg;
|
|
}
|
|
|
|
sub get_jcode_name {
|
|
my $src = shift;
|
|
my $dst;
|
|
if ( $src =~ /^utf-?8$/i ) {
|
|
$dst = 'utf8';
|
|
}
|
|
elsif ( $src =~ /^euc.*jp(-?(win|ms))?$/i ) {
|
|
$dst = 'euc';
|
|
}
|
|
elsif ( $src =~ /^(shift.*jis|cp932|windows-31j)$/i ) {
|
|
$dst = 'sjis';
|
|
}
|
|
elsif ( $src =~ /^iso-2022-jp/ ) {
|
|
$dst = 'jis';
|
|
}
|
|
$dst;
|
|
}
|
|
|
|
sub xml_escape {
|
|
my $str = shift;
|
|
return '' unless defined $str;
|
|
# except for TAB(\x09),CR(\x0D),LF(\x0A)
|
|
$str =~ s{
|
|
([\x00-\x08\x0B\x0C\x0E-\x1F\x7F])
|
|
}{
|
|
sprintf( '&#%d;', ord($1) );
|
|
}gex;
|
|
$str =~ s/&(?!#(\d+;|x[\dA-Fa-f]+;))/&/g;
|
|
$str =~ s/</</g;
|
|
$str =~ s/>/>/g;
|
|
$str =~ s/'/'/g;
|
|
$str =~ s/"/"/g;
|
|
$str;
|
|
}
|
|
|
|
sub xml_unescape {
|
|
my $str = shift;
|
|
my $map = {qw( quot " lt < gt > apos ' amp & )};
|
|
$str =~ s{
|
|
(&(?:\#(\d+)|\#x([0-9a-fA-F]+)|(quot|lt|gt|apos|amp));)
|
|
}{
|
|
$4 ? $map->{$4} : &char_deref($1,$2,$3);
|
|
}gex;
|
|
$str;
|
|
}
|
|
|
|
sub char_deref {
|
|
my( $str, $dec, $hex ) = @_;
|
|
if ( defined $dec ) {
|
|
return &code_to_utf8( $dec ) if ( $dec < 256 );
|
|
}
|
|
elsif ( defined $hex ) {
|
|
my $num = hex($hex);
|
|
return &code_to_utf8( $num ) if ( $num < 256 );
|
|
}
|
|
return $str;
|
|
}
|
|
|
|
sub code_to_utf8 {
|
|
my $code = shift;
|
|
if ( $code < 128 ) {
|
|
return pack( C => $code );
|
|
}
|
|
elsif ( $code < 256 ) {
|
|
return pack( C2 => 0xC0|($code>>6), 0x80|($code&0x3F));
|
|
}
|
|
elsif ( $code < 65536 ) {
|
|
return pack( C3 => 0xC0|($code>>12), 0x80|(($code>>6)&0x3F), 0x80|($code&0x3F));
|
|
}
|
|
return shift if scalar @_; # default value
|
|
sprintf( '&#x%04X;', $code );
|
|
}
|
|
|
|
1;
|