2
# $Id: XMLTV.pm.in,v 1.132 2007/11/05 08:14:27 rmeden Exp $
8
our @EXPORT_OK = qw(read_data parse parsefile write_data
9
best_name list_channel_keys list_programme_keys);
11
# For the time being the version of this library is tied to that of
12
# the xmltv package as a whole. This number should be checked by the
15
our $VERSION = '0.5.50';
17
# Work around changing behaviour of XML::Twig. On some systems (like
18
# mine) it always returns UTF-8 data unless KeepEncoding is specified.
19
# However the encoding() method tells you the encoding of the original
20
# document, not of the data you receive. To be sure of what you're
21
# getting, it is easiest on such a system to not give KeepEncoding and
24
# But on other systems (seemingly perl 5.8 and above), XML::Twig tries
25
# to keep the original document's encoding in the strings returned.
26
# You then have to call encoding() to find out what you're getting.
27
# To make sure of this behaviour we set KeepEncoding to true on such a
30
# Setting KeepEncoding true everywhere seems to do no harm, it's a
31
# pity that we lose conversion to UTF-8 but at least it's the same
32
# everywhere. So the library is distributed with this flag on.
34
my $KEEP_ENCODING = 1;
36
my %warned_unknown_key;
37
sub warn_unknown_keys( $$ );
43
XMLTV - Perl extension to read and write TV listings in XMLTV format
48
my $data = XMLTV::parsefile('tv.xml');
49
my ($encoding, $credits, $ch, $progs) = @$data;
50
my $langs = [ 'en', 'fr' ];
51
print 'source of listings is: ', $credits->{'source-info-name'}, "\n"
52
if defined $credits->{'source-info-name'};
53
foreach (values %$ch) {
54
my ($text, $lang) = @{XMLTV::best_name($langs, $_->{'display-name'})};
55
print "channel $_->{id} has name $text\n";
56
print "...in language $lang\n" if defined $lang;
59
print "programme on channel $_->{channel} at time $_->{start}\n";
60
next if not defined $_->{desc};
61
foreach (@{$_->{desc}}) {
62
my ($text, $lang) = @$_;
63
print "has description $text\n";
64
print "...in language $lang\n" if defined $lang;
68
The value of $data will be something a bit like:
71
{ 'source-info-name' => 'Ananova', 'generator-info-name' => 'XMLTV' },
72
{ 'radio-4.bbc.co.uk' => { 'display-name' => [ [ 'en', 'BBC Radio 4' ],
75
'id' => 'radio-4.bbc.co.uk' },
77
[ { start => '200111121800', title => [ [ 'Simpsons', 'en' ] ],
78
channel => 'radio-4.bbc.co.uk' },
83
This module provides an interface to read and write files in XMLTV
84
format (a TV listings format defined by xmltv.dtd). In general element
85
names in the XML correspond to hash keys in the Perl data structure.
86
You can think of this module as a bit like B<XML::Simple>, but
87
specialized to the XMLTV file format.
89
The Perl data structure corresponding to an XMLTV file has four
90
elements. The first gives the character encoding used for text data,
91
typically UTF-8 or ISO-8859-1. (The encoding value could also be
92
undef meaning 'unknown', when the library canE<39>t work out what it
93
is.) The second element gives the attributes of the root <tv>
94
element, which give information about the source of the TV listings.
95
The third element is a list of channels, each list element being a
96
hash corresponding to one <channel> element. The fourth element is
97
similarly a list of programmes. More details about the data structure
98
are given later. The easiest way to find out what it looks like is to
99
load some small XMLTV files and use B<Data::Dumper> to print out the
109
use XML::Writer 0.600;
114
# Use Lingua::Preferred if available, else kludge a replacement.
115
sub my_which_lang { return $_[1]->[0] }
117
eval { require Lingua::Preferred };
118
*which_lang = $@ ? \&my_which_lang : \&Lingua::Preferred::which_lang;
121
# Use Log::TraceMessages if installed.
123
eval { require Log::TraceMessages };
129
*t = \&Log::TraceMessages::t;
130
*d = \&Log::TraceMessages::d;
134
# Attributes and subelements of channel. Each subelement additionally
135
# needs a handler defined. Multiplicity is given for both, but for
136
# attributes the only allowable values are '1' and '?'.
138
# Ordering of attributes is not really important, but we keep the same
139
# order as they are given in the DTD so that output looks nice.
141
# The ordering of the subelements list gives the order in which these
142
# elements must appear in the DTD. In fact, these lists just
143
# duplicate information in the DTD and add details of what handlers
146
our @Channel_Attributes = ([ 'id', '1' ]);
147
our @Channel_Handlers =
149
[ 'display-name', 'with-lang', '+' ],
150
[ 'icon', 'icon', '*' ],
151
[ 'url', 'scalar', '*' ],
154
# Same for <programme> elements.
155
our @Programme_Attributes =
159
[ 'pdc-start', '?' ],
160
[ 'vps-start', '?' ],
162
[ 'videoplus', '?' ],
166
our @Programme_Handlers =
168
[ 'title', 'with-lang', '+' ],
169
[ 'sub-title', 'with-lang', '*' ],
170
[ 'desc', 'with-lang/m', '*' ],
171
[ 'credits', 'credits', '?' ],
172
[ 'date', 'scalar', '?' ],
173
[ 'category', 'with-lang', '*' ],
174
[ 'language', 'with-lang', '?' ],
175
[ 'orig-language', 'with-lang', '?' ],
176
[ 'length', 'length', '?' ],
177
[ 'icon', 'icon', '*' ],
178
[ 'url', 'scalar', '*' ],
179
[ 'country', 'with-lang', '*' ],
180
[ 'episode-num', 'episode-num', '*' ],
181
[ 'video', 'video', '?' ],
182
[ 'audio', 'audio', '?' ],
183
[ 'previously-shown', 'previously-shown', '?' ],
184
[ 'premiere', 'with-lang/em', '?' ],
185
[ 'last-chance', 'with-lang/em', '?' ],
186
[ 'new', 'presence', '?' ],
187
[ 'subtitles', 'subtitles', '*' ],
188
[ 'rating', 'rating', '*' ],
189
[ 'star-rating', 'star-rating', '*' ],
192
# And a hash mapping names like 'with-lang' to pairs of subs. The
193
# first for reading, the second for writing. Note that the writers
194
# alter the passed-in data as a side effect! (If the writing sub is
195
# called with an undef XML::Writer then it writes nothing but still
196
# warns for (most) bad data checks - and still alters the data.)
200
# Undocumented interface for adding extensions to the XMLTV format:
201
# first add an entry to @XMLTV::Channel_Handlers or
202
# @XMLTV::Programme_Handlers with your new element's name, 'type' and
203
# multiplicity. The 'type' should be a string you invent yourself.
204
# Then $XMLTV::Handlers{'type'} should be a pair of subroutines, a
205
# reader and a writer. (Unless you want to use one of the existing
206
# types such as 'with-lang' or 'scalar'.)
208
# Note that elements and attributes beginning 'x-' are skipped over
209
# _automatically_, so you can't parse them with this method. A better
210
# way to add extensions is needed - doing this not encouraged but is
211
# sometimes necessary.
214
# read_data() is a deprecated name for parsefile().
215
sub read_data( $ ) { # FIXME remove altogether
216
warn "XMLTV::read_data() deprecated, use XMLTV::parsefile() instead\n";
223
croak 'no <tv> element found' if not /<tv/;
229
=item parse(document)
231
Takes an XMLTV document (a string) and returns the Perl data
232
structure. It is assumed that the document is valid XMLTV; if not
233
the routine may die() with an error (although the current implementation
234
just warns and continues for most small errors).
236
The first element of the listref returned, the encoding, may vary
237
according to the encoding of the input document, the versions of perl
238
and C<XML::Parser> installed, the configuration of the XMLTV library
239
and other factors including, but not limited to, the phase of the
240
moon. With luck it should always be either the encoding of the input
243
Attributes and elements in the XML file whose names begin with 'x-'
244
are skipped silently. You can use these to include information which
245
is not currently handled by the XMLTV format, or by this module.
251
# FIXME commonize with parsefiles()
252
my ($encoding, $credits);
256
sub { $encoding = shift },
257
sub { $credits = shift },
258
sub { for (shift) { $channels{$_->{id}} = $_ } },
259
sub { push @programmes, shift });
260
return [ $encoding, $credits, \%channels, \@programmes ];
265
=item parsefiles(filename...)
267
Like C<parse()> but takes one or more filenames instead of a string
268
document. The data returned is the merging of those file contents:
269
the programmes will be concatenated in their original order, the
270
channels just put together in arbitrary order (ordering of channels
273
It is necessary that each file have the same character encoding, if
274
not, an exception is thrown. Ideally the credits information would
275
also be the same between all the files, since there is no obvious way to
276
merge it - but if the credits information differs from one file to the
277
next, one file is picked arbitrarily to provide credits and a warning
278
is printed. If two files give differing channel definitions for the
279
same XMLTV channel id, then one is picked arbitrarily and a warning
282
In the simple case, with just one file, you neednE<39>t worry
283
about mismatching of encodings, credits or channels.
285
The deprecated function C<parsefile()> is a wrapper allowing just one
289
sub parsefiles( @ ) {
290
die 'one or more filenames required' if not @_;
291
my ($encoding, $credits);
294
parsefiles_callback(sub { $encoding = shift },
295
sub { $credits = shift },
296
sub { for (shift) { $channels{$_->{id}} = $_ } },
297
sub { push @programmes, shift },
299
return [ $encoding, $credits, \%channels, \@programmes ];
302
sub parsefile( $ ) { parsefiles(@_) }
306
=item parse_callback(document, encoding_callback, credits_callback,
307
channel_callback, programme_callback)
309
An alternative interface. Whereas C<parse()> reads the whole document
310
and then returns a finished data structure, with this routine you
311
specify a subroutine to be called as each <channel> element is read
312
and another for each <programme> element.
314
The first argument is the document to parse. The remaining arguments
315
are code references, one for each part of the document.
317
The callback for encoding will be called once with a string giving the
318
encoding. In present releases of this module, it is also possible for
319
the value to be undefined meaning 'unknown', but itE<39>s hoped that
320
future releases will always be able to figure out the encoding used.
322
The callback for credits will be called once with a hash reference.
323
For channels and programmes, the appropriate function will be called
324
zero or more times depending on how many channels / programmes are
327
The four subroutines will be called in order, that is, the encoding
328
and credits will be done before the channel handler is called and all
329
the channels will be dealt with before the first programme handler is
332
If any of the code references is undef, nothing is called for that part
335
For backwards compatibility, if the value for 'encoding callback' is
336
not a code reference but a scalar reference, then the encoding found
337
will be stored in that scalar. Similarly if the 'credits callback'
338
is a scalar reference, the scalar it points to will be set to point
339
to the hash of credits. This style of interface is deprecated: new
340
code should just use four callbacks.
344
my $document = '<tv>...</tv>';
347
sub encoding_cb( $ ) { $encoding = shift }
350
sub credits_cb( $ ) { $credits = shift }
352
# The callback for each channel populates this hash.
354
sub channel_cb( $ ) {
356
$channels{$c->{id}} = $c;
359
# The callback for each programme. We know that channels are
360
# always read before programmes, so the %channels hash will be
363
sub programme_cb( $ ) {
365
print "got programme: $p->{title}->[0]->[0]\n";
366
my $c = $channels{$p->{channel}};
367
print 'channel name is: ', $c->{'display-name'}->[0]->[0], "\n";
371
XMLTV::parse_callback($document, \&encoding_cb, \&credits_cb,
372
\&channel_cb, \&programme_cb);
376
sub new_doc_callback( $$$$ ) {
377
my ($enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
378
t 'creating new XML::Twig';
379
t '\@Channel_Handlers=' . d \@Channel_Handlers;
380
t '\@Programme_Handlers=' . d \@Programme_Handlers;
381
new XML::Twig(StartTagHandlers =>
385
if ($KEEP_ENCODING) {
386
t 'KeepEncoding on, get original encoding';
387
$enc = $t->encoding();
390
t 'assuming UTF-8 encoding';
394
if (defined $enc_cb) {
399
elsif ($_ eq 'SCALAR') {
403
die "callback should be code ref or scalar ref, or undef";
408
if (defined $cred_cb) {
409
my $cred = get_attrs($node);
414
elsif ($_ eq 'SCALAR') {
418
die "callback should be code ref or scalar ref, or undef";
422
# Most of the above code can be removed in the
429
{ '/tv/channel' => sub {
431
die if not defined $node;
432
my $c = node_to_channel($node);
435
warn "skipping bad channel element\n";
442
'/tv/programme' => sub {
444
die if not defined $node;
445
my $p = node_to_programme($node);
448
warn "skipping bad programme element\n";
456
KeepEncoding => $KEEP_ENCODING,
460
sub parse_callback( $$$$$ ) {
461
my ($str, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
463
new_doc_callback($enc_cb, $cred_cb, $ch_cb, $p_cb)->parse($str);
468
=item parsefiles_callback(encoding_callback, credits_callback,
469
channel_callback, programme_callback,
472
As C<parse_callback()> but takes one or more filenames to open,
473
merging their contents in the same manner as C<parsefiles()>. Note
474
that the reading is still gradual - you get the channels and
475
programmes one at a time, as they are read.
477
Note that the same <channel> may be present in more than one file, so
478
the channel callback will get called more than once. ItE<39>s your
479
responsibility to weed out duplicate channel elements (since writing
480
them out again requires that each have a unique id).
482
For compatibility, there is an alias C<parsefile_callback()> which is
483
the same but takes only a single filename, B<before> the callback
484
arguments. This is deprecated.
487
sub parsefile_callback( $$$$$ ) {
488
my ($f, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
489
parsefiles_callback($enc_cb, $cred_cb, $ch_cb, $p_cb, $f);
492
sub parsefiles_callback( $$$$@ ) {
493
my ($enc_cb, $cred_cb, $ch_cb, $p_cb, @files) = @_;
494
die "one or more files required" if not @files;
495
my $all_encoding; my $have_encoding = 0;
499
my $do_next_file; # to be defined below
500
my $my_enc_cb = sub( $ ) {
502
t 'encoding callback';
503
if ($have_encoding) {
504
t 'seen encoding before, just check';
505
my ($da, $de) = (defined $all_encoding, defined $e);
506
if (not $da and not $de) {
507
warn "two files both have unspecified character encodings, hope they're the same\n";
509
elsif (not $da and $de) {
510
warn "encoding $e not being returned to caller\n";
513
elsif ($da and not $de) {
514
warn "input file with unspecified encoding, assuming same as others ($all_encoding)\n";
516
elsif ($da and $de) {
517
if ($all_encoding ne $e) {
518
die "this file's encoding $e differs from others' $all_encoding - aborting\n";
524
t 'not seen encoding before, call user';
525
$enc_cb->($e) if $enc_cb;
531
my $my_cred_cb = sub( $ ) {
533
if (defined $all_credits) {
534
if (Dumper($all_credits) ne Dumper($c)) {
535
warn "different files have different credits, picking one arbitrarily\n";
536
# In fact, we pick the last file in the list since this is the
537
# first to be opened.
542
$cred_cb->($c) if $cred_cb;
547
my $my_ch_cb = sub( $ ) {
550
if (defined $all_channels{$id} and Dumper($all_channels{$id}) ne Dumper($c)) {
551
warn "differing channels with id $id, picking one arbitrarily\n";
554
$all_channels{$id} = $c;
555
$ch_cb->($c) if $ch_cb;
559
my $my_p_cb = sub( $ ) {
560
$do_next_file->(); # if any
561
$p_cb->(@_) if $p_cb;
564
$do_next_file = sub() {
569
# FIXME commonize these augmented warning messages. Weird
570
# stuff (up to and including segfaults) happens if you
571
# call warn() or die() from these handlers.
573
local $SIG{__WARN__} = sub {
575
$msg = "warning: something's wrong" if not defined $msg;
577
print STDERR "$f: $msg\n";
579
local $SIG{__DIE__} = sub {
581
$msg = "warning: something's wrong" if not defined $msg;
583
print STDERR "$f: $msg\n";
587
my $t = new_doc_callback($my_enc_cb, $my_cred_cb, $my_ch_cb, $my_p_cb);
598
=item write_data(data, options...)
600
Takes a data structure and writes it as XML to standard output. Any
601
extra arguments are passed on to XML::WriterE<39>s constructor, for example
603
my $f = new IO::File '>out.xml'; die if not $f;
604
write_data($data, OUTPUT => $f);
606
The encoding used for the output is given by the first element of the
609
Normally, there will be a warning for any Perl data which is not
610
understood and cannot be written as XMLTV, such as strange keys in
611
hashes. But as an exception, any hash key beginning with an
612
underscore will be skipped over silently. You can store 'internal use
615
If a programme or channel hash contains a key beginning with 'debug',
616
this key and its value will be written out as a comment inside the
617
<programme> or <channel> element. This lets you include small
618
debugging messages in the XML output.
621
sub write_data( $;@ ) {
623
my $writer = new XMLTV::Writer(encoding => $data->[0], @_);
624
$writer->start($data->[1]);
625
$writer->write_channels($data->[2]);
626
$writer->write_programme($_) foreach @{$data->[3]};
635
# Given a node, return a hashref of its attributes. Skips over
636
# the 'x-whatever' attributes.
639
my $node = shift; die if not defined $node;
640
my %r = %{$node->atts()};
657
# Given a node containing only text, return that text (with whitespace
658
# either side stripped). If the node has no children (as in
659
# <foo></foo> or <foo />), this is considered to be the empty string.
661
# Parameter: whether newlines are allowed (defaults to false)
663
sub get_text( $;$ ) {
665
my $allow_nl = shift; $allow_nl = 0 if not defined $allow_nl;
666
my @children = get_subelements($node);
667
if (@children == 0) {
670
elsif (@children == 1) {
671
my $v = $children[0]->pcdata();
672
t 'got pcdata: ' . d $v;
673
if (not defined $v) {
674
my $name = get_name($node);
675
warn "node $name expected to contain text has other stuff\n";
678
# Just hope that the encoding we got uses \n...
679
if (not $allow_nl and $v =~ tr/\n//d) {
680
my $name = get_name($node);
681
warn "removing newlines from content of node $name\n";
685
t 'returning: ' . d $v;
688
elsif (@children > 1) {
689
my $name = get_name($node);
690
warn "node $name expected to contain text has more than one child\n";
696
# Private. Clean up parsed text. Takes ref to scalar.
698
our $v; local *v = shift; die if not defined $v;
699
if ($XML::Twig::VERSION < 3.01 || $KEEP_ENCODING) {
700
# Old versions of XML::Twig had stupid behaviour with
701
# entities - and so do the new ones if KeepEncoding is on.
708
s/&/&/g; # needs to be last
712
t 'new XML::Twig, not KeepEncoding, entities already dealt with';
719
# On Windows there seems to be an inconsistency between
720
# XML::Twig and XML::Writer. The former returns text with
721
# \r\n line endings to the application, but the latter adds \r
722
# characters to text outputted. So reading some text and
723
# writing it again accumulates an extra \r character. We fix
724
# this by removing \r from the input here.
734
# Return a list of all subelements of a node. Whitespace is
735
# ignored; anything else that isn't a subelement is warned about.
736
# Skips over elements with name 'x-whatever'.
738
sub get_subelements( $ ) {
739
grep { (my $tmp = get_name($_)) !~ /^x-/ } $_[0]->children();
746
# Return the element name of a node.
748
sub get_name( $ ) { $_[0]->gi() }
754
# Return some information about a node for debugging.
758
# Doesn't seem to be easy way to get 'type' of node.
759
my $r = 'name: ' . get_name($n) . "\n";
760
for (trunc($n->text())) {
761
$r .= "value: $_\n" if defined and length;
765
# Private. Truncate a string to a reasonable length and add '...' if
770
return undef if not defined;
772
return substr($_, 0, 1000) . '...';
779
=item best_name(languages, pairs [, comparator])
781
The XMLTV format contains many places where human-readable text is
782
given an optional 'lang' attribute, to allow mixed languages. This is
783
represented in Perl as a pair [ text, lang ], although the second
784
element may be missing or undef if the language is unknown. When
785
several alernatives for an element (such as <title>) can be given, the
786
representation is a list of [ text, lang ] pairs. Given such a list,
787
what is the best text to use? It depends on the userE<39>s preferred
790
This function takes a list of acceptable languages and a list of [string,
791
language] pairs, and finds the best one to use. This means first finding
792
the appropriate language and then picking the 'best' string in that
795
The best is normally defined as the first one found in a usable
796
language, since the XMLTV format puts the most canonical versions
797
first. But you can pass in your own comparison function, for example
798
if you want to choose the shortest piece of text that is in an
801
The acceptable languages should be a reference to a list of language
802
codes looking like 'ru', or like 'de_DE'. The text pairs should be a
803
reference to a list of pairs [ string, language ]. (As a special case
804
if this list is empty or undef, that means no text is present, and the
805
result is undef.) The third argument if present should be a cmp-style
806
function that compares two strings of text and returns 1 if the first
807
argument is better, -1 if the second better, 0 if theyE<39>re equally
810
Returns: [s, l] pair, where s is the best of the strings to use and l
811
is its language. This pair is 'live' - it is one of those from the
812
list passed in. So you can use C<best_name()> to find the best pair
813
from a list and then modify the content of that pair.
815
(This routine depends on the C<Lingua::Preferred> module being
816
installed; if that module is missing then the first available
817
language is always chosen.)
821
my $langs = [ 'de', 'fr' ]; # German or French, please
823
# Say we found the following under $p->{title} for a programme $p.
824
my $pairs = [ [ 'La CitE des enfants perdus', 'fr' ],
825
[ 'The City of Lost Children', 'en_US' ] ];
827
my $best = best_name($langs, $pairs);
828
print "chose title $best->[0]\n";
831
sub best_name( $$;$ ) {
832
my ($wanted_langs, $pairs, $compare) = @_;
833
t 'best_name() ENTRY';
834
t 'wanted langs: ' . d $wanted_langs;
835
t '[text,lang] pairs: ' . d $pairs;
836
t 'comparison fn: ' . d $compare;
837
return undef if not defined $pairs;
841
my (%seen_lang, $seen_undef);
842
# Collect the list of available languages.
843
foreach (map { $_->[1] } @pairs) {
845
next if $seen_lang{$_}++;
848
next if $seen_undef++;
850
push @avail_langs, $_;
853
my $pref_lang = which_lang($wanted_langs, \@avail_langs);
855
# Gather up [text, lang] pairs which have the desired language.
858
my ($text, $lang) = @$_;
859
next unless ((not defined $lang)
860
or (defined $pref_lang and $lang eq $pref_lang));
861
push @candidates, $_;
864
return undef if not @candidates;
866
# If a comparison function was passed in, use it to compare the
867
# text strings from the candidate pairs.
869
@candidates = sort { $compare->($a->[0], $b->[0]) } @candidates
872
# Pick the first candidate. This will be the one ordered first by
873
# the comparison function if given, otherwise the earliest in the
876
return $candidates[0];
880
=item list_channel_keys(), list_programme_keys()
882
Some users of this module may wish to enquire at runtime about which
883
keys a programme or channel hash can contain. The data in the hash
884
comes from the attributes and subelements of the corresponding element
885
in the XML. The values of attributes are simply stored as strings,
886
while subelements are processed with a handler which may return a
887
complex data structure. These subroutines returns a hash mapping key
888
to handler name and multiplicity. This lets you know what data types
889
can be expected under each key. For keys which come from attributes
890
rather than subelements, the handler is set to 'scalar', just as for
891
subelements which give a simple string. See L<"DATA STRUCTURE"> for
892
details on what the different handler names mean.
894
It is not possible to find out which keys are mandatory and which
895
optional, only a list of all those which might possibly be present.
896
An example use of these routines is the L<tv_grep(1)> program, which
897
creates its allowed command line arguments from the names of programme
902
sub list_keys( $$ ) {
906
foreach (@{shift()}) {
907
my ($k, $mult) = @$_;
908
$r{$k} = [ 'scalar', $mult ];
912
foreach (@{shift()}) {
913
my ($k, $h_name, $mult) = @$_;
914
$r{$k} = [ $h_name, $mult ];
920
sub list_channel_keys() {
921
list_keys(\@Channel_Attributes, \@Channel_Handlers);
923
sub list_programme_keys() {
924
list_keys(\@Programme_Attributes, \@Programme_Handlers);
929
=item catfiles(w_args, filename...)
931
Concatenate several listings files, writing the output to somewhere
932
specified by C<w_args>. Programmes are catenated together, channels
933
are merged, for credits we just take the first and warn if the others
936
The first argument is a hash reference giving information to pass to
937
C<XMLTV::Writer>E<39>s constructor. But do not specify encoding, this
938
will be taken from the input files. Currently C<catfiles()> will fail
939
work if the input files have different encodings.
946
XMLTV::parsefiles_callback
949
$w = new XMLTV::Writer(%$w_args, encoding => shift);
951
sub { $w->start(shift) },
955
if (not defined $seen_ch{$id}) {
956
$w->write_channel($c);
959
elsif (Dumper($seen_ch{$id}) eq Dumper($c)) {
960
# They're identical, okay.
963
warn "channel $id may differ between two files, "
964
. "picking one arbitrarily\n";
967
sub { $w->write_programme(shift) },
976
Concatenate (and merge) listings data. Programmes are catenated
977
together, channels are merged, for credits we just take the first and
978
warn if the others differ (except that the 'date' of the result is the
979
latest date of all the inputs).
981
Whereas C<catfiles()> reads and writes files, this function takes
982
already-parsed listings data and returns some more listings data. It
983
is much more memory-hungry.
986
sub cat( @ ) { cat_aux(1, @_) }
990
=item cat_noprogrammes
992
Like C<cat()> but ignores the programme data and just returns
993
encoding, credits and channels. This is in case for scalability
994
reasons you want to handle programmes individually, but still
995
merge the smaller data.
998
sub cat_noprogrammes( @ ) { cat_aux(0, @_) }
1002
my ($all_credits_nodate, $all_credits_date);
1005
my $do_progs = shift;
1008
t 'doing arg: ' . d $_;
1009
my ($encoding, $credits, $channels, $progs) = @$_;
1011
if (not defined $all_encoding) {
1012
$all_encoding = $encoding;
1014
elsif ($encoding ne $all_encoding) {
1015
die "different files have different encodings, cannot continue\n";
1018
# If the credits are different between files there's not a lot
1019
# we can do to merge them. Apart from 'date', that is. There
1020
# we can say that the date of the concatenated listings is the
1021
# newest date from all the sources.
1023
my %credits_nodate = %$credits; # copy
1024
my $d = delete $credits_nodate{date};
1026
# Need to 'require' rather than 'use' this because
1027
# XMLTV.pm is loaded during the build process and
1028
# XMLTV::Date isn't available then. Urgh.
1030
require XMLTV::Date;
1031
my $dp = XMLTV::Date::parse_date($d);
1032
for ($all_credits_date) {
1034
or Date_Cmp(XMLTV::Date::parse_date($_), $dp) < 0) {
1040
# Now in uniqueness checks ignore the date.
1041
if (not defined $all_credits_nodate) {
1042
$all_credits_nodate = \%credits_nodate;
1044
elsif (Dumper(\%credits_nodate) ne Dumper($all_credits_nodate)) {
1045
warn "different files have different credits, taking from first file\n";
1048
foreach (keys %$channels) {
1049
if (not defined $all_channels{$_}) {
1050
$all_channels{$_} = $channels->{$_};
1052
elsif (Dumper($all_channels{$_}) ne Dumper($channels->{$_})) {
1053
warn "channel $_ differs between two files, taking first appearance\n";
1057
push @all_progs, @$progs if $do_progs;
1060
$all_encoding = 'UTF-8' if not defined $all_encoding;
1063
%all_credits = %$all_credits_nodate
1064
if defined $all_credits_nodate;
1065
$all_credits{date} = $all_credits_date
1066
if defined $all_credits_date;
1069
return [ $all_encoding, \%all_credits, \%all_channels, \@all_progs ];
1072
return [ $all_encoding, \%all_credits, \%all_channels ];
1077
# For each subelement of programme, we define a subroutine to read it
1078
# and one to write it. The reader takes an node for a single
1079
# subelement and returns its value as a Perl scalar (warning and
1080
# returning undef if error). The writer takes an XML::Writer, an
1081
# element name and a scalar value and writes a subelement for that
1082
# value. Note that the element name is passed in to the writer just
1083
# for symmetry, so that neither the writer or the reader have to know
1084
# what their element is called.
1090
=head1 DATA STRUCTURE
1092
For completeness, we describe more precisely how channels and
1093
programmes are represented in Perl. Each element of the channels list
1094
is a hashref corresponding to one <channel> element, and likewise for
1095
programmes. The possible keys of a channel (programme) hash are the
1096
names of attributes or subelements of <channel> (<programme>).
1098
The values for attributes are not processed in any way; an attribute
1099
C<fred="jim"> in the XML will become a hash element with key C<'fred'>,
1102
But for subelements, there is further processing needed to turn the
1103
XML content of a subelement into Perl data. What is done depends on
1104
what type of data is stored under that subelement. Also, if a certain
1105
element can appear several times then the hash key for that element
1106
points to a list of values rather than just one.
1108
The conversion of a subelementE<39>s content to and from Perl data is
1109
done by a handler. The most common handler is I<with-lang>, used for
1110
human-readable text content plus an optional 'lang' attribute. There
1111
are other handlers for other data structures in the file format.
1112
Often two subelements will share the same handler, since they hold the
1113
same type of data. The handlers defined are as follows; note that
1114
many of them will silently strip leading and trailing whitespace in
1115
element content. Look at the DTD itself for an explanation of the
1118
Unless specified otherwise, it is not allowed for an element expected
1119
to contain text to have empty content, nor for the text to contain
1126
Turns a list of credits (for director, actor, writer, etc.) into a
1127
hash mapping 'role' to a list of names. The names in each role are
1128
kept in the same order.
1131
$Handlers{credits}->[0] = sub( $ ) {
1133
my @roles = qw(director actor writer adapter producer presenter
1135
my %known_role; ++$known_role{$_} foreach @roles;
1137
foreach (get_subelements($node)) {
1138
my $role = get_name($_);
1139
unless ($known_role{$role}++) {
1140
warn "unknown thing in credits: $role";
1143
push @{$r{$role}}, get_text($_);
1147
$Handlers{credits}->[1] = sub( $$$ ) {
1148
my ($w, $e, $v) = @_; die if not defined $v;
1150
return if not %h; # don't write empty element
1151
t 'writing credits: ' . d \%h;
1152
# TODO some 'do nothing' setting in XML::Writer to replace this
1153
# convention of passing undef.
1155
$w->startTag($e) if $w;
1156
foreach (qw[director actor writer adapter producer presenter
1157
commentator guest] ) {
1158
next unless defined $h{$_};
1159
my @people = @{delete $h{$_}};
1160
foreach my $person (@people) {
1161
die if not defined $person;
1162
$w->dataElement($_, $person) if $w;
1165
warn_unknown_keys($e, \%h);
1166
$w->endTag($e) if $w;
1173
Reads and writes a simple string as the content of the XML element.
1176
$Handlers{scalar}->[0] = sub( $ ) {
1178
return get_text($node);
1180
$Handlers{scalar}->[1] = sub( $$$ ) {
1181
my ($w, $e, $v) = @_;
1183
$w->dataElement($e, $v) if $w;
1190
Converts the content of a <length> element into a number of seconds
1191
(so <length units="minutes">5</minutes> would be returned as 300). On
1192
writing out again tries to convert a number of seconds to a time in
1193
minutes or hours if that would look better.
1196
$Handlers{length}->[0] = sub( $ ) {
1197
my $node = shift; die if not defined $node;
1198
my %attrs = %{get_attrs($node)};
1199
my $d = get_text($node);
1200
if ($d =~ /^\s*$/) {
1201
warn "empty 'length' element";
1204
if ($d !~ tr/0-9// or $d =~ tr/0-9//c) {
1205
warn "bad content of 'length' element: $d";
1208
my $units = $attrs{units};
1209
if (not defined $units) {
1210
warn "missing 'units' attr in 'length' element";
1213
# We want to return a length in seconds.
1214
if ($units eq 'seconds') {
1217
elsif ($units eq 'minutes') {
1220
elsif ($units eq 'hours') {
1224
warn "bad value of 'units': $units";
1229
$Handlers{length}->[1] = sub( $$$ ) {
1230
my ($w, $e, $v) = @_;
1233
if ($v % 3600 == 0) {
1237
elsif ($v % 60 == 0) {
1244
$w->dataElement($e, $v, units => $units) if $w;
1249
=item I<episode-num>
1251
The representation in Perl of XMLTVE<39>s odd episode numbers is as a
1252
pair of [ content, system ]. As specified by the DTD, if the system is
1253
not given in the file then 'onscreen' is assumed. Whitespace in the
1254
'xmltv_ns' system is unimportant, so on reading it is normalized to
1255
a single space on either side of each dot.
1258
$Handlers{'episode-num'}->[0] = sub( $ ) {
1259
my $node = shift; die if not defined $node;
1260
my %attrs = %{get_attrs($node)};
1261
my $system = $attrs{system};
1262
$system = 'onscreen' if not defined $system;
1263
my $content = get_text($node);
1264
if ($system eq 'xmltv_ns') {
1265
# Make it look nice.
1266
$content =~ s/\s+//g;
1267
$content =~ s/\./ . /g;
1269
return [ $content, $system ];
1271
$Handlers{'episode-num'}->[1] = sub( $$$ ) {
1272
my ($w, $e, $v) = @_;
1274
if (not ref $v or ref $v ne 'ARRAY') {
1275
warn "not writing episode-num whose content is not an array";
1278
my ($content, $system) = @$v;
1279
$system = 'onscreen' if not defined $system;
1280
$w->dataElement($e, $content, system => $system) if $w;
1287
The <video> section is converted to a hash. The <present> subelement
1288
corresponds to the key 'present' of this hash, 'yes' and 'no' are
1289
converted to Booleans. The same applies to <colour>. The content of
1290
the <aspect> subelement is stored under the key 'aspect'. These keys
1291
can be missing in the hash just as the subelements can be missing in
1295
$Handlers{video}->[0] = sub ( $ ) {
1298
foreach (get_subelements($node)) {
1299
my $name = get_name($_);
1300
my $value = get_text($_);
1301
if ($name eq 'present') {
1302
warn "'present' seen twice" if defined $r{present};
1303
$r{present} = decode_boolean($value);
1305
elsif ($name eq 'colour') {
1306
warn "'colour' seen twice" if defined $r{colour};
1307
$r{colour} = decode_boolean($value);
1309
elsif ($name eq 'aspect') {
1310
warn "'aspect' seen twice" if defined $r{aspect};
1311
$value =~ /^\d+:\d+$/ or warn "bad aspect ratio: $value";
1312
$r{aspect} = $value;
1317
$Handlers{video}->[1] = sub( $$$ ) {
1318
my ($w, $e, $v) = @_;
1319
t "'video' element";
1321
return if not %h; # don't write empty element
1322
$w->startTag($e) if $w;
1323
if (defined (my $val = delete $h{present})) {
1324
$w->dataElement('present', encode_boolean($val)) if $w;
1326
if (defined (my $val = delete $h{colour})) {
1327
$w->dataElement('colour', encode_boolean($val)) if $w;
1329
if (defined (my $val = delete $h{aspect})) {
1330
$w->dataElement('aspect', $val) if $w;
1332
if (defined (my $val = delete $h{quality})) {
1333
$w->dataElement('quality', $val) if $w;
1335
warn_unknown_keys("zz $e", \%h);
1336
$w->endTag($e) if $w;
1343
This is similar to I<video>. <present> is a Boolean value, while
1344
the content of <stereo> is stored unchanged.
1347
$Handlers{audio}->[0] = sub( $ ) {
1350
foreach (get_subelements($node)) {
1351
my $name = get_name($_);
1352
my $value = get_text($_);
1353
if ($name eq 'present') {
1354
warn "'present' seen twice" if defined $r{present};
1355
$r{present} = decode_boolean($value);
1357
elsif ($name eq 'stereo') {
1358
warn "'stereo' seen twice" if defined $r{stereo};
1360
warn "empty 'stereo' element not permitted, should be <stereo>stereo</stereo>";
1363
warn "bad value for 'stereo': '$value'"
1364
if ($value ne 'mono'
1365
and $value ne 'stereo'
1366
and $value ne 'surround'
1367
and $value ne 'dolby digital'
1368
and $value ne 'dolby');
1369
$r{stereo} = $value;
1374
$Handlers{audio}->[1] = sub( $$$ ) {
1375
my ($w, $e, $v) = @_;
1377
return if not %h; # don't write empty element
1378
$w->startTag($e) if $w;
1379
if (defined (my $val = delete $h{present})) {
1380
$w->dataElement('present', encode_boolean($val)) if $w;
1382
if (defined (my $val = delete $h{stereo})) {
1383
$w->dataElement('stereo', $val) if $w;
1385
warn_unknown_keys($e, \%h);
1386
$w->endTag($e) if $w;
1391
=item I<previously-shown>
1393
The 'start' and 'channel' attributes are converted to keys in a hash.
1396
$Handlers{'previously-shown'}->[0] = sub( $ ) {
1397
my $node = shift; die if not defined $node;
1398
my %attrs = %{get_attrs($node)};
1400
foreach (qw(start channel)) {
1401
my $v = delete $attrs{$_};
1402
$r->{$_} = $v if defined $v;
1404
foreach (keys %attrs) {
1405
warn "unknown attribute $_ in previously-shown";
1409
$Handlers{'previously-shown'}->[1] = sub( $$$ ) {
1410
my ($w, $e, $v) = @_;
1411
$w->emptyTag($e, %$v) if $w;
1418
The content of the element is ignored: it signfies something by its
1419
very presence. So the conversion from XML to Perl is a constant true
1420
value whenever the element is found; the conversion from Perl to XML
1421
is to write out the element if true, donE<39>t write anything if false.
1424
$Handlers{presence}->[0] = sub( $ ) {
1426
# The 'new' element is empty, it signifies newness by its very
1431
$Handlers{presence}->[1] = sub( $$$ ) {
1432
my ($w, $e, $v) = @_;
1434
# Not new, so don't create an element.
1437
$w->emptyTag($e) if $w;
1445
The 'type' attribute and the 'language' subelement (both optional)
1446
become keys in a hash. But see I<language> for what to pass as the
1447
value of that element.
1450
$Handlers{subtitles}->[0] = sub( $ ) {
1451
my $node = shift; die if not defined $node;
1452
my %attrs = %{get_attrs($node)};
1454
$r{type} = $attrs{type} if defined $attrs{type};
1455
foreach (get_subelements($node)) {
1456
my $name = get_name($_);
1457
if ($name eq 'language') {
1458
warn "'language' seen twice" if defined $r{language};
1459
$r{language} = read_with_lang($_, 0, 0);
1462
warn "bad content of 'subtitles' element: $name";
1467
$Handlers{subtitles}->[1] = sub( $$$ ) {
1468
my ($w, $e, $v) = @_;
1470
my ($type, $language) = ($v->{type}, $v->{language});
1471
my %attrs; $attrs{type} = $type if defined $type;
1472
if (defined $language) {
1473
$w->startTag($e, %attrs) if $w;
1474
write_with_lang($w, 'language', $language, 0, 0);
1475
$w->endTag($e) if $w;
1478
$w->emptyTag($e, %attrs) if $w;
1486
The rating is represented as a tuple of [ rating, system, icons ].
1487
The last element is itself a listref of structures returned by the
1491
$Handlers{rating}->[0] = sub( $ ) {
1492
my $node = shift; die if not defined $node;
1493
my %attrs = %{get_attrs($node)};
1494
my $system = delete $attrs{system} if exists $attrs{system};
1495
foreach (keys %attrs) {
1496
warn "unknown attribute in rating: $_";
1498
my @children = get_subelements($node);
1500
# First child node is value.
1501
my $value_node = shift @children;
1502
if (not defined $value_node) {
1503
warn "missing 'value' element inside rating";
1506
if ((my $name = get_name($value_node)) ne 'value') {
1507
warn "expected 'value' node inside rating, got '$name'";
1511
my $rating = read_value($value_node);
1513
# Remaining children are icons.
1514
my @icons = map { read_icon($_) } @children;
1516
return [ $rating, $system, \@icons ];
1518
$Handlers{rating}->[1] = sub( $$$ ) {
1519
my ($w, $e, $v) = @_;
1520
if (not ref $v or ref $v ne 'ARRAY') {
1521
warn "not writing rating whose content is not an array";
1524
my ($rating, $system, $icons) = @$v;
1525
if (defined $system) {
1526
$w->startTag($e, system => $system) if $w;
1529
$w->startTag($e) if $w;
1532
write_value($w, 'value', $rating) if $w;
1533
if ($w) { write_icon($w, 'icon', $_) foreach @$icons };
1534
$w->endTag($e) if $w;
1539
=item I<star-rating>
1541
In XML this is a string 'X/Y' plus a list of icons. In Perl represented
1542
as a pair [ rating, icons ] similar to I<rating>.
1544
Multiple star ratings are now supported. For backward compatability,
1545
you may specify a single [rating,icon] or the preferred double array
1546
[[rating,system,icon],[rating2,system2,icon2]] (like 'ratings')
1550
$Handlers{'star-rating'}->[0] = sub( $ ) {
1552
my %attrs = %{get_attrs($node)};
1553
my $system = delete $attrs{system} if exists $attrs{system};
1554
my @children = get_subelements($node);
1556
# First child node is value.
1557
my $value_node = shift @children;
1558
if (not defined $value_node) {
1559
warn "missing 'value' element inside star-rating";
1562
if ((my $name = get_name($value_node)) ne 'value') {
1563
warn "expected 'value' node inside star-rating, got '$name'";
1566
my $rating = read_value($value_node);
1568
# Remaining children are icons.
1569
my @icons = map { read_icon($_) } @children;
1571
return [ $rating, $system, \@icons ];
1573
$Handlers{'star-rating'}->[1] = sub ( $$$ ) {
1574
my ($w, $e, $v) = @_;
1576
# 10/31/2007 star-rating can now have multiple values (and system=)
1577
# let's make it so old code still works!
1579
if (not ref $v or ref $v ne 'ARRAY') {
1581
# warn "not writing star-rating whose content is not an array";
1584
my ($rating, $system, $icons) = @$v;
1585
if (defined $system) {
1586
$w->startTag($e, system => $system) if $w;
1589
$w->startTag($e) if $w;
1591
write_value($w, 'value', $rating) if $w;
1592
if ($w) { write_icon($w, 'icon', $_) foreach @$icons };
1593
$w->endTag($e) if $w;
1600
An icon in XMLTV files is like the <img> element in HTML. It is
1601
represented in Perl as a hashref with 'src' and optionally 'width'
1605
sub write_icon( $$$ ) {
1606
my ($w, $e, $v) = @_;
1607
croak "no 'src' attribute for icon\n" if not defined $v->{src};
1608
croak "bad width $v->{width} for icon\n"
1609
if defined $v->{width} and $v->{width} !~ /^\d+$/;
1610
croak "bad height $v->{height} for icon\n"
1611
if defined $v->{height} and $v->{height} !~ /^\d+$/;
1613
foreach (keys %$v) {
1614
warn "unrecognized key in icon: $_\n"
1615
if $_ ne 'src' and $_ ne 'width' and $_ ne 'height';
1618
$w->emptyTag($e, %$v);
1620
sub read_icon( $ ) {
1621
my $node = shift; die if not defined $node;
1622
my %attrs = %{get_attrs($node)};
1623
warn "missing 'src' attribute in icon" if not defined $attrs{src};
1626
$Handlers{icon}->[0] = \&read_icon;
1627
$Handlers{icon}->[1] = sub( $$$ ) {
1628
my ($w, $e, $v) = @_;
1629
write_icon($w, $e, $v) if $w;
1632
# To keep things tidy some elements that can have icons store their
1633
# textual content inside a subelement called 'value'. These two
1634
# routines are a bit trivial but they're here for consistency.
1636
sub read_value( $ ) {
1637
my $value_node = shift;
1638
my $v = get_text($value_node);
1639
if (not defined $v or $v eq '') {
1640
warn "no content of 'value' element";
1645
sub write_value( $$$ ) {
1646
my ($w, $e, $v) = @_;
1647
$w->dataElement($e, $v) if $w;
1651
# Booleans in XMLTV files are 'yes' or 'no'.
1652
sub decode_boolean( $ ) {
1654
if ($value eq 'no') {
1657
elsif ($value eq 'yes') {
1661
warn "bad boolean: $value";
1665
sub encode_boolean( $ ) {
1667
warn "expected a Perl boolean like 0 or 1, not '$v'\n"
1669
return $v ? 'yes' : 'no';
1677
In XML something like title can be either <title>Foo</title>
1678
or <title lang="en">Foo</title>. In Perl these are stored as
1679
[ 'Foo' ] and [ 'Foo', 'en' ]. For the former [ 'Foo', undef ]
1682
This handler also has two modifiers which may be added to the name
1683
after '/'. I</e> means that empty text is allowed, and will be
1684
returned as the empty tuple [], to mean that the element is present
1685
but has no text. When writing with I</e>, undef will also be
1686
understood as present-but-empty. You cannot however specify a
1687
language if the text is empty.
1689
The modifier I</m> means that the text is allowed to span multiple
1692
So for example I<with-lang/em> is a handler for text with language,
1693
where the text may be empty and may contain newlines. Note that the
1694
I<with-lang-or-empty> of earlier releases has been replaced by
1698
sub read_with_lang( $$$ ) {
1699
my ($node, $allow_empty, $allow_nl) = @_;
1700
die if not defined $node;
1701
my %attrs = %{get_attrs($node)};
1702
my $lang = $attrs{lang} if exists $attrs{lang};
1703
my $value = get_text($node, $allow_nl);
1704
if (not length $value) {
1705
if (not $allow_empty) {
1706
warn 'empty string for with-lang value';
1709
warn 'empty string may not have language' if defined $lang;
1712
if (defined $lang) {
1713
return [ $value, $lang ];
1719
$Handlers{'with-lang'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 0) };
1720
$Handlers{'with-lang/'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 0) };
1721
$Handlers{'with-lang/e'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 0) };
1722
$Handlers{'with-lang/m'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 1) };
1723
$Handlers{'with-lang/em'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) };
1724
$Handlers{'with-lang/me'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) };
1726
sub write_with_lang( $$$$$ ) {
1727
my ($w, $e, $v, $allow_empty, $allow_nl) = @_;
1728
if (not ref $v or ref $v ne 'ARRAY') {
1729
warn "not writing with-lang whose content is not an array";
1734
if (not $allow_empty) {
1735
warn "not writing no content for $e";
1741
my ($text, $lang) = @$v;
1742
t 'writing character data: ' . d $text;
1743
if (not defined $text) {
1744
warn "not writing undefined value for $e";
1749
# strip whitespace silently.
1750
# we used to use a warn, but later on the code catches this and drops the record
1752
my $old_text = $text;
1756
if (not length $text) {
1757
if (not $allow_empty) {
1758
warn "not writing empty content for $e";
1761
if (defined $lang) {
1762
warn "not writing empty content with language for $e";
1765
$w->emptyTag($e) if $w;
1769
if (not $allow_nl and $text =~ tr/\n//) {
1770
warn "not writing text containing newlines for $e";
1774
if (defined $lang) {
1775
$w->dataElement($e, $text, lang => $lang) if $w;
1778
$w->dataElement($e, $text) if $w;
1781
$Handlers{'with-lang'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) };
1782
$Handlers{'with-lang/'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) };
1783
$Handlers{'with-lang/e'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 0) };
1784
$Handlers{'with-lang/m'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 1) };
1785
$Handlers{'with-lang/em'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) };
1786
$Handlers{'with-lang/me'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) };
1789
foreach (keys %Handlers) {
1790
my $v = $Handlers{$_};
1792
or ref($v->[0]) ne 'CODE'
1793
or ref($v->[1]) ne 'CODE') {
1794
die "bad handler pair for $_\n";
1802
Now, which handlers are used for which subelements (keys) of channels
1803
and programmes? And what is the multiplicity (should you expect a
1804
single value or a list of values)?
1806
The following tables map subelements of <channel> and of <programme>
1807
to the handlers used to read and write them. Many elements have their
1808
own handler with the same name, and most of the others use
1809
I<with-lang>. The third column specifies the multiplicity of the
1810
element: B<*> (any number) will give a list of values in Perl, B<+>
1811
(one or more) will give a nonempty list, B<?> (maybe one) will give a
1812
scalar, and B<1> (exactly one) will give a scalar which is not undef.
1814
=head2 Handlers for <channel>
1819
=item display-name, I<with-lang>, B<+>
1821
=item icon, I<icon>, B<*>
1823
=item url, I<scalar>, B<*>
1828
=head2 Handlers for <programme>
1833
=item title, I<with-lang>, B<+>
1835
=item sub-title, I<with-lang>, B<*>
1837
=item desc, I<with-lang/m>, B<*>
1839
=item credits, I<credits>, B<?>
1841
=item date, I<scalar>, B<?>
1843
=item category, I<with-lang>, B<*>
1845
=item language, I<with-lang>, B<?>
1847
=item orig-language, I<with-lang>, B<?>
1849
=item length, I<length>, B<?>
1851
=item icon, I<icon>, B<*>
1853
=item url, I<scalar>, B<*>
1855
=item country, I<with-lang>, B<*>
1857
=item episode-num, I<episode-num>, B<*>
1859
=item video, I<video>, B<?>
1861
=item audio, I<audio>, B<?>
1863
=item previously-shown, I<previously-shown>, B<?>
1865
=item premiere, I<with-lang/em>, B<?>
1867
=item last-chance, I<with-lang/em>, B<?>
1869
=item new, I<presence>, B<?>
1871
=item subtitles, I<subtitles>, B<*>
1873
=item rating, I<rating>, B<*>
1875
=item star-rating, I<star-rating>, B<*>
1880
At present, no parsing or validation on dates is done because dates
1881
may be partially specified in XMLTV. For example '2001' means that
1882
the year is known but not the month, day or time of day. Maybe in the
1883
future dates will be automatically converted to and from
1884
B<Date::Manip> objects. For now they just use the I<scalar> handler.
1885
Similar remarks apply to URLs.
1889
sub node_to_programme( $ ) {
1890
my $node = shift; die if not defined $node;
1893
# Attributes of programme element.
1894
%programme = %{get_attrs($node)};
1895
t 'attributes: ' . d \%programme;
1897
# Check the required attributes are there. As with most checking,
1898
# this isn't an alternative to using a validator but it does save
1899
# some headscratching during debugging.
1901
foreach (qw(start channel)) {
1902
if (not defined $programme{$_}) {
1903
warn "programme missing '$_' attribute\n";
1907
my @known_attrs = map { $_->[0] } @Programme_Attributes;
1908
my %ka; ++$ka{$_} foreach @known_attrs;
1909
foreach (keys %programme) {
1911
warn "deleting unknown attribute '$_'";
1912
delete $programme{$_};
1916
call_handlers_read($node, \@Programme_Handlers, \%programme);
1922
sub node_to_channel( $ ) {
1923
my $node = shift; die if not defined $node;
1925
t 'node_to_channel() ENTRY';
1927
%channel = %{get_attrs($node)};
1928
t 'attributes: ' . d \%channel;
1929
if (not defined $channel{id}) {
1930
warn "channel missing 'id' attribute\n";
1932
foreach (keys %channel) {
1933
unless (/^_/ or $_ eq 'id') {
1934
warn "deleting unknown attribute '$_'";
1935
delete $channel{$_};
1939
t '\@Channel_Handlers=' . d \@Channel_Handlers;
1940
call_handlers_read($node, \@Channel_Handlers, \%channel);
1948
# call_handlers_read()
1950
# Read the subelements of a node according to a list giving a
1951
# handler subroutine for each subelement.
1955
# Reference to list of handlers: tuples of
1956
# [element-name, handler-name, multiplicity]
1957
# Reference to hash for storing results
1959
# Warns if errors, but attempts to contine.
1961
sub call_handlers_read( $$$ ) {
1962
my ($node, $handlers, $r) = @_;
1963
t 'call_handlers_read() using handlers: ' . d $handlers;
1965
die unless ref($r) eq 'HASH';
1966
our %r; local *r = $r;
1967
t 'going through each child of node';
1969
# Current position in handlers. We expect to read the subelements
1970
# in the correct order as specified by the DTD.
1972
my $handler_pos = 0;
1974
SUBELEMENT: foreach (get_subelements($node)) {
1975
t 'doing subelement';
1976
my $name = get_name($_);
1977
t "tag name: $name";
1979
# Search for a handler - from $handler_pos onwards. But
1980
# first, just warn if somebody is trying to use an element in
1981
# the wrong place (trying to go backwards in the list).
1984
foreach my $i (0 .. $handler_pos - 1) {
1985
if ($name eq $handlers->[$i]->[0]) {
1986
warn "element $name not expected here";
1990
for (my $i = $handler_pos; $i < @$handlers; $i++) {
1991
if ($handlers->[$i]->[0] eq $name) {
1997
t "doesn't match name $handlers->[$i]->[0]";
1998
my ($handler_name, $h, $multiplicity)
1999
= @{$handlers->[$i]};
2000
die if not defined $handler_name;
2001
die if $handler_name eq '';
2003
# Before we skip over this element, check that we got
2004
# the necessary values for it.
2006
if ($multiplicity eq '?') {
2007
# Don't need to check whether this set.
2009
elsif ($multiplicity eq '1') {
2010
if (not defined $r{$handler_name}) {
2011
warn "no element $handler_name found";
2014
elsif ($multiplicity eq '*') {
2015
# It's okay if nothing was ever set. We don't
2016
# insist on putting in an empty list.
2019
elsif ($multiplicity eq '+') {
2020
if (not defined $r{$handler_name}) {
2021
warn "no element $handler_name found";
2023
elsif (not @{$r{$handler_name}}) {
2024
warn "strangely, empty list for $handler_name";
2028
warn "bad value of $multiplicity: $!";
2032
if (not defined $found_pos) {
2033
warn "unknown element $name";
2036
# Next time we begin searching from this position.
2037
$handler_pos = $found_pos;
2040
t 'calling handler';
2041
my ($handler_name, $h_name, $multiplicity)
2042
= @{$handlers->[$found_pos]};
2043
die if $handler_name ne $name;
2044
my $h = $Handlers{$h_name}; die "no handler $h_name" if not $h;
2045
my $result = $h->[0]->($_); # call reader sub
2046
t 'result: ' . d $result;
2047
warn("skipping bad $name\n"), next if not defined $result;
2049
# Now set the value. We can't do multiplicity checking yet
2050
# because there might be more elements of this type still to
2053
if ($multiplicity eq '?' or $multiplicity eq '1') {
2054
warn "seen $name twice"
2055
if defined $r{$name};
2056
$r{$name} = $result;
2058
elsif ($multiplicity eq '*' or $multiplicity eq '+') {
2059
push @{$r{$name}}, $result;
2062
warn "bad multiplicity: $multiplicity";
2067
sub warn_unknown_keys( $$ ) {
2068
my $elem_name = shift;
2069
our %k; local *k = shift;
2072
or $warned_unknown_key{$elem_name}->{$_}++
2073
or warn "unknown key $_ in $elem_name hash\n";
2077
package XMLTV::Writer;
2078
use base 'XML::Writer';
2081
use Date::Manip qw/UnixDate DateCalc/;
2083
# Use Log::TraceMessages if installed.
2085
eval { require Log::TraceMessages };
2091
*t = \&Log::TraceMessages::t;
2092
*d = \&Log::TraceMessages::d;
2097
Date::Manip::Date_Init("TZ=UTC");
2100
# Override dataElement() to refuse writing empty or whitespace
2103
sub dataElement( $$$@ ) {
2104
my ($self, $elem, $content, @rest) = @_;
2105
if ($content !~ /\S/) {
2106
warn "not writing empty content for $elem";
2109
return $self->SUPER::dataElement($elem, $content, @rest);
2116
When reading a file you have the choice of using C<parse()> to gulp
2117
the whole file and return a data structure, or using
2118
C<parse_callback()> to get the programmes one at a time, although
2119
channels and other data are still read all at once.
2121
There is a similar choice when writing data: the C<write_data()>
2122
routine prints a whole XMLTV document at once, but if you want to
2123
write an XMLTV document incrementally you can manually create an
2124
C<XMLTV::Writer> object and call methods on it. Synopsis:
2127
my $w = new XMLTV::Writer();
2128
$w->comment("Hello from XML::Writer's comment() method");
2129
$w->start({ 'generator-info-name' => 'Example code in pod' });
2130
my %ch = (id => 'test-channel', 'display-name' => [ [ 'Test', 'en' ] ]);
2131
$w->write_channel(\%ch);
2132
my %prog = (channel => 'test-channel', start => '200203161500',
2133
title => [ [ 'News', 'en' ] ]);
2134
$w->write_programme(\%prog);
2137
XMLTV::Writer inherits from XML::Writer, and provides the following extra
2138
or overridden methods:
2142
=item new(), the constructor
2144
Creates an XMLTV::Writer object and starts writing an XMLTV file, printing
2145
the DOCTYPE line. Arguments are passed on to XML::WriterE<39>s constructor,
2146
except for the following:
2148
the 'encoding' key if present gives the XML character encoding.
2151
my $w = new XMLTV::Writer(encoding => 'ISO-8859-1');
2153
If encoding is not specified, XML::WriterE<39>s default is used
2156
XMLTW::Writer can also filter out specific days from the data. This is
2157
useful if the datasource provides data for periods of time that does not
2158
match the days that the user has asked for. The filtering is controlled
2159
with the days, offset and cutoff arguments:
2161
my $w = new XMLTV::Writer(
2164
cutoff => "050000" );
2166
In this example, XMLTV::Writer will discard all entries that do not have
2167
starttimes larger than or equal to 05:00 tomorrow and less than 05:00
2168
two days after tomorrow. The time offset is stripped off the starttime before
2169
the comparison is made.
2174
my $class = ref($proto) || $proto;
2176
croak 'OUTPUT requires a filehandle, not a filename or anything else'
2177
if exists $args{OUTPUT} and not ref $args{OUTPUT};
2178
my $encoding = delete $args{encoding};
2179
my $days = delete $args{days};
2180
my $offset = delete $args{offset};
2181
my $cutoff = delete $args{cutoff};
2183
my $self = $class->SUPER::new(DATA_MODE => 1, DATA_INDENT => 2, %args);
2184
bless($self, $class);
2186
if (defined $encoding) {
2187
$self->xmlDecl($encoding);
2190
# XML::Writer puts in 'encoding="UTF-8"' even if you don't ask
2193
warn "assuming default UTF-8 encoding for output\n";
2197
# $Log::TraceMessages::On = 1;
2198
$self->{mintime} = "19700101000000";
2199
$self->{maxtime} = "29991231235959";
2202
if (defined( $days ) and defined( $offset ) and defined( $cutoff )) {
2203
$self->{mintime} = UnixDate(
2204
DateCalc( "today", "+" . $offset . " days" ),
2205
"%Y%m%d") . $cutoff;
2206
t "using mintime $self->{mintime}";
2208
$self->{maxtime} = UnixDate(
2209
DateCalc("today", "+" . $offset+$days . " days"),
2210
"%Y%m%d" ) . $cutoff;
2211
t "using maxtime $self->{maxtime}";
2213
elsif (defined( $days ) or defined( $offset ) or defined($cutoff)) {
2214
croak 'You must specify days, offset and cutoff or none of them';
2218
local $^W = 0; $self->doctype('tv', undef, 'xmltv.dtd');
2220
$self->{xmltv_writer_state} = 'new';
2228
Write the start of the <tv> element. Parameter is a hashref which gives
2229
the attributes of this element.
2234
die 'usage: XMLTV::Writer->start(hashref of attrs)' if @_ != 1;
2237
for ($self->{xmltv_writer_state}) {
2241
elsif ($_ eq 'channels' or $_ eq 'programmes') {
2242
croak 'cannot call start() more than once on XMLTV::Writer';
2244
elsif ($_ eq 'end') {
2245
croak 'cannot do anything with end()ed XMLTV::Writer';
2251
$self->startTag('tv', order_attrs(%{$attrs}));
2256
=item write_channels()
2258
Write several channels at once. Parameter is a reference to a hash
2259
mapping channel id to channel details. They will be written sorted
2260
by id, which is reasonable since the order of channels in an XMLTV
2261
file isnE<39>t significant.
2264
sub write_channels {
2265
my ($self, $channels) = @_;
2266
t('write_channels(' . d($self) . ', ' . d($channels) . ') ENTRY');
2267
croak 'expected hashref of channels' if ref $channels ne 'HASH';
2269
for ($self->{xmltv_writer_state}) {
2271
croak 'must call start() on XMLTV::Writer first';
2273
elsif ($_ eq 'channels') {
2276
elsif ($_ eq 'programmes') {
2277
croak 'cannot write channels after writing programmes';
2279
elsif ($_ eq 'end') {
2280
croak 'cannot do anything with end()ed XMLTV::Writer';
2285
my @ids = sort keys %$channels;
2286
t 'sorted list of channel ids: ' . d \@ids;
2288
t "writing channel with id $_";
2289
my $ch = $channels->{$_};
2290
$self->write_channel($ch);
2292
t('write_channels() EXIT');
2297
=item write_channel()
2299
Write a single channel. You can call this routine if you want, but
2300
most of the time C<write_channels()> is a better interface.
2304
my ($self, $ch) = @_;
2305
croak 'undef channel hash passed' if not defined $ch;
2306
croak "expected a hashref, got: $ch" if ref $ch ne 'HASH';
2308
for ($self->{xmltv_writer_state}) {
2310
croak 'must call start() on XMLTV::Writer first';
2312
elsif ($_ eq 'channels') {
2315
elsif ($_ eq 'programmes') {
2316
croak 'cannot write channels after writing programmes';
2318
elsif ($_ eq 'end') {
2319
croak 'cannot do anything with end()ed XMLTV::Writer';
2324
my %ch = %$ch; # make a copy
2325
my $id = delete $ch{id};
2326
die "no 'id' attribute in channel" if not defined $id;
2327
write_element_with_handlers($self, 'channel', { id => $id },
2328
\@XMLTV::Channel_Handlers, \%ch);
2333
=item write_programme()
2335
Write details for a single programme as XML.
2338
sub write_programme {
2340
die 'usage: XMLTV::Writer->write_programme(programme hash)' if @_ != 1;
2342
croak 'write_programme() expects programme hashref'
2343
if ref $ref ne 'HASH';
2344
t('write_programme(' . d($self) . ', ' . d($ref) . ') ENTRY');
2346
for ($self->{xmltv_writer_state}) {
2348
croak 'must call start() on XMLTV::Writer first';
2350
elsif ($_ eq 'channels') {
2353
elsif ($_ eq 'programmes') {
2356
elsif ($_ eq 'end') {
2357
croak 'cannot do anything with end()ed XMLTV::Writer';
2362
# We make a copy of the programme hash and delete elements from it
2363
# as they are dealt with; then we can easily spot any unhandled
2364
# elements at the end.
2368
# First deal with those hash keys that refer to metadata on when
2369
# the programme is broadcast. After taking those out of the hash,
2370
# we can use the handlers to output individual details.
2373
die if not @XMLTV::Programme_Attributes;
2374
foreach (@XMLTV::Programme_Attributes) {
2375
my ($name, $mult) = @$_;
2376
t "looking for key $name";
2377
my $val = delete $p{$name};
2379
# No need to check anything.
2381
elsif ($mult eq '1') {
2382
if (not defined $val) {
2383
warn "programme hash missing $name key, skipping";
2387
else { die "bad multiplicity for attribute: $mult" }
2388
$attrs{$name} = $val if defined $val;
2391
# We use string comparisons without timeoffsets for comparing times.
2392
my( $start ) = split( /\s+/, $attrs{start} );
2393
if( $start lt $self->{mintime} or
2394
$start ge $self->{maxtime} ) {
2395
t "skipping programme with start $attrs{start}";
2399
t "beginning 'programme' element";
2400
write_element_with_handlers($self, 'programme', \%attrs,
2401
\@XMLTV::Programme_Handlers, \%p);
2408
Say youE<39>ve finished writing programmes. This ends the <tv> element
2415
for ($self->{xmltv_writer_state}) {
2417
croak 'must call start() on XMLTV::Writer first';
2419
elsif ($_ eq 'channels' or $_ eq 'programmes') {
2422
elsif ($_ eq 'end') {
2423
croak 'cannot do anything with end()ed XMLTV::Writer';
2428
$self->endTag('tv');
2429
$self->SUPER::end(@_);
2436
# In XML the order of attributes is not significant. But to make
2437
# things look nice we try to output them in the same order as given in
2440
# Takes a list of (key, value, key, value, ...) and returns one with
2441
# keys in a nice-looking order.
2444
die "expected even number of elements, from a hash"
2446
my @a = ((map { $_->[0] } (@XMLTV::Channel_Attributes,
2447
@XMLTV::Programme_Attributes)),
2448
qw(date source-info-url source-info-name source-data-url
2449
generator-info-name generator-info-url));
2454
if (exists $in{$_}) {
2455
my $v = delete $in{$_};
2460
foreach (sort keys %in) {
2461
warn "unknown attribute $_" unless /^_/;
2462
push @r, $_, $in{$_};
2471
# Writes the elements of a hash to an XMLTV::Writer using a list of
2472
# handlers. Deletes keys (modifying the hash passed in) as they are
2475
# Requires all mandatory keys be present in the hash - if you're not
2476
# sure then use check_multiplicity() first.
2478
# Returns true if the element was successfully written, or if any
2479
# errors found don't look serious enough to cause bad XML. If the
2480
# XML::Writer object passed in is undef then nothing is written (since
2481
# the write handlers are coded like that.)
2483
sub call_handlers_write( $$$ ) {
2484
my ($self, $handlers, $input) = @_;
2485
t 'writing input hash: ' . d $input;
2486
die if not defined $input;
2489
foreach (@$handlers) {
2490
my ($name, $h_name, $multiplicity) = @$_;
2491
my $h = $XMLTV::Handlers{$h_name}; die "no handler $h_name" if not $h;
2492
my $writer = $h->[1]; die if not defined $writer;
2493
t "doing handler for $name$multiplicity";
2494
local $SIG{__WARN__} = sub {
2495
warn "$name element: $_[0]";
2498
my $val = delete $input->{$name};
2499
t 'got value(s): ' . d $val;
2500
if ($multiplicity eq '1') {
2501
$writer->($self, $name, $val);
2503
elsif ($multiplicity eq '?') {
2504
$writer->($self, $name, $val) if defined $val;
2506
elsif ($multiplicity eq '*' or $multiplicity eq '+') {
2507
croak "value for key $name should be an array ref"
2508
if defined $val and ref $val ne 'ARRAY';
2510
t 'writing value: ' . d $_;
2511
$writer->($self, $name, $_);
2512
t 'finished writing multiple values';
2516
warn "bad multiplicity specifier: $multiplicity";
2519
t 'leftover keys: ' . d([ sort keys %$input ]);
2526
# Warns about missing keys that are supposed to be mandatory. Returns
2527
# true iff everything is okay.
2529
sub check_multiplicity( $$ ) {
2530
my ($handlers, $input) = @_;
2531
foreach (@$handlers) {
2532
my ($name, $h_name, $multiplicity) = @$_;
2533
t "checking handler for $name: $h_name with multiplicity $multiplicity";
2534
if ($multiplicity eq '1') {
2535
if (not defined $input->{$name}) {
2536
warn "hash missing value for $name";
2540
elsif ($multiplicity eq '?') {
2541
# Okay if not present.
2543
elsif ($multiplicity eq '*') {
2544
# Not present, or undef, is treated as empty list.
2546
elsif ($multiplicity eq '+') {
2547
t 'one or more, checking for a listref with no undef values';
2548
my $val = $input->{$name};
2549
if (not defined $val) {
2550
warn "hash missing value for $name (expected list)";
2553
if (ref($val) ne 'ARRAY') {
2554
die "hash has bad contents for $name (expected list)";
2558
t 'all values: ' . d $val;
2559
my @new_val = grep { defined } @$val;
2560
t 'values that are defined: ' . d \@new_val;
2561
if (@new_val != @$val) {
2562
warn "hash had some undef elements in list for $name, removed";
2567
warn "hash has empty list of $name properties (expected at least one)";
2572
warn "bad multiplicity specifier: $multiplicity";
2581
# Write a complete element with attributes, and subelements written
2582
# using call_handlers_write(). The advantage over doing it by hand is
2583
# that if some required keys are missing, nothing is written (rather
2584
# than an incomplete and invalid element).
2586
sub write_element_with_handlers( $$$$$ ) {
2587
my ($w, $name, $attrs, $handlers, $hash) = @_;
2588
if (not check_multiplicity($handlers, $hash)) {
2589
warn "keys missing in $name hash, not writing";
2593
# Special 'debug' keys written as comments inside the element.
2595
foreach (grep /^debug/, keys %$hash) {
2596
$debug_keys{$_} = delete $hash->{$_};
2599
# Call all the handlers with no writer object and make sure
2602
if (not call_handlers_write(undef, $handlers, { %$hash })) {
2603
warn "bad data inside $name element, not writing\n";
2607
$w->startTag($name, order_attrs(%$attrs));
2608
foreach (sort keys %debug_keys) {
2609
my $val = $debug_keys{$_};
2610
$w->comment((defined $val) ? "$_: $val" : $_);
2612
call_handlers_write($w, $handlers, $hash);
2613
XMLTV::warn_unknown_keys($name, $hash);
2623
Ed Avis, ed@membled.com
2627
The file format is defined by the DTD xmltv.dtd, which is included in
2628
the xmltv package along with this module. It should be installed in
2629
your systemE<39>s standard place for SGML and XML DTDs.
2631
The xmltv package has a web page at
2632
<http://membled.com/work/apps/xmltv/> which carries
2633
information about the file format and the various tools and apps which
2634
are distributed with this module.
2
# $Id: XMLTV.pm.in,v 1.137 2008/07/14 04:05:56 rmeden Exp $
8
our @EXPORT_OK = qw(read_data parse parsefile parsefiles write_data
9
best_name list_channel_keys list_programme_keys);
11
# For the time being the version of this library is tied to that of
12
# the xmltv package as a whole. This number should be checked by the
15
our $VERSION = '0.5.52';
17
# Work around changing behaviour of XML::Twig. On some systems (like
18
# mine) it always returns UTF-8 data unless KeepEncoding is specified.
19
# However the encoding() method tells you the encoding of the original
20
# document, not of the data you receive. To be sure of what you're
21
# getting, it is easiest on such a system to not give KeepEncoding and
24
# But on other systems (seemingly perl 5.8 and above), XML::Twig tries
25
# to keep the original document's encoding in the strings returned.
26
# You then have to call encoding() to find out what you're getting.
27
# To make sure of this behaviour we set KeepEncoding to true on such a
30
# Setting KeepEncoding true everywhere seems to do no harm, it's a
31
# pity that we lose conversion to UTF-8 but at least it's the same
32
# everywhere. So the library is distributed with this flag on.
34
my $KEEP_ENCODING = 1;
36
my %warned_unknown_key;
37
sub warn_unknown_keys( $$ );
43
XMLTV - Perl extension to read and write TV listings in XMLTV format
48
my $data = XMLTV::parsefile('tv.xml');
49
my ($encoding, $credits, $ch, $progs) = @$data;
50
my $langs = [ 'en', 'fr' ];
51
print 'source of listings is: ', $credits->{'source-info-name'}, "\n"
52
if defined $credits->{'source-info-name'};
53
foreach (values %$ch) {
54
my ($text, $lang) = @{XMLTV::best_name($langs, $_->{'display-name'})};
55
print "channel $_->{id} has name $text\n";
56
print "...in language $lang\n" if defined $lang;
59
print "programme on channel $_->{channel} at time $_->{start}\n";
60
next if not defined $_->{desc};
61
foreach (@{$_->{desc}}) {
62
my ($text, $lang) = @$_;
63
print "has description $text\n";
64
print "...in language $lang\n" if defined $lang;
68
The value of $data will be something a bit like:
71
{ 'source-info-name' => 'Ananova', 'generator-info-name' => 'XMLTV' },
72
{ 'radio-4.bbc.co.uk' => { 'display-name' => [ [ 'en', 'BBC Radio 4' ],
75
'id' => 'radio-4.bbc.co.uk' },
77
[ { start => '200111121800', title => [ [ 'Simpsons', 'en' ] ],
78
channel => 'radio-4.bbc.co.uk' },
83
This module provides an interface to read and write files in XMLTV
84
format (a TV listings format defined by xmltv.dtd). In general element
85
names in the XML correspond to hash keys in the Perl data structure.
86
You can think of this module as a bit like B<XML::Simple>, but
87
specialized to the XMLTV file format.
89
The Perl data structure corresponding to an XMLTV file has four
90
elements. The first gives the character encoding used for text data,
91
typically UTF-8 or ISO-8859-1. (The encoding value could also be
92
undef meaning 'unknown', when the library canE<39>t work out what it
93
is.) The second element gives the attributes of the root <tv>
94
element, which give information about the source of the TV listings.
95
The third element is a list of channels, each list element being a
96
hash corresponding to one <channel> element. The fourth element is
97
similarly a list of programmes. More details about the data structure
98
are given later. The easiest way to find out what it looks like is to
99
load some small XMLTV files and use B<Data::Dumper> to print out the
109
use XML::Writer 0.600;
114
# Use Lingua::Preferred if available, else kludge a replacement.
115
sub my_which_lang { return $_[1]->[0] }
117
eval { require Lingua::Preferred };
118
*which_lang = $@ ? \&my_which_lang : \&Lingua::Preferred::which_lang;
121
# Use Log::TraceMessages if installed.
123
eval { require Log::TraceMessages };
129
*t = \&Log::TraceMessages::t;
130
*d = \&Log::TraceMessages::d;
134
# Attributes and subelements of channel. Each subelement additionally
135
# needs a handler defined. Multiplicity is given for both, but for
136
# attributes the only allowable values are '1' and '?'.
138
# Ordering of attributes is not really important, but we keep the same
139
# order as they are given in the DTD so that output looks nice.
141
# The ordering of the subelements list gives the order in which these
142
# elements must appear in the DTD. In fact, these lists just
143
# duplicate information in the DTD and add details of what handlers
146
our @Channel_Attributes = ([ 'id', '1' ]);
147
our @Channel_Handlers =
149
[ 'display-name', 'with-lang', '+' ],
150
[ 'icon', 'icon', '*' ],
151
[ 'url', 'scalar', '*' ],
154
# Same for <programme> elements.
155
our @Programme_Attributes =
159
[ 'pdc-start', '?' ],
160
[ 'vps-start', '?' ],
162
[ 'videoplus', '?' ],
166
our @Programme_Handlers =
168
[ 'title', 'with-lang', '+' ],
169
[ 'sub-title', 'with-lang', '*' ],
170
[ 'desc', 'with-lang/m', '*' ],
171
[ 'credits', 'credits', '?' ],
172
[ 'date', 'scalar', '?' ],
173
[ 'category', 'with-lang', '*' ],
174
[ 'language', 'with-lang', '?' ],
175
[ 'orig-language', 'with-lang', '?' ],
176
[ 'length', 'length', '?' ],
177
[ 'icon', 'icon', '*' ],
178
[ 'url', 'scalar', '*' ],
179
[ 'country', 'with-lang', '*' ],
180
[ 'episode-num', 'episode-num', '*' ],
181
[ 'video', 'video', '?' ],
182
[ 'audio', 'audio', '?' ],
183
[ 'previously-shown', 'previously-shown', '?' ],
184
[ 'premiere', 'with-lang/em', '?' ],
185
[ 'last-chance', 'with-lang/em', '?' ],
186
[ 'new', 'presence', '?' ],
187
[ 'subtitles', 'subtitles', '*' ],
188
[ 'rating', 'rating', '*' ],
189
[ 'star-rating', 'star-rating', '*' ],
192
# And a hash mapping names like 'with-lang' to pairs of subs. The
193
# first for reading, the second for writing. Note that the writers
194
# alter the passed-in data as a side effect! (If the writing sub is
195
# called with an undef XML::Writer then it writes nothing but still
196
# warns for (most) bad data checks - and still alters the data.)
200
# Undocumented interface for adding extensions to the XMLTV format:
201
# first add an entry to @XMLTV::Channel_Handlers or
202
# @XMLTV::Programme_Handlers with your new element's name, 'type' and
203
# multiplicity. The 'type' should be a string you invent yourself.
204
# Then $XMLTV::Handlers{'type'} should be a pair of subroutines, a
205
# reader and a writer. (Unless you want to use one of the existing
206
# types such as 'with-lang' or 'scalar'.)
208
# Note that elements and attributes beginning 'x-' are skipped over
209
# _automatically_, so you can't parse them with this method. A better
210
# way to add extensions is needed - doing this not encouraged but is
211
# sometimes necessary.
214
# read_data() is a deprecated name for parsefile().
215
sub read_data( $ ) { # FIXME remove altogether
216
warn "XMLTV::read_data() deprecated, use XMLTV::parsefile() instead\n";
223
croak 'no <tv> element found' if not /<tv/;
229
=item parse(document)
231
Takes an XMLTV document (a string) and returns the Perl data
232
structure. It is assumed that the document is valid XMLTV; if not
233
the routine may die() with an error (although the current implementation
234
just warns and continues for most small errors).
236
The first element of the listref returned, the encoding, may vary
237
according to the encoding of the input document, the versions of perl
238
and C<XML::Parser> installed, the configuration of the XMLTV library
239
and other factors including, but not limited to, the phase of the
240
moon. With luck it should always be either the encoding of the input
243
Attributes and elements in the XML file whose names begin with 'x-'
244
are skipped silently. You can use these to include information which
245
is not currently handled by the XMLTV format, or by this module.
251
# FIXME commonize with parsefiles()
252
my ($encoding, $credits);
256
sub { $encoding = shift },
257
sub { $credits = shift },
258
sub { for (shift) { $channels{$_->{id}} = $_ } },
259
sub { push @programmes, shift });
260
return [ $encoding, $credits, \%channels, \@programmes ];
265
=item parsefiles(filename...)
267
Like C<parse()> but takes one or more filenames instead of a string
268
document. The data returned is the merging of those file contents:
269
the programmes will be concatenated in their original order, the
270
channels just put together in arbitrary order (ordering of channels
273
It is necessary that each file have the same character encoding, if
274
not, an exception is thrown. Ideally the credits information would
275
also be the same between all the files, since there is no obvious way to
276
merge it - but if the credits information differs from one file to the
277
next, one file is picked arbitrarily to provide credits and a warning
278
is printed. If two files give differing channel definitions for the
279
same XMLTV channel id, then one is picked arbitrarily and a warning
282
In the simple case, with just one file, you neednE<39>t worry
283
about mismatching of encodings, credits or channels.
285
The deprecated function C<parsefile()> is a wrapper allowing just one
289
sub parsefiles( @ ) {
290
die 'one or more filenames required' if not @_;
291
my ($encoding, $credits);
294
parsefiles_callback(sub { $encoding = shift },
295
sub { $credits = shift },
296
sub { for (shift) { $channels{$_->{id}} = $_ } },
297
sub { push @programmes, shift },
299
return [ $encoding, $credits, \%channels, \@programmes ];
302
sub parsefile( $ ) { parsefiles(@_) }
306
=item parse_callback(document, encoding_callback, credits_callback,
307
channel_callback, programme_callback)
309
An alternative interface. Whereas C<parse()> reads the whole document
310
and then returns a finished data structure, with this routine you
311
specify a subroutine to be called as each <channel> element is read
312
and another for each <programme> element.
314
The first argument is the document to parse. The remaining arguments
315
are code references, one for each part of the document.
317
The callback for encoding will be called once with a string giving the
318
encoding. In present releases of this module, it is also possible for
319
the value to be undefined meaning 'unknown', but itE<39>s hoped that
320
future releases will always be able to figure out the encoding used.
322
The callback for credits will be called once with a hash reference.
323
For channels and programmes, the appropriate function will be called
324
zero or more times depending on how many channels / programmes are
327
The four subroutines will be called in order, that is, the encoding
328
and credits will be done before the channel handler is called and all
329
the channels will be dealt with before the first programme handler is
332
If any of the code references is undef, nothing is called for that part
335
For backwards compatibility, if the value for 'encoding callback' is
336
not a code reference but a scalar reference, then the encoding found
337
will be stored in that scalar. Similarly if the 'credits callback'
338
is a scalar reference, the scalar it points to will be set to point
339
to the hash of credits. This style of interface is deprecated: new
340
code should just use four callbacks.
344
my $document = '<tv>...</tv>';
347
sub encoding_cb( $ ) { $encoding = shift }
350
sub credits_cb( $ ) { $credits = shift }
352
# The callback for each channel populates this hash.
354
sub channel_cb( $ ) {
356
$channels{$c->{id}} = $c;
359
# The callback for each programme. We know that channels are
360
# always read before programmes, so the %channels hash will be
363
sub programme_cb( $ ) {
365
print "got programme: $p->{title}->[0]->[0]\n";
366
my $c = $channels{$p->{channel}};
367
print 'channel name is: ', $c->{'display-name'}->[0]->[0], "\n";
371
XMLTV::parse_callback($document, \&encoding_cb, \&credits_cb,
372
\&channel_cb, \&programme_cb);
376
sub new_doc_callback( $$$$ ) {
377
my ($enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
378
t 'creating new XML::Twig';
379
t '\@Channel_Handlers=' . d \@Channel_Handlers;
380
t '\@Programme_Handlers=' . d \@Programme_Handlers;
381
new XML::Twig(StartTagHandlers =>
385
if ($KEEP_ENCODING) {
386
t 'KeepEncoding on, get original encoding';
387
$enc = $t->encoding();
390
t 'assuming UTF-8 encoding';
394
if (defined $enc_cb) {
399
elsif ($_ eq 'SCALAR') {
403
die "callback should be code ref or scalar ref, or undef";
408
if (defined $cred_cb) {
409
my $cred = get_attrs($node);
414
elsif ($_ eq 'SCALAR') {
418
die "callback should be code ref or scalar ref, or undef";
422
# Most of the above code can be removed in the
429
{ '/tv/channel' => sub {
431
die if not defined $node;
432
my $c = node_to_channel($node);
435
warn "skipping bad channel element\n";
442
'/tv/programme' => sub {
444
die if not defined $node;
445
my $p = node_to_programme($node);
448
warn "skipping bad programme element\n";
456
KeepEncoding => $KEEP_ENCODING,
460
sub parse_callback( $$$$$ ) {
461
my ($str, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
463
new_doc_callback($enc_cb, $cred_cb, $ch_cb, $p_cb)->parse($str);
468
=item parsefiles_callback(encoding_callback, credits_callback,
469
channel_callback, programme_callback,
472
As C<parse_callback()> but takes one or more filenames to open,
473
merging their contents in the same manner as C<parsefiles()>. Note
474
that the reading is still gradual - you get the channels and
475
programmes one at a time, as they are read.
477
Note that the same <channel> may be present in more than one file, so
478
the channel callback will get called more than once. ItE<39>s your
479
responsibility to weed out duplicate channel elements (since writing
480
them out again requires that each have a unique id).
482
For compatibility, there is an alias C<parsefile_callback()> which is
483
the same but takes only a single filename, B<before> the callback
484
arguments. This is deprecated.
487
sub parsefile_callback( $$$$$ ) {
488
my ($f, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
489
parsefiles_callback($enc_cb, $cred_cb, $ch_cb, $p_cb, $f);
492
sub parsefiles_callback( $$$$@ ) {
493
my ($enc_cb, $cred_cb, $ch_cb, $p_cb, @files) = @_;
494
die "one or more files required" if not @files;
495
my $all_encoding; my $have_encoding = 0;
499
my $do_next_file; # to be defined below
500
my $my_enc_cb = sub( $ ) {
502
t 'encoding callback';
503
if ($have_encoding) {
504
t 'seen encoding before, just check';
505
my ($da, $de) = (defined $all_encoding, defined $e);
506
if (not $da and not $de) {
507
warn "two files both have unspecified character encodings, hope they're the same\n";
509
elsif (not $da and $de) {
510
warn "encoding $e not being returned to caller\n";
513
elsif ($da and not $de) {
514
warn "input file with unspecified encoding, assuming same as others ($all_encoding)\n";
516
elsif ($da and $de) {
517
if ($all_encoding ne $e) {
518
die "this file's encoding $e differs from others' $all_encoding - aborting\n";
524
t 'not seen encoding before, call user';
525
$enc_cb->($e) if $enc_cb;
531
my $my_cred_cb = sub( $ ) {
533
if (defined $all_credits) {
534
if (Dumper($all_credits) ne Dumper($c)) {
535
warn "different files have different credits, picking one arbitrarily\n";
536
# In fact, we pick the last file in the list since this is the
537
# first to be opened.
542
$cred_cb->($c) if $cred_cb;
547
my $my_ch_cb = sub( $ ) {
550
if (defined $all_channels{$id} and Dumper($all_channels{$id}) ne Dumper($c)) {
551
warn "differing channels with id $id, picking one arbitrarily\n";
554
$all_channels{$id} = $c;
555
$ch_cb->($c) if $ch_cb;
559
my $my_p_cb = sub( $ ) {
560
$do_next_file->(); # if any
561
$p_cb->(@_) if $p_cb;
564
$do_next_file = sub() {
569
# FIXME commonize these augmented warning messages. Weird
570
# stuff (up to and including segfaults) happens if you
571
# call warn() or die() from these handlers.
573
local $SIG{__WARN__} = sub {
575
$msg = "warning: something's wrong" if not defined $msg;
577
print STDERR "$f: $msg\n";
579
local $SIG{__DIE__} = sub {
581
$msg = "warning: something's wrong" if not defined $msg;
583
print STDERR "$f: $msg\n";
587
my $t = new_doc_callback($my_enc_cb, $my_cred_cb, $my_ch_cb, $my_p_cb);
598
=item write_data(data, options...)
600
Takes a data structure and writes it as XML to standard output. Any
601
extra arguments are passed on to XML::WriterE<39>s constructor, for example
603
my $f = new IO::File '>out.xml'; die if not $f;
604
write_data($data, OUTPUT => $f);
606
The encoding used for the output is given by the first element of the
609
Normally, there will be a warning for any Perl data which is not
610
understood and cannot be written as XMLTV, such as strange keys in
611
hashes. But as an exception, any hash key beginning with an
612
underscore will be skipped over silently. You can store 'internal use
615
If a programme or channel hash contains a key beginning with 'debug',
616
this key and its value will be written out as a comment inside the
617
<programme> or <channel> element. This lets you include small
618
debugging messages in the XML output.
621
sub write_data( $;@ ) {
623
my $writer = new XMLTV::Writer(encoding => $data->[0], @_);
624
$writer->start($data->[1]);
625
$writer->write_channels($data->[2]);
626
$writer->write_programme($_) foreach @{$data->[3]};
635
# Given a node, return a hashref of its attributes. Skips over
636
# the 'x-whatever' attributes.
639
my $node = shift; die if not defined $node;
640
my %r = %{$node->atts()};
657
# Given a node containing only text, return that text (with whitespace
658
# either side stripped). If the node has no children (as in
659
# <foo></foo> or <foo />), this is considered to be the empty string.
661
# Parameter: whether newlines are allowed (defaults to false)
663
sub get_text( $;$ ) {
665
my $allow_nl = shift; $allow_nl = 0 if not defined $allow_nl;
666
my @children = get_subelements($node);
667
if (@children == 0) {
670
elsif (@children == 1) {
671
my $v = $children[0]->pcdata();
672
t 'got pcdata: ' . d $v;
673
if (not defined $v) {
674
my $name = get_name($node);
675
warn "node $name expected to contain text has other stuff\n";
678
# Just hope that the encoding we got uses \n...
679
if (not $allow_nl and $v =~ tr/\n//d) {
680
my $name = get_name($node);
681
warn "removing newlines from content of node $name\n";
685
t 'returning: ' . d $v;
688
elsif (@children > 1) {
689
my $name = get_name($node);
690
warn "node $name expected to contain text has more than one child\n";
696
# Private. Clean up parsed text. Takes ref to scalar.
698
our $v; local *v = shift; die if not defined $v;
699
if ($XML::Twig::VERSION < 3.01 || $KEEP_ENCODING) {
700
# Old versions of XML::Twig had stupid behaviour with
701
# entities - and so do the new ones if KeepEncoding is on.
708
s/&/&/g; # needs to be last
712
t 'new XML::Twig, not KeepEncoding, entities already dealt with';
719
# On Windows there seems to be an inconsistency between
720
# XML::Twig and XML::Writer. The former returns text with
721
# \r\n line endings to the application, but the latter adds \r
722
# characters to text outputted. So reading some text and
723
# writing it again accumulates an extra \r character. We fix
724
# this by removing \r from the input here.
734
# Return a list of all subelements of a node. Whitespace is
735
# ignored; anything else that isn't a subelement is warned about.
736
# Skips over elements with name 'x-whatever'.
738
sub get_subelements( $ ) {
739
grep { (my $tmp = get_name($_)) !~ /^x-/ } $_[0]->children();
746
# Return the element name of a node.
748
sub get_name( $ ) { $_[0]->gi() }
754
# Return some information about a node for debugging.
758
# Doesn't seem to be easy way to get 'type' of node.
759
my $r = 'name: ' . get_name($n) . "\n";
760
for (trunc($n->text())) {
761
$r .= "value: $_\n" if defined and length;
765
# Private. Truncate a string to a reasonable length and add '...' if
770
return undef if not defined;
772
return substr($_, 0, 1000) . '...';
779
=item best_name(languages, pairs [, comparator])
781
The XMLTV format contains many places where human-readable text is
782
given an optional 'lang' attribute, to allow mixed languages. This is
783
represented in Perl as a pair [ text, lang ], although the second
784
element may be missing or undef if the language is unknown. When
785
several alernatives for an element (such as <title>) can be given, the
786
representation is a list of [ text, lang ] pairs. Given such a list,
787
what is the best text to use? It depends on the userE<39>s preferred
790
This function takes a list of acceptable languages and a list of [string,
791
language] pairs, and finds the best one to use. This means first finding
792
the appropriate language and then picking the 'best' string in that
795
The best is normally defined as the first one found in a usable
796
language, since the XMLTV format puts the most canonical versions
797
first. But you can pass in your own comparison function, for example
798
if you want to choose the shortest piece of text that is in an
801
The acceptable languages should be a reference to a list of language
802
codes looking like 'ru', or like 'de_DE'. The text pairs should be a
803
reference to a list of pairs [ string, language ]. (As a special case
804
if this list is empty or undef, that means no text is present, and the
805
result is undef.) The third argument if present should be a cmp-style
806
function that compares two strings of text and returns 1 if the first
807
argument is better, -1 if the second better, 0 if theyE<39>re equally
810
Returns: [s, l] pair, where s is the best of the strings to use and l
811
is its language. This pair is 'live' - it is one of those from the
812
list passed in. So you can use C<best_name()> to find the best pair
813
from a list and then modify the content of that pair.
815
(This routine depends on the C<Lingua::Preferred> module being
816
installed; if that module is missing then the first available
817
language is always chosen.)
821
my $langs = [ 'de', 'fr' ]; # German or French, please
823
# Say we found the following under $p->{title} for a programme $p.
824
my $pairs = [ [ 'La CitE des enfants perdus', 'fr' ],
825
[ 'The City of Lost Children', 'en_US' ] ];
827
my $best = best_name($langs, $pairs);
828
print "chose title $best->[0]\n";
831
sub best_name( $$;$ ) {
832
my ($wanted_langs, $pairs, $compare) = @_;
833
t 'best_name() ENTRY';
834
t 'wanted langs: ' . d $wanted_langs;
835
t '[text,lang] pairs: ' . d $pairs;
836
t 'comparison fn: ' . d $compare;
837
return undef if not defined $pairs;
841
my (%seen_lang, $seen_undef);
842
# Collect the list of available languages.
843
foreach (map { $_->[1] } @pairs) {
845
next if $seen_lang{$_}++;
848
next if $seen_undef++;
850
push @avail_langs, $_;
853
my $pref_lang = which_lang($wanted_langs, \@avail_langs);
855
# Gather up [text, lang] pairs which have the desired language.
858
my ($text, $lang) = @$_;
859
next unless ((not defined $lang)
860
or (defined $pref_lang and $lang eq $pref_lang));
861
push @candidates, $_;
864
return undef if not @candidates;
866
# If a comparison function was passed in, use it to compare the
867
# text strings from the candidate pairs.
869
@candidates = sort { $compare->($a->[0], $b->[0]) } @candidates
872
# Pick the first candidate. This will be the one ordered first by
873
# the comparison function if given, otherwise the earliest in the
876
return $candidates[0];
880
=item list_channel_keys(), list_programme_keys()
882
Some users of this module may wish to enquire at runtime about which
883
keys a programme or channel hash can contain. The data in the hash
884
comes from the attributes and subelements of the corresponding element
885
in the XML. The values of attributes are simply stored as strings,
886
while subelements are processed with a handler which may return a
887
complex data structure. These subroutines returns a hash mapping key
888
to handler name and multiplicity. This lets you know what data types
889
can be expected under each key. For keys which come from attributes
890
rather than subelements, the handler is set to 'scalar', just as for
891
subelements which give a simple string. See L<"DATA STRUCTURE"> for
892
details on what the different handler names mean.
894
It is not possible to find out which keys are mandatory and which
895
optional, only a list of all those which might possibly be present.
896
An example use of these routines is the L<tv_grep(1)> program, which
897
creates its allowed command line arguments from the names of programme
902
sub list_keys( $$ ) {
906
foreach (@{shift()}) {
907
my ($k, $mult) = @$_;
908
$r{$k} = [ 'scalar', $mult ];
912
foreach (@{shift()}) {
913
my ($k, $h_name, $mult) = @$_;
914
$r{$k} = [ $h_name, $mult ];
920
sub list_channel_keys() {
921
list_keys(\@Channel_Attributes, \@Channel_Handlers);
923
sub list_programme_keys() {
924
list_keys(\@Programme_Attributes, \@Programme_Handlers);
929
=item catfiles(w_args, filename...)
931
Concatenate several listings files, writing the output to somewhere
932
specified by C<w_args>. Programmes are catenated together, channels
933
are merged, for credits we just take the first and warn if the others
936
The first argument is a hash reference giving information to pass to
937
C<XMLTV::Writer>E<39>s constructor. But do not specify encoding, this
938
will be taken from the input files. Currently C<catfiles()> will fail
939
work if the input files have different encodings.
946
XMLTV::parsefiles_callback
949
$w = new XMLTV::Writer(%$w_args, encoding => shift);
951
sub { $w->start(shift) },
955
if (not defined $seen_ch{$id}) {
956
$w->write_channel($c);
959
elsif (Dumper($seen_ch{$id}) eq Dumper($c)) {
960
# They're identical, okay.
963
warn "channel $id may differ between two files, "
964
. "picking one arbitrarily\n";
967
sub { $w->write_programme(shift) },
976
Concatenate (and merge) listings data. Programmes are catenated
977
together, channels are merged, for credits we just take the first and
978
warn if the others differ (except that the 'date' of the result is the
979
latest date of all the inputs).
981
Whereas C<catfiles()> reads and writes files, this function takes
982
already-parsed listings data and returns some more listings data. It
983
is much more memory-hungry.
986
sub cat( @ ) { cat_aux(1, @_) }
990
=item cat_noprogrammes
992
Like C<cat()> but ignores the programme data and just returns
993
encoding, credits and channels. This is in case for scalability
994
reasons you want to handle programmes individually, but still
995
merge the smaller data.
998
sub cat_noprogrammes( @ ) { cat_aux(0, @_) }
1002
my ($all_credits_nodate, $all_credits_date);
1005
my $do_progs = shift;
1008
t 'doing arg: ' . d $_;
1009
my ($encoding, $credits, $channels, $progs) = @$_;
1011
if (not defined $all_encoding) {
1012
$all_encoding = $encoding;
1014
elsif ($encoding ne $all_encoding) {
1015
die "different files have different encodings, cannot continue\n";
1018
# If the credits are different between files there's not a lot
1019
# we can do to merge them. Apart from 'date', that is. There
1020
# we can say that the date of the concatenated listings is the
1021
# newest date from all the sources.
1023
my %credits_nodate = %$credits; # copy
1024
my $d = delete $credits_nodate{date};
1026
# Need to 'require' rather than 'use' this because
1027
# XMLTV.pm is loaded during the build process and
1028
# XMLTV::Date isn't available then. Urgh.
1030
require XMLTV::Date;
1031
my $dp = XMLTV::Date::parse_date($d);
1032
for ($all_credits_date) {
1034
or Date_Cmp(XMLTV::Date::parse_date($_), $dp) < 0) {
1040
# Now in uniqueness checks ignore the date.
1041
if (not defined $all_credits_nodate) {
1042
$all_credits_nodate = \%credits_nodate;
1044
elsif (Dumper(\%credits_nodate) ne Dumper($all_credits_nodate)) {
1045
warn "different files have different credits, taking from first file\n";
1048
foreach (keys %$channels) {
1049
if (not defined $all_channels{$_}) {
1050
$all_channels{$_} = $channels->{$_};
1052
elsif (Dumper($all_channels{$_}) ne Dumper($channels->{$_})) {
1053
warn "channel $_ differs between two files, taking first appearance\n";
1057
push @all_progs, @$progs if $do_progs;
1060
$all_encoding = 'UTF-8' if not defined $all_encoding;
1063
%all_credits = %$all_credits_nodate
1064
if defined $all_credits_nodate;
1065
$all_credits{date} = $all_credits_date
1066
if defined $all_credits_date;
1069
return [ $all_encoding, \%all_credits, \%all_channels, \@all_progs ];
1072
return [ $all_encoding, \%all_credits, \%all_channels ];
1077
# For each subelement of programme, we define a subroutine to read it
1078
# and one to write it. The reader takes an node for a single
1079
# subelement and returns its value as a Perl scalar (warning and
1080
# returning undef if error). The writer takes an XML::Writer, an
1081
# element name and a scalar value and writes a subelement for that
1082
# value. Note that the element name is passed in to the writer just
1083
# for symmetry, so that neither the writer or the reader have to know
1084
# what their element is called.
1090
=head1 DATA STRUCTURE
1092
For completeness, we describe more precisely how channels and
1093
programmes are represented in Perl. Each element of the channels list
1094
is a hashref corresponding to one <channel> element, and likewise for
1095
programmes. The possible keys of a channel (programme) hash are the
1096
names of attributes or subelements of <channel> (<programme>).
1098
The values for attributes are not processed in any way; an attribute
1099
C<fred="jim"> in the XML will become a hash element with key C<'fred'>,
1102
But for subelements, there is further processing needed to turn the
1103
XML content of a subelement into Perl data. What is done depends on
1104
what type of data is stored under that subelement. Also, if a certain
1105
element can appear several times then the hash key for that element
1106
points to a list of values rather than just one.
1108
The conversion of a subelementE<39>s content to and from Perl data is
1109
done by a handler. The most common handler is I<with-lang>, used for
1110
human-readable text content plus an optional 'lang' attribute. There
1111
are other handlers for other data structures in the file format.
1112
Often two subelements will share the same handler, since they hold the
1113
same type of data. The handlers defined are as follows; note that
1114
many of them will silently strip leading and trailing whitespace in
1115
element content. Look at the DTD itself for an explanation of the
1118
Unless specified otherwise, it is not allowed for an element expected
1119
to contain text to have empty content, nor for the text to contain
1126
Turns a list of credits (for director, actor, writer, etc.) into a
1127
hash mapping 'role' to a list of names. The names in each role are
1128
kept in the same order.
1131
$Handlers{credits}->[0] = sub( $ ) {
1133
my @roles = qw(director actor writer adapter producer presenter
1135
my %known_role; ++$known_role{$_} foreach @roles;
1137
foreach (get_subelements($node)) {
1138
my $role = get_name($_);
1139
unless ($known_role{$role}++) {
1140
warn "unknown thing in credits: $role";
1143
push @{$r{$role}}, get_text($_);
1147
$Handlers{credits}->[1] = sub( $$$ ) {
1148
my ($w, $e, $v) = @_; die if not defined $v;
1150
return if not %h; # don't write empty element
1151
t 'writing credits: ' . d \%h;
1152
# TODO some 'do nothing' setting in XML::Writer to replace this
1153
# convention of passing undef.
1155
$w->startTag($e) if $w;
1156
foreach (qw[director actor writer adapter producer presenter
1157
commentator guest] ) {
1158
next unless defined $h{$_};
1159
my @people = @{delete $h{$_}};
1160
foreach my $person (@people) {
1161
die if not defined $person;
1162
$w->dataElement($_, $person) if $w;
1165
warn_unknown_keys($e, \%h);
1166
$w->endTag($e) if $w;
1173
Reads and writes a simple string as the content of the XML element.
1176
$Handlers{scalar}->[0] = sub( $ ) {
1178
return get_text($node);
1180
$Handlers{scalar}->[1] = sub( $$$ ) {
1181
my ($w, $e, $v) = @_;
1183
$w->dataElement($e, $v) if $w;
1190
Converts the content of a <length> element into a number of seconds
1191
(so <length units="minutes">5</minutes> would be returned as 300). On
1192
writing out again tries to convert a number of seconds to a time in
1193
minutes or hours if that would look better.
1196
$Handlers{length}->[0] = sub( $ ) {
1197
my $node = shift; die if not defined $node;
1198
my %attrs = %{get_attrs($node)};
1199
my $d = get_text($node);
1200
if ($d =~ /^\s*$/) {
1201
warn "empty 'length' element";
1204
if ($d !~ tr/0-9// or $d =~ tr/0-9//c) {
1205
warn "bad content of 'length' element: $d";
1208
my $units = $attrs{units};
1209
if (not defined $units) {
1210
warn "missing 'units' attr in 'length' element";
1213
# We want to return a length in seconds.
1214
if ($units eq 'seconds') {
1217
elsif ($units eq 'minutes') {
1220
elsif ($units eq 'hours') {
1224
warn "bad value of 'units': $units";
1229
$Handlers{length}->[1] = sub( $$$ ) {
1230
my ($w, $e, $v) = @_;
1233
if ($v % 3600 == 0) {
1237
elsif ($v % 60 == 0) {
1244
$w->dataElement($e, $v, units => $units) if $w;
1249
=item I<episode-num>
1251
The representation in Perl of XMLTVE<39>s odd episode numbers is as a
1252
pair of [ content, system ]. As specified by the DTD, if the system is
1253
not given in the file then 'onscreen' is assumed. Whitespace in the
1254
'xmltv_ns' system is unimportant, so on reading it is normalized to
1255
a single space on either side of each dot.
1258
$Handlers{'episode-num'}->[0] = sub( $ ) {
1259
my $node = shift; die if not defined $node;
1260
my %attrs = %{get_attrs($node)};
1261
my $system = $attrs{system};
1262
$system = 'onscreen' if not defined $system;
1263
my $content = get_text($node);
1264
if ($system eq 'xmltv_ns') {
1265
# Make it look nice.
1266
$content =~ s/\s+//g;
1267
$content =~ s/\./ . /g;
1269
return [ $content, $system ];
1271
$Handlers{'episode-num'}->[1] = sub( $$$ ) {
1272
my ($w, $e, $v) = @_;
1274
if (not ref $v or ref $v ne 'ARRAY') {
1275
warn "not writing episode-num whose content is not an array";
1278
my ($content, $system) = @$v;
1279
$system = 'onscreen' if not defined $system;
1280
$w->dataElement($e, $content, system => $system) if $w;
1287
The <video> section is converted to a hash. The <present> subelement
1288
corresponds to the key 'present' of this hash, 'yes' and 'no' are
1289
converted to Booleans. The same applies to <colour>. The content of
1290
the <aspect> subelement is stored under the key 'aspect'. These keys
1291
can be missing in the hash just as the subelements can be missing in
1295
$Handlers{video}->[0] = sub ( $ ) {
1298
foreach (get_subelements($node)) {
1299
my $name = get_name($_);
1300
my $value = get_text($_);
1301
if ($name eq 'present') {
1302
warn "'present' seen twice" if defined $r{present};
1303
$r{present} = decode_boolean($value);
1305
elsif ($name eq 'colour') {
1306
warn "'colour' seen twice" if defined $r{colour};
1307
$r{colour} = decode_boolean($value);
1309
elsif ($name eq 'aspect') {
1310
warn "'aspect' seen twice" if defined $r{aspect};
1311
$value =~ /^\d+:\d+$/ or warn "bad aspect ratio: $value";
1312
$r{aspect} = $value;
1314
elsif ($name eq 'quality') {
1315
warn "'quality' seen twice" if defined $r{quality};
1316
$r{quality} = $value;
1321
$Handlers{video}->[1] = sub( $$$ ) {
1322
my ($w, $e, $v) = @_;
1323
t "'video' element";
1325
return if not %h; # don't write empty element
1326
$w->startTag($e) if $w;
1327
if (defined (my $val = delete $h{present})) {
1328
$w->dataElement('present', encode_boolean($val)) if $w;
1330
if (defined (my $val = delete $h{colour})) {
1331
$w->dataElement('colour', encode_boolean($val)) if $w;
1333
if (defined (my $val = delete $h{aspect})) {
1334
$w->dataElement('aspect', $val) if $w;
1336
if (defined (my $val = delete $h{quality})) {
1337
$w->dataElement('quality', $val) if $w;
1339
warn_unknown_keys("zz $e", \%h);
1340
$w->endTag($e) if $w;
1347
This is similar to I<video>. <present> is a Boolean value, while
1348
the content of <stereo> is stored unchanged.
1351
$Handlers{audio}->[0] = sub( $ ) {
1354
foreach (get_subelements($node)) {
1355
my $name = get_name($_);
1356
my $value = get_text($_);
1357
if ($name eq 'present') {
1358
warn "'present' seen twice" if defined $r{present};
1359
$r{present} = decode_boolean($value);
1361
elsif ($name eq 'stereo') {
1362
warn "'stereo' seen twice" if defined $r{stereo};
1364
warn "empty 'stereo' element not permitted, should be <stereo>stereo</stereo>";
1367
warn "bad value for 'stereo': '$value'"
1368
if ($value ne 'mono'
1369
and $value ne 'stereo'
1370
and $value ne 'surround'
1371
and $value ne 'dolby digital'
1372
and $value ne 'dolby');
1373
$r{stereo} = $value;
1378
$Handlers{audio}->[1] = sub( $$$ ) {
1379
my ($w, $e, $v) = @_;
1381
return if not %h; # don't write empty element
1382
$w->startTag($e) if $w;
1383
if (defined (my $val = delete $h{present})) {
1384
$w->dataElement('present', encode_boolean($val)) if $w;
1386
if (defined (my $val = delete $h{stereo})) {
1387
$w->dataElement('stereo', $val) if $w;
1389
warn_unknown_keys($e, \%h);
1390
$w->endTag($e) if $w;
1395
=item I<previously-shown>
1397
The 'start' and 'channel' attributes are converted to keys in a hash.
1400
$Handlers{'previously-shown'}->[0] = sub( $ ) {
1401
my $node = shift; die if not defined $node;
1402
my %attrs = %{get_attrs($node)};
1404
foreach (qw(start channel)) {
1405
my $v = delete $attrs{$_};
1406
$r->{$_} = $v if defined $v;
1408
foreach (keys %attrs) {
1409
warn "unknown attribute $_ in previously-shown";
1413
$Handlers{'previously-shown'}->[1] = sub( $$$ ) {
1414
my ($w, $e, $v) = @_;
1415
$w->emptyTag($e, %$v) if $w;
1422
The content of the element is ignored: it signfies something by its
1423
very presence. So the conversion from XML to Perl is a constant true
1424
value whenever the element is found; the conversion from Perl to XML
1425
is to write out the element if true, donE<39>t write anything if false.
1428
$Handlers{presence}->[0] = sub( $ ) {
1430
# The 'new' element is empty, it signifies newness by its very
1435
$Handlers{presence}->[1] = sub( $$$ ) {
1436
my ($w, $e, $v) = @_;
1438
# Not new, so don't create an element.
1441
$w->emptyTag($e) if $w;
1449
The 'type' attribute and the 'language' subelement (both optional)
1450
become keys in a hash. But see I<language> for what to pass as the
1451
value of that element.
1454
$Handlers{subtitles}->[0] = sub( $ ) {
1455
my $node = shift; die if not defined $node;
1456
my %attrs = %{get_attrs($node)};
1458
$r{type} = $attrs{type} if defined $attrs{type};
1459
foreach (get_subelements($node)) {
1460
my $name = get_name($_);
1461
if ($name eq 'language') {
1462
warn "'language' seen twice" if defined $r{language};
1463
$r{language} = read_with_lang($_, 0, 0);
1466
warn "bad content of 'subtitles' element: $name";
1471
$Handlers{subtitles}->[1] = sub( $$$ ) {
1472
my ($w, $e, $v) = @_;
1474
my ($type, $language) = ($v->{type}, $v->{language});
1475
my %attrs; $attrs{type} = $type if defined $type;
1476
if (defined $language) {
1477
$w->startTag($e, %attrs) if $w;
1478
write_with_lang($w, 'language', $language, 0, 0);
1479
$w->endTag($e) if $w;
1482
$w->emptyTag($e, %attrs) if $w;
1490
The rating is represented as a tuple of [ rating, system, icons ].
1491
The last element is itself a listref of structures returned by the
1495
$Handlers{rating}->[0] = sub( $ ) {
1496
my $node = shift; die if not defined $node;
1497
my %attrs = %{get_attrs($node)};
1498
my $system = delete $attrs{system} if exists $attrs{system};
1499
foreach (keys %attrs) {
1500
warn "unknown attribute in rating: $_";
1502
my @children = get_subelements($node);
1504
# First child node is value.
1505
my $value_node = shift @children;
1506
if (not defined $value_node) {
1507
warn "missing 'value' element inside rating";
1510
if ((my $name = get_name($value_node)) ne 'value') {
1511
warn "expected 'value' node inside rating, got '$name'";
1515
my $rating = read_value($value_node);
1517
# Remaining children are icons.
1518
my @icons = map { read_icon($_) } @children;
1520
return [ $rating, $system, \@icons ];
1522
$Handlers{rating}->[1] = sub( $$$ ) {
1523
my ($w, $e, $v) = @_;
1524
if (not ref $v or ref $v ne 'ARRAY') {
1525
warn "not writing rating whose content is not an array";
1528
my ($rating, $system, $icons) = @$v;
1529
if (defined $system) {
1530
$w->startTag($e, system => $system) if $w;
1533
$w->startTag($e) if $w;
1536
write_value($w, 'value', $rating) if $w;
1537
if ($w) { write_icon($w, 'icon', $_) foreach @$icons };
1538
$w->endTag($e) if $w;
1543
=item I<star-rating>
1545
In XML this is a string 'X/Y' plus a list of icons. In Perl represented
1546
as a pair [ rating, icons ] similar to I<rating>.
1548
Multiple star ratings are now supported. For backward compatability,
1549
you may specify a single [rating,icon] or the preferred double array
1550
[[rating,system,icon],[rating2,system2,icon2]] (like 'ratings')
1554
$Handlers{'star-rating'}->[0] = sub( $ ) {
1556
my %attrs = %{get_attrs($node)};
1557
my $system = delete $attrs{system} if exists $attrs{system};
1558
my @children = get_subelements($node);
1560
# First child node is value.
1561
my $value_node = shift @children;
1562
if (not defined $value_node) {
1563
warn "missing 'value' element inside star-rating";
1566
if ((my $name = get_name($value_node)) ne 'value') {
1567
warn "expected 'value' node inside star-rating, got '$name'";
1570
my $rating = read_value($value_node);
1572
# Remaining children are icons.
1573
my @icons = map { read_icon($_) } @children;
1575
return [ $rating, $system, \@icons ];
1577
$Handlers{'star-rating'}->[1] = sub ( $$$ ) {
1578
my ($w, $e, $v) = @_;
1580
# 10/31/2007 star-rating can now have multiple values (and system=)
1581
# let's make it so old code still works!
1583
if (not ref $v or ref $v ne 'ARRAY') {
1585
# warn "not writing star-rating whose content is not an array";
1588
my ($rating, $system, $icons) = @$v;
1589
if (defined $system) {
1590
$w->startTag($e, system => $system) if $w;
1593
$w->startTag($e) if $w;
1595
write_value($w, 'value', $rating) if $w;
1596
if ($w) { write_icon($w, 'icon', $_) foreach @$icons };
1597
$w->endTag($e) if $w;
1604
An icon in XMLTV files is like the <img> element in HTML. It is
1605
represented in Perl as a hashref with 'src' and optionally 'width'
1609
sub write_icon( $$$ ) {
1610
my ($w, $e, $v) = @_;
1611
croak "no 'src' attribute for icon\n" if not defined $v->{src};
1612
croak "bad width $v->{width} for icon\n"
1613
if defined $v->{width} and $v->{width} !~ /^\d+$/;
1614
croak "bad height $v->{height} for icon\n"
1615
if defined $v->{height} and $v->{height} !~ /^\d+$/;
1617
foreach (keys %$v) {
1618
warn "unrecognized key in icon: $_\n"
1619
if $_ ne 'src' and $_ ne 'width' and $_ ne 'height';
1622
$w->emptyTag($e, %$v);
1624
sub read_icon( $ ) {
1625
my $node = shift; die if not defined $node;
1626
my %attrs = %{get_attrs($node)};
1627
warn "missing 'src' attribute in icon" if not defined $attrs{src};
1630
$Handlers{icon}->[0] = \&read_icon;
1631
$Handlers{icon}->[1] = sub( $$$ ) {
1632
my ($w, $e, $v) = @_;
1633
write_icon($w, $e, $v) if $w;
1636
# To keep things tidy some elements that can have icons store their
1637
# textual content inside a subelement called 'value'. These two
1638
# routines are a bit trivial but they're here for consistency.
1640
sub read_value( $ ) {
1641
my $value_node = shift;
1642
my $v = get_text($value_node);
1643
if (not defined $v or $v eq '') {
1644
warn "no content of 'value' element";
1649
sub write_value( $$$ ) {
1650
my ($w, $e, $v) = @_;
1651
$w->dataElement($e, $v) if $w;
1655
# Booleans in XMLTV files are 'yes' or 'no'.
1656
sub decode_boolean( $ ) {
1658
if ($value eq 'no') {
1661
elsif ($value eq 'yes') {
1665
warn "bad boolean: $value";
1669
sub encode_boolean( $ ) {
1671
warn "expected a Perl boolean like 0 or 1, not '$v'\n"
1673
return $v ? 'yes' : 'no';
1681
In XML something like title can be either <title>Foo</title>
1682
or <title lang="en">Foo</title>. In Perl these are stored as
1683
[ 'Foo' ] and [ 'Foo', 'en' ]. For the former [ 'Foo', undef ]
1686
This handler also has two modifiers which may be added to the name
1687
after '/'. I</e> means that empty text is allowed, and will be
1688
returned as the empty tuple [], to mean that the element is present
1689
but has no text. When writing with I</e>, undef will also be
1690
understood as present-but-empty. You cannot however specify a
1691
language if the text is empty.
1693
The modifier I</m> means that the text is allowed to span multiple
1696
So for example I<with-lang/em> is a handler for text with language,
1697
where the text may be empty and may contain newlines. Note that the
1698
I<with-lang-or-empty> of earlier releases has been replaced by
1702
sub read_with_lang( $$$ ) {
1703
my ($node, $allow_empty, $allow_nl) = @_;
1704
die if not defined $node;
1705
my %attrs = %{get_attrs($node)};
1706
my $lang = $attrs{lang} if exists $attrs{lang};
1707
my $value = get_text($node, $allow_nl);
1708
if (not length $value) {
1709
if (not $allow_empty) {
1710
warn 'empty string for with-lang value';
1713
warn 'empty string may not have language' if defined $lang;
1716
if (defined $lang) {
1717
return [ $value, $lang ];
1723
$Handlers{'with-lang'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 0) };
1724
$Handlers{'with-lang/'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 0) };
1725
$Handlers{'with-lang/e'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 0) };
1726
$Handlers{'with-lang/m'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 1) };
1727
$Handlers{'with-lang/em'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) };
1728
$Handlers{'with-lang/me'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) };
1730
sub write_with_lang( $$$$$ ) {
1731
my ($w, $e, $v, $allow_empty, $allow_nl) = @_;
1732
if (not ref $v or ref $v ne 'ARRAY') {
1733
warn "not writing with-lang whose content is not an array";
1738
if (not $allow_empty) {
1739
warn "not writing no content for $e";
1745
my ($text, $lang) = @$v;
1746
t 'writing character data: ' . d $text;
1747
if (not defined $text) {
1748
warn "not writing undefined value for $e";
1753
# strip whitespace silently.
1754
# we used to use a warn, but later on the code catches this and drops the record
1756
my $old_text = $text;
1760
if (not length $text) {
1761
if (not $allow_empty) {
1762
warn "not writing empty content for $e";
1765
if (defined $lang) {
1766
warn "not writing empty content with language for $e";
1769
$w->emptyTag($e) if $w;
1773
if (not $allow_nl and $text =~ tr/\n//) {
1774
warn "not writing text containing newlines for $e";
1778
if (defined $lang) {
1779
$w->dataElement($e, $text, lang => $lang) if $w;
1782
$w->dataElement($e, $text) if $w;
1785
$Handlers{'with-lang'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) };
1786
$Handlers{'with-lang/'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) };
1787
$Handlers{'with-lang/e'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 0) };
1788
$Handlers{'with-lang/m'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 1) };
1789
$Handlers{'with-lang/em'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) };
1790
$Handlers{'with-lang/me'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) };
1793
foreach (keys %Handlers) {
1794
my $v = $Handlers{$_};
1796
or ref($v->[0]) ne 'CODE'
1797
or ref($v->[1]) ne 'CODE') {
1798
die "bad handler pair for $_\n";
1806
Now, which handlers are used for which subelements (keys) of channels
1807
and programmes? And what is the multiplicity (should you expect a
1808
single value or a list of values)?
1810
The following tables map subelements of <channel> and of <programme>
1811
to the handlers used to read and write them. Many elements have their
1812
own handler with the same name, and most of the others use
1813
I<with-lang>. The third column specifies the multiplicity of the
1814
element: B<*> (any number) will give a list of values in Perl, B<+>
1815
(one or more) will give a nonempty list, B<?> (maybe one) will give a
1816
scalar, and B<1> (exactly one) will give a scalar which is not undef.
1818
=head2 Handlers for <channel>
1823
=item display-name, I<with-lang>, B<+>
1825
=item icon, I<icon>, B<*>
1827
=item url, I<scalar>, B<*>
1832
=head2 Handlers for <programme>
1837
=item title, I<with-lang>, B<+>
1839
=item sub-title, I<with-lang>, B<*>
1841
=item desc, I<with-lang/m>, B<*>
1843
=item credits, I<credits>, B<?>
1845
=item date, I<scalar>, B<?>
1847
=item category, I<with-lang>, B<*>
1849
=item language, I<with-lang>, B<?>
1851
=item orig-language, I<with-lang>, B<?>
1853
=item length, I<length>, B<?>
1855
=item icon, I<icon>, B<*>
1857
=item url, I<scalar>, B<*>
1859
=item country, I<with-lang>, B<*>
1861
=item episode-num, I<episode-num>, B<*>
1863
=item video, I<video>, B<?>
1865
=item audio, I<audio>, B<?>
1867
=item previously-shown, I<previously-shown>, B<?>
1869
=item premiere, I<with-lang/em>, B<?>
1871
=item last-chance, I<with-lang/em>, B<?>
1873
=item new, I<presence>, B<?>
1875
=item subtitles, I<subtitles>, B<*>
1877
=item rating, I<rating>, B<*>
1879
=item star-rating, I<star-rating>, B<*>
1884
At present, no parsing or validation on dates is done because dates
1885
may be partially specified in XMLTV. For example '2001' means that
1886
the year is known but not the month, day or time of day. Maybe in the
1887
future dates will be automatically converted to and from
1888
B<Date::Manip> objects. For now they just use the I<scalar> handler.
1889
Similar remarks apply to URLs.
1893
sub node_to_programme( $ ) {
1894
my $node = shift; die if not defined $node;
1897
# Attributes of programme element.
1898
%programme = %{get_attrs($node)};
1899
t 'attributes: ' . d \%programme;
1901
# Check the required attributes are there. As with most checking,
1902
# this isn't an alternative to using a validator but it does save
1903
# some headscratching during debugging.
1905
foreach (qw(start channel)) {
1906
if (not defined $programme{$_}) {
1907
warn "programme missing '$_' attribute\n";
1911
my @known_attrs = map { $_->[0] } @Programme_Attributes;
1912
my %ka; ++$ka{$_} foreach @known_attrs;
1913
foreach (keys %programme) {
1915
warn "deleting unknown attribute '$_'";
1916
delete $programme{$_};
1920
call_handlers_read($node, \@Programme_Handlers, \%programme);
1926
sub node_to_channel( $ ) {
1927
my $node = shift; die if not defined $node;
1929
t 'node_to_channel() ENTRY';
1931
%channel = %{get_attrs($node)};
1932
t 'attributes: ' . d \%channel;
1933
if (not defined $channel{id}) {
1934
warn "channel missing 'id' attribute\n";
1936
foreach (keys %channel) {
1937
unless (/^_/ or $_ eq 'id') {
1938
warn "deleting unknown attribute '$_'";
1939
delete $channel{$_};
1943
t '\@Channel_Handlers=' . d \@Channel_Handlers;
1944
call_handlers_read($node, \@Channel_Handlers, \%channel);
1952
# call_handlers_read()
1954
# Read the subelements of a node according to a list giving a
1955
# handler subroutine for each subelement.
1959
# Reference to list of handlers: tuples of
1960
# [element-name, handler-name, multiplicity]
1961
# Reference to hash for storing results
1963
# Warns if errors, but attempts to contine.
1965
sub call_handlers_read( $$$ ) {
1966
my ($node, $handlers, $r) = @_;
1967
t 'call_handlers_read() using handlers: ' . d $handlers;
1969
die unless ref($r) eq 'HASH';
1970
our %r; local *r = $r;
1971
t 'going through each child of node';
1973
# Current position in handlers. We expect to read the subelements
1974
# in the correct order as specified by the DTD.
1976
my $handler_pos = 0;
1978
SUBELEMENT: foreach (get_subelements($node)) {
1979
t 'doing subelement';
1980
my $name = get_name($_);
1981
t "tag name: $name";
1983
# Search for a handler - from $handler_pos onwards. But
1984
# first, just warn if somebody is trying to use an element in
1985
# the wrong place (trying to go backwards in the list).
1988
foreach my $i (0 .. $handler_pos - 1) {
1989
if ($name eq $handlers->[$i]->[0]) {
1990
warn "element $name not expected here";
1994
for (my $i = $handler_pos; $i < @$handlers; $i++) {
1995
if ($handlers->[$i]->[0] eq $name) {
2001
t "doesn't match name $handlers->[$i]->[0]";
2002
my ($handler_name, $h, $multiplicity)
2003
= @{$handlers->[$i]};
2004
die if not defined $handler_name;
2005
die if $handler_name eq '';
2007
# Before we skip over this element, check that we got
2008
# the necessary values for it.
2010
if ($multiplicity eq '?') {
2011
# Don't need to check whether this set.
2013
elsif ($multiplicity eq '1') {
2014
if (not defined $r{$handler_name}) {
2015
warn "no element $handler_name found";
2018
elsif ($multiplicity eq '*') {
2019
# It's okay if nothing was ever set. We don't
2020
# insist on putting in an empty list.
2023
elsif ($multiplicity eq '+') {
2024
if (not defined $r{$handler_name}) {
2025
warn "no element $handler_name found";
2027
elsif (not @{$r{$handler_name}}) {
2028
warn "strangely, empty list for $handler_name";
2032
warn "bad value of $multiplicity: $!";
2036
if (not defined $found_pos) {
2037
warn "unknown element $name";
2040
# Next time we begin searching from this position.
2041
$handler_pos = $found_pos;
2044
t 'calling handler';
2045
my ($handler_name, $h_name, $multiplicity)
2046
= @{$handlers->[$found_pos]};
2047
die if $handler_name ne $name;
2048
my $h = $Handlers{$h_name}; die "no handler $h_name" if not $h;
2049
my $result = $h->[0]->($_); # call reader sub
2050
t 'result: ' . d $result;
2051
warn("skipping bad $name\n"), next if not defined $result;
2053
# Now set the value. We can't do multiplicity checking yet
2054
# because there might be more elements of this type still to
2057
if ($multiplicity eq '?' or $multiplicity eq '1') {
2058
warn "seen $name twice"
2059
if defined $r{$name};
2060
$r{$name} = $result;
2062
elsif ($multiplicity eq '*' or $multiplicity eq '+') {
2063
push @{$r{$name}}, $result;
2066
warn "bad multiplicity: $multiplicity";
2071
sub warn_unknown_keys( $$ ) {
2072
my $elem_name = shift;
2073
our %k; local *k = shift;
2076
or $warned_unknown_key{$elem_name}->{$_}++
2077
or warn "unknown key $_ in $elem_name hash\n";
2081
package XMLTV::Writer;
2082
use base 'XML::Writer';
2085
use Date::Manip qw/UnixDate DateCalc/;
2087
# Use Log::TraceMessages if installed.
2089
eval { require Log::TraceMessages };
2095
*t = \&Log::TraceMessages::t;
2096
*d = \&Log::TraceMessages::d;
2101
Date::Manip::Date_Init("TZ=UTC");
2104
# Override dataElement() to refuse writing empty or whitespace
2107
sub dataElement( $$$@ ) {
2108
my ($self, $elem, $content, @rest) = @_;
2109
if ($content !~ /\S/) {
2110
warn "not writing empty content for $elem";
2113
return $self->SUPER::dataElement($elem, $content, @rest);
2120
When reading a file you have the choice of using C<parse()> to gulp
2121
the whole file and return a data structure, or using
2122
C<parse_callback()> to get the programmes one at a time, although
2123
channels and other data are still read all at once.
2125
There is a similar choice when writing data: the C<write_data()>
2126
routine prints a whole XMLTV document at once, but if you want to
2127
write an XMLTV document incrementally you can manually create an
2128
C<XMLTV::Writer> object and call methods on it. Synopsis:
2131
my $w = new XMLTV::Writer();
2132
$w->comment("Hello from XML::Writer's comment() method");
2133
$w->start({ 'generator-info-name' => 'Example code in pod' });
2134
my %ch = (id => 'test-channel', 'display-name' => [ [ 'Test', 'en' ] ]);
2135
$w->write_channel(\%ch);
2136
my %prog = (channel => 'test-channel', start => '200203161500',
2137
title => [ [ 'News', 'en' ] ]);
2138
$w->write_programme(\%prog);
2141
XMLTV::Writer inherits from XML::Writer, and provides the following extra
2142
or overridden methods:
2146
=item new(), the constructor
2148
Creates an XMLTV::Writer object and starts writing an XMLTV file, printing
2149
the DOCTYPE line. Arguments are passed on to XML::WriterE<39>s constructor,
2150
except for the following:
2152
the 'encoding' key if present gives the XML character encoding.
2155
my $w = new XMLTV::Writer(encoding => 'ISO-8859-1');
2157
If encoding is not specified, XML::WriterE<39>s default is used
2160
XMLTW::Writer can also filter out specific days from the data. This is
2161
useful if the datasource provides data for periods of time that does not
2162
match the days that the user has asked for. The filtering is controlled
2163
with the days, offset and cutoff arguments:
2165
my $w = new XMLTV::Writer(
2168
cutoff => "050000" );
2170
In this example, XMLTV::Writer will discard all entries that do not have
2171
starttimes larger than or equal to 05:00 tomorrow and less than 05:00
2172
two days after tomorrow. The time offset is stripped off the starttime before
2173
the comparison is made.
2178
my $class = ref($proto) || $proto;
2180
croak 'OUTPUT requires a filehandle, not a filename or anything else'
2181
if exists $args{OUTPUT} and not ref $args{OUTPUT};
2182
my $encoding = delete $args{encoding};
2183
my $days = delete $args{days};
2184
my $offset = delete $args{offset};
2185
my $cutoff = delete $args{cutoff};
2187
my $self = $class->SUPER::new(DATA_MODE => 1, DATA_INDENT => 2, %args);
2188
bless($self, $class);
2190
if (defined $encoding) {
2191
$self->xmlDecl($encoding);
2194
# XML::Writer puts in 'encoding="UTF-8"' even if you don't ask
2197
warn "assuming default UTF-8 encoding for output\n";
2201
# $Log::TraceMessages::On = 1;
2202
$self->{mintime} = "19700101000000";
2203
$self->{maxtime} = "29991231235959";
2206
if (defined( $days ) and defined( $offset ) and defined( $cutoff )) {
2207
$self->{mintime} = UnixDate(
2208
DateCalc( "today", "+" . $offset . " days" ),
2209
"%Y%m%d") . $cutoff;
2210
t "using mintime $self->{mintime}";
2212
$self->{maxtime} = UnixDate(
2213
DateCalc("today", "+" . $offset+$days . " days"),
2214
"%Y%m%d" ) . $cutoff;
2215
t "using maxtime $self->{maxtime}";
2217
elsif (defined( $days ) or defined( $offset ) or defined($cutoff)) {
2218
croak 'You must specify days, offset and cutoff or none of them';
2222
local $^W = 0; $self->doctype('tv', undef, 'xmltv.dtd');
2224
$self->{xmltv_writer_state} = 'new';
2232
Write the start of the <tv> element. Parameter is a hashref which gives
2233
the attributes of this element.
2238
die 'usage: XMLTV::Writer->start(hashref of attrs)' if @_ != 1;
2241
for ($self->{xmltv_writer_state}) {
2245
elsif ($_ eq 'channels' or $_ eq 'programmes') {
2246
croak 'cannot call start() more than once on XMLTV::Writer';
2248
elsif ($_ eq 'end') {
2249
croak 'cannot do anything with end()ed XMLTV::Writer';
2255
$self->startTag('tv', order_attrs(%{$attrs}));
2260
=item write_channels()
2262
Write several channels at once. Parameter is a reference to a hash
2263
mapping channel id to channel details. They will be written sorted
2264
by id, which is reasonable since the order of channels in an XMLTV
2265
file isnE<39>t significant.
2268
sub write_channels {
2269
my ($self, $channels) = @_;
2270
t('write_channels(' . d($self) . ', ' . d($channels) . ') ENTRY');
2271
croak 'expected hashref of channels' if ref $channels ne 'HASH';
2273
for ($self->{xmltv_writer_state}) {
2275
croak 'must call start() on XMLTV::Writer first';
2277
elsif ($_ eq 'channels') {
2280
elsif ($_ eq 'programmes') {
2281
croak 'cannot write channels after writing programmes';
2283
elsif ($_ eq 'end') {
2284
croak 'cannot do anything with end()ed XMLTV::Writer';
2289
my @ids = sort keys %$channels;
2290
t 'sorted list of channel ids: ' . d \@ids;
2292
t "writing channel with id $_";
2293
my $ch = $channels->{$_};
2294
$self->write_channel($ch);
2296
t('write_channels() EXIT');
2301
=item write_channel()
2303
Write a single channel. You can call this routine if you want, but
2304
most of the time C<write_channels()> is a better interface.
2308
my ($self, $ch) = @_;
2309
croak 'undef channel hash passed' if not defined $ch;
2310
croak "expected a hashref, got: $ch" if ref $ch ne 'HASH';
2312
for ($self->{xmltv_writer_state}) {
2314
croak 'must call start() on XMLTV::Writer first';
2316
elsif ($_ eq 'channels') {
2319
elsif ($_ eq 'programmes') {
2320
croak 'cannot write channels after writing programmes';
2322
elsif ($_ eq 'end') {
2323
croak 'cannot do anything with end()ed XMLTV::Writer';
2328
my %ch = %$ch; # make a copy
2329
my $id = delete $ch{id};
2330
die "no 'id' attribute in channel" if not defined $id;
2331
write_element_with_handlers($self, 'channel', { id => $id },
2332
\@XMLTV::Channel_Handlers, \%ch);
2337
=item write_programme()
2339
Write details for a single programme as XML.
2342
sub write_programme {
2344
die 'usage: XMLTV::Writer->write_programme(programme hash)' if @_ != 1;
2346
croak 'write_programme() expects programme hashref'
2347
if ref $ref ne 'HASH';
2348
t('write_programme(' . d($self) . ', ' . d($ref) . ') ENTRY');
2350
for ($self->{xmltv_writer_state}) {
2352
croak 'must call start() on XMLTV::Writer first';
2354
elsif ($_ eq 'channels') {
2357
elsif ($_ eq 'programmes') {
2360
elsif ($_ eq 'end') {
2361
croak 'cannot do anything with end()ed XMLTV::Writer';
2366
# We make a copy of the programme hash and delete elements from it
2367
# as they are dealt with; then we can easily spot any unhandled
2368
# elements at the end.
2372
# First deal with those hash keys that refer to metadata on when
2373
# the programme is broadcast. After taking those out of the hash,
2374
# we can use the handlers to output individual details.
2377
die if not @XMLTV::Programme_Attributes;
2378
foreach (@XMLTV::Programme_Attributes) {
2379
my ($name, $mult) = @$_;
2380
t "looking for key $name";
2381
my $val = delete $p{$name};
2383
# No need to check anything.
2385
elsif ($mult eq '1') {
2386
if (not defined $val) {
2387
warn "programme hash missing $name key, skipping";
2391
else { die "bad multiplicity for attribute: $mult" }
2392
$attrs{$name} = $val if defined $val;
2395
# We use string comparisons without timeoffsets for comparing times.
2396
my( $start ) = split( /\s+/, $attrs{start} );
2397
if( $start lt $self->{mintime} or
2398
$start ge $self->{maxtime} ) {
2399
t "skipping programme with start $attrs{start}";
2403
t "beginning 'programme' element";
2404
write_element_with_handlers($self, 'programme', \%attrs,
2405
\@XMLTV::Programme_Handlers, \%p);
2412
Say youE<39>ve finished writing programmes. This ends the <tv> element
2419
for ($self->{xmltv_writer_state}) {
2421
croak 'must call start() on XMLTV::Writer first';
2423
elsif ($_ eq 'channels' or $_ eq 'programmes') {
2426
elsif ($_ eq 'end') {
2427
croak 'cannot do anything with end()ed XMLTV::Writer';
2432
$self->endTag('tv');
2433
$self->SUPER::end(@_);
2440
# In XML the order of attributes is not significant. But to make
2441
# things look nice we try to output them in the same order as given in
2444
# Takes a list of (key, value, key, value, ...) and returns one with
2445
# keys in a nice-looking order.
2448
die "expected even number of elements, from a hash"
2450
my @a = ((map { $_->[0] } (@XMLTV::Channel_Attributes,
2451
@XMLTV::Programme_Attributes)),
2452
qw(date source-info-url source-info-name source-data-url
2453
generator-info-name generator-info-url));
2458
if (exists $in{$_}) {
2459
my $v = delete $in{$_};
2464
foreach (sort keys %in) {
2465
warn "unknown attribute $_" unless /^_/;
2466
push @r, $_, $in{$_};
2475
# Writes the elements of a hash to an XMLTV::Writer using a list of
2476
# handlers. Deletes keys (modifying the hash passed in) as they are
2479
# Requires all mandatory keys be present in the hash - if you're not
2480
# sure then use check_multiplicity() first.
2482
# Returns true if the element was successfully written, or if any
2483
# errors found don't look serious enough to cause bad XML. If the
2484
# XML::Writer object passed in is undef then nothing is written (since
2485
# the write handlers are coded like that.)
2487
sub call_handlers_write( $$$ ) {
2488
my ($self, $handlers, $input) = @_;
2489
t 'writing input hash: ' . d $input;
2490
die if not defined $input;
2493
foreach (@$handlers) {
2494
my ($name, $h_name, $multiplicity) = @$_;
2495
my $h = $XMLTV::Handlers{$h_name}; die "no handler $h_name" if not $h;
2496
my $writer = $h->[1]; die if not defined $writer;
2497
t "doing handler for $name$multiplicity";
2498
local $SIG{__WARN__} = sub {
2499
warn "$name element: $_[0]";
2502
my $val = delete $input->{$name};
2503
t 'got value(s): ' . d $val;
2504
if ($multiplicity eq '1') {
2505
$writer->($self, $name, $val);
2507
elsif ($multiplicity eq '?') {
2508
$writer->($self, $name, $val) if defined $val;
2510
elsif ($multiplicity eq '*' or $multiplicity eq '+') {
2511
croak "value for key $name should be an array ref"
2512
if defined $val and ref $val ne 'ARRAY';
2514
t 'writing value: ' . d $_;
2515
$writer->($self, $name, $_);
2516
t 'finished writing multiple values';
2520
warn "bad multiplicity specifier: $multiplicity";
2523
t 'leftover keys: ' . d([ sort keys %$input ]);
2530
# Warns about missing keys that are supposed to be mandatory. Returns
2531
# true iff everything is okay.
2533
sub check_multiplicity( $$ ) {
2534
my ($handlers, $input) = @_;
2535
foreach (@$handlers) {
2536
my ($name, $h_name, $multiplicity) = @$_;
2537
t "checking handler for $name: $h_name with multiplicity $multiplicity";
2538
if ($multiplicity eq '1') {
2539
if (not defined $input->{$name}) {
2540
warn "hash missing value for $name";
2544
elsif ($multiplicity eq '?') {
2545
# Okay if not present.
2547
elsif ($multiplicity eq '*') {
2548
# Not present, or undef, is treated as empty list.
2550
elsif ($multiplicity eq '+') {
2551
t 'one or more, checking for a listref with no undef values';
2552
my $val = $input->{$name};
2553
if (not defined $val) {
2554
warn "hash missing value for $name (expected list)";
2557
if (ref($val) ne 'ARRAY') {
2558
die "hash has bad contents for $name (expected list)";
2562
t 'all values: ' . d $val;
2563
my @new_val = grep { defined } @$val;
2564
t 'values that are defined: ' . d \@new_val;
2565
if (@new_val != @$val) {
2566
warn "hash had some undef elements in list for $name, removed";
2571
warn "hash has empty list of $name properties (expected at least one)";
2576
warn "bad multiplicity specifier: $multiplicity";
2585
# Write a complete element with attributes, and subelements written
2586
# using call_handlers_write(). The advantage over doing it by hand is
2587
# that if some required keys are missing, nothing is written (rather
2588
# than an incomplete and invalid element).
2590
sub write_element_with_handlers( $$$$$ ) {
2591
my ($w, $name, $attrs, $handlers, $hash) = @_;
2592
if (not check_multiplicity($handlers, $hash)) {
2593
warn "keys missing in $name hash, not writing";
2597
# Special 'debug' keys written as comments inside the element.
2599
foreach (grep /^debug/, keys %$hash) {
2600
$debug_keys{$_} = delete $hash->{$_};
2603
# Call all the handlers with no writer object and make sure
2606
if (not call_handlers_write(undef, $handlers, { %$hash })) {
2607
warn "bad data inside $name element, not writing\n";
2611
$w->startTag($name, order_attrs(%$attrs));
2612
foreach (sort keys %debug_keys) {
2613
my $val = $debug_keys{$_};
2614
$w->comment((defined $val) ? "$_: $val" : $_);
2616
call_handlers_write($w, $handlers, $hash);
2617
XMLTV::warn_unknown_keys($name, $hash);
2627
Ed Avis, ed@membled.com
2631
The file format is defined by the DTD xmltv.dtd, which is included in
2632
the xmltv package along with this module. It should be installed in
2633
your systemE<39>s standard place for SGML and XML DTDs.
2635
The xmltv package has a web page at
2636
<http://membled.com/work/apps/xmltv/> which carries
2637
information about the file format and the various tools and apps which
2638
are distributed with this module.