2
# $Id: IMDB.pm,v 1.53 2007/01/12 20:08:47 jveldhuis Exp $
4
# The IMDB file contains two packages:
5
# 1. XMLTV::IMDB::Cruncher package which parses and manages IMDB "lists" files
7
# 2. XMLTV::IMDB package that uses data files from the Cruncher package to
8
# update/add details to XMLTV programme nodes.
10
# FUTURE - multiple hits on the same 'title only' could try and look for
11
# character names matching from description to imdb.com character
14
# FUTURE - multiple hits on 'title only' should probably pick latest
15
# tv series over any older ones. May make for better guesses.
17
# BUG - we identify 'presenters' by the word "Host" appearing in the character
18
# description. For some movies, character names include the word Host.
19
# ex. Animal, The (2001) has a character named "Badger Milk Host".
21
# BUG - if there is a matching title with > 1 entry (say made for tv-movie and
22
# at tv-mini series) made in the same year (or even "close" years) it is
23
# possible for us to pick the wrong one we should pick the one with the
24
# closest year, not just the first closest match based on the result ordering
25
# for instance Ghost Busters was made in 1984, and into a tv series in
26
# 1986. if we have a list of GhostBusters 1983, we should pick the 1984 movie
27
# and not 1986 tv series...maybe :) but currently we'll pick the first
28
# returned close enough match instead of trying the closest date match of
41
my $self={ @_ }; # remaining args become attributes
43
for ('imdbDir', 'verbose') {
44
die "invalid usage - no $_" if ( !defined($self->{$_}));
47
$self->{replaceDates}=0 if ( !defined($self->{replaceDates}));
48
$self->{replaceTitles}=0 if ( !defined($self->{replaceTitles}));
49
$self->{replaceCategories}=0 if ( !defined($self->{replaceCategories}));
50
$self->{replaceURLs}=0 if ( !defined($self->{replaceURLs}));
51
$self->{replaceDirectors}=1 if ( !defined($self->{replaceDirectors}));
52
$self->{replaceActors}=0 if ( !defined($self->{replaceActors}));
53
$self->{replacePresentors}=1 if ( !defined($self->{replacePresentors}));
54
$self->{replaceCommentators}=1 if ( !defined($self->{replaceCommentators}));
55
$self->{replaceStarRatings}=0 if ( !defined($self->{replaceStarRatings}));
57
$self->{updateDates}=1 if ( !defined($self->{updateDates}));
58
$self->{updateTitles}=1 if ( !defined($self->{updateTitles}));
59
$self->{updateCategories}=1 if ( !defined($self->{updateCategories}));
60
$self->{updateCategoriesWithGenres}=1 if ( !defined($self->{updateCategoriesWithGenres}));
61
$self->{updateURLs}=1 if ( !defined($self->{updateURLs}));
62
$self->{updateDirectors}=1 if ( !defined($self->{updateDirectors}));
63
$self->{updateActors}=1 if ( !defined($self->{updateActors}));
64
$self->{updatePresentors}=1 if ( !defined($self->{updatePresentors}));
65
$self->{updateCommentators}=1 if ( !defined($self->{updateCommentators}));
66
$self->{updateStarRatings}=1 if ( !defined($self->{updateStarRatings}));
68
$self->{moviedbIndex}="$self->{imdbDir}/moviedb.idx";
69
$self->{moviedbData}="$self->{imdbDir}/moviedb.dat";
70
$self->{moviedbInfo}="$self->{imdbDir}/moviedb.info";
71
$self->{moviedbOffline}="$self->{imdbDir}/moviedb.offline";
73
# default is not to cache lookups
74
$self->{cacheLookups}=0 if ( !defined($self->{cacheLookups}) );
75
$self->{cacheLookupSize}=0 if ( !defined($self->{cacheLookupSize}) );
77
$self->{cachedLookups}->{tv_series}->{_cacheSize_}=0;
81
$self->{categories}={'movie' =>'Movie',
82
'tv_movie' =>'TV Movie', # made for tv
83
'video_movie' =>'Video Movie', # went straight to video or was made for it
84
'tv_series' =>'TV Series',
85
'tv_mini_series' =>'TV Mini Series'};
87
$self->{stats}->{programCount}=0;
89
for my $cat (keys %{$self->{categories}}) {
90
$self->{stats}->{perfect}->{$cat}=0;
91
$self->{stats}->{close}->{$cat}=0;
93
$self->{stats}->{perfectMatches}=0;
94
$self->{stats}->{closeMatches}=0;
96
$self->{stats}->{startTime}=time();
106
open(INFO, "< $file") || return("imdbDir index file \"$file\":$!\n");
109
if ( s/^([^:]+)://o ) {
117
sub checkIndexesOkay($)
120
if ( ! -d "$self->{imdbDir}" ) {
121
return("imdbDir \"$self->{imdbDir}\" does not exist\n");
124
if ( -f "$self->{moviedbOffline}" ) {
125
return("imdbDir index offline: check $self->{moviedbOffline} for details");
128
for my $file ($self->{moviedbIndex}, $self->{moviedbData}, $self->{moviedbInfo}) {
129
if ( ! -f "$file" ) {
130
return("imdbDir index file \"$file\" does not exist\n");
134
$VERSION=~m/^(\d+)\.(\d+)$/o || die "package corrupt, VERSION string invalid ($VERSION)";
135
my ($major, $minor)=($1, $2);
137
my $info=loadDBInfo($self->{moviedbInfo});
138
return($info) if ( ref $info eq 'SCALAR' );
140
if ( !defined($info->{db_version}) ) {
141
return("imdbDir index db missing version information, rerun --prepStage all\n");
143
if ( $info->{db_version}=~m/^(\d+)\.(\d+)$/o ) {
144
if ( $1 != $major || $minor < $2 ) {
145
return("imdbDir index db requires updating, rerun --prepStage all\n");
147
if ( $1 == 0 && $2 == 1 ) {
148
return("imdbDir index db requires update, rerun --prepStage 5 (bug:actresses never appear)\n");
150
if ( $1 == 0 && $2 == 2 ) {
151
# 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run
152
return("imdbDir index db requires minor reindexing, rerun --prepStage 3 and 5\n");
154
if ( $1 == 0 && $2 == 3 ) {
155
# 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run
156
return("imdbDir index db requires major reindexing, rerun --prepStage 2 and new prepStages 5,6 and 7\n");
158
if ( $1 == 0 && $2 == 4 ) {
159
# 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run
160
return("imdbDir index db corrupt (got version 0.4), rerun --prepStage all\n");
166
return("imdbDir index version of '$info->{db_version}' is invalid, rerun --prepStage all\n".
167
"if problem persists, submit bug report to xmltv-devel\@lists.sf.net\n");
171
sub basicVerificationOfIndexes($)
175
# check that the imdbdir is invalid and up and running
176
my $title="Army of Darkness";
179
$self->openMovieIndex() || return("basic verification of indexes failed\n".
180
"database index isn't readable");
182
my $res=$self->getMovieMatches($title, $year);
183
if ( !defined($res) ) {
184
$self->closeMovieIndex();
185
return("basic verification of indexes failed\n".
186
"no match for basic verification of movie \"$title, $year\"\n");
188
if ( !defined($res->{exactMatch}) ) {
189
$self->closeMovieIndex();
190
return("basic verification of indexes failed\n".
191
"no exact match for movie \"$title, $year\"\n");
193
if ( scalar(@{$res->{exactMatch}})!= 1) {
194
$self->closeMovieIndex();
195
return("basic verification of indexes failed\n".
196
"got more than one exact match for movie \"$title, $year\"\n");
198
my @exact=@{$res->{exactMatch}};
199
if ( $exact[0]->{title} ne $title ) {
200
$self->closeMovieIndex();
201
return("basic verification of indexes failed\n".
202
"title associated with key \"$title, $year\" is bad\n");
205
if ( $exact[0]->{year} ne "$year" ) {
206
$self->closeMovieIndex();
207
return("basic verification of indexes failed\n".
208
"year associated with key \"$title, $year\" is bad\n");
211
my $id=$exact[0]->{id};
212
$res=$self->getMovieIdDetails($id);
213
if ( !defined($res) ) {
214
$self->closeMovieIndex();
215
return("basic verification of indexes failed\n".
216
"no movie details for movie \"$title, $year\" (id=$id)\n");
219
if ( !defined($res->{directors}) ) {
220
$self->closeMovieIndex();
221
return("basic verification of indexes failed\n".
222
"movie details didn't provide any director for movie \"$title, $year\" (id=$id)\n");
224
if ( !$res->{directors}[0]=~m/Raimi/o ) {
225
$self->closeMovieIndex();
226
return("basic verification of indexes failed\n".
227
"movie details didn't show Raimi as the main director for movie \"$title, $year\" (id=$id)\n");
229
if ( !defined($res->{actors}) ) {
230
$self->closeMovieIndex();
231
return("basic verification of indexes failed\n".
232
"movie details didn't provide any cast movie \"$title, $year\" (id=$id)\n");
234
if ( !$res->{actors}[0]=~m/Campbell/o ) {
235
$self->closeMovieIndex();
236
return("basic verification of indexes failed\n".
237
"movie details didn't show Bruce Campbell as the main actor in movie \"$title, $year\" (id=$id)\n");
239
if ( $res->{genres}[0] ne "Action" ||
240
$res->{genres}[1] ne "Adventure" ||
241
$res->{genres}[2] ne "Comedy" ||
242
$res->{genres}[3] ne "Fantasy" ||
243
$res->{genres}[4] ne "Horror" ) {
244
$self->closeMovieIndex();
245
return("basic verification of indexes failed\n".
246
"movie details didn't show genres correctly for movie \"$title, $year\" (id=$id)\n");
248
if ( !defined($res->{ratingDist}) ||
249
!defined($res->{ratingVotes}) ||
250
!defined($res->{ratingRank}) ) {
251
$self->closeMovieIndex();
252
return("basic verification of indexes failed\n".
253
"movie details didn't show imdbratings for movie \"$title, $year\" (id=$id)\n");
255
$self->closeMovieIndex();
260
sub sanityCheckDatabase($)
265
$errline=$self->checkIndexesOkay();
266
return($errline) if ( defined($errline) );
267
$errline=$self->basicVerificationOfIndexes();
268
return($errline) if ( defined($errline) );
276
print STDERR "tv_imdb: $_[1]\n";
281
if ( $_[0]->{verbose} ) {
282
print STDERR "tv_imdb: $_[1]\n";
290
if ( $self->{verbose} > 1 ) {
291
print STDERR "tv_imdb: $mess\n";
297
sub openMovieIndex($)
301
if ( !open($self->{INDEX_FD}, "< $self->{moviedbIndex}") ) {
304
if ( !open($self->{DBASE_FD}, "< $self->{moviedbData}") ) {
305
close($self->{INDEX_FD});
311
sub closeMovieIndex($)
315
close($self->{INDEX_FD});
316
delete($self->{INDEX_FD});
318
close($self->{DBASE_FD});
319
delete($self->{DBASE_FD});
324
# moviedbIndex file has the format:
326
# where key is a url encoded title followed by the year of production and a colon
327
sub getMovieMatches($$$)
333
# Articles are put at the end of a title ( in all languages )
334
#$match=~s/^(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/og;
337
if ( defined($year) ) {
341
# to encode s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg
342
# to decode s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
346
$match=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg;
348
$self->debug("looking for \"$match\" in $self->{moviedbIndex}");
349
if ( !$self->{INDEX_FD} ) {
350
die "internal error: index not open";
353
my $FD=$self->{INDEX_FD};
354
Search::Dict::look(*{$FD}, $match, 0, 0);
357
last if ( !m/^$match/ );
360
my @arr=split('\t', $_);
361
if ( scalar(@arr) != 5 ) {
362
warn "$self->{moviedbIndex} corrupt (correct key:$_)";
366
if ( $arr[0] eq $match ) {
367
# return title and id
368
#$arr[1]=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
370
#$arr[0]=~s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
371
#$self->debug("exact:$arr[1] ($arr[2]) qualifier=$arr[3] id=$arr[4]");
373
if ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\)$//o ) {
375
elsif ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\/[IVX]+\)$//o ) {
378
die "unable to decode year from title key \"$title\", report to xmltv-devel\@lists.sf.net";
380
$title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
381
$self->debug("exact:$title ($arr[2]) qualifier=$arr[3] id=$arr[4]");
382
push(@{$results->{exactMatch}}, {'key'=> $arr[1],
385
'qualifier'=>$arr[3],
390
#s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
392
#$arr[1]=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
393
#$arr[0]=~s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
394
#$self->debug("close:$arr[1] ($arr[2]) qualifier=$arr[3] id=$arr[4]");
397
if ( $title=~m/^\"/o && $title=~m/\"\s*\(/o ) { #"
399
$title=~s/\"(\s*\()/$1/o; #"
402
if ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\)$//o ) {
404
elsif ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\/[IVX]+\)$//o ) {
407
die "unable to decode year from title key \"$title\", report to xmltv-devel\@lists.sf.net";
409
$title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
410
$self->debug("close:$title ($arr[2]) qualifier=$arr[3] id=$arr[4]");
411
push(@{$results->{closeMatch}}, {'key'=> $arr[1],
414
'qualifier'=>$arr[3],
418
#print "MovieMatches on ($match) = ".Dumper($results)."\n";
422
sub getMovieExactMatch($$$)
427
my $res=$self->getMovieMatches($title, $year);
429
return(undef) if ( !defined($res) );
430
if ( !defined($res->{exactMatch}) ) {
433
if ( scalar(@{$res->{exactMatch}}) != 1 ) {
436
return($res->{exactMatch}[0]);
439
sub getMovieCloseMatches($$)
444
my $res=$self->getMovieMatches($title, undef) || return(undef);
446
if ( defined($res->{exactMatch})) {
447
die "corrupt imdb database - hit on \"$title\"";
449
return(undef) if ( !defined($res->{closeMatch}) );
450
my @arr=@{$res->{closeMatch}};
451
#print "CLOSE DUMP=".Dumper(@arr)."\n";
455
sub getMovieIdDetails($$)
460
if ( !$self->{DBASE_FD} ) {
461
die "internal error: index not open";
464
my $FD=$self->{DBASE_FD};
465
Search::Dict::look(*{$FD}, "$id:", 0, 0);
467
last if ( !m/^$id:/ );
470
my ($directors, $actors, $genres, $ratingDist, $ratingVotes, $ratingRank)=split('\t', $_);
471
if ( $directors ne "<>" ) {
472
for my $name (split('\|', $directors)) {
473
# remove (I) etc from imdb.com names (kept in place for reference)
474
$name=~s/\s\([IVX]+\)$//o;
475
# switch name around to be surname last
476
$name=~s/^([^,]+),\s*(.*)$/$2 $1/o;
477
push(@{$results->{directors}}, $name);
480
if ( $actors ne "<>" ) {
481
for my $name (split('\|', $actors)) {
482
# remove (I) etc from imdb.com names (kept in place for reference)
484
if ( $name=~s/\[([^\]]+)\]$//o ) {
487
$name=~s/\s\([IVX]+\)$//o;
489
# switch name around to be surname last
490
$name=~s/^([^,]+),\s*(.*)$/$2 $1/o;
491
if ( $HostNarrator ) {
492
if ( $HostNarrator=~s/,*Host//o ) {
493
push(@{$results->{presenter}}, $name);
495
if ( $HostNarrator=~s/,*Narrator//o ) {
496
push(@{$results->{commentator}}, $name);
500
push(@{$results->{actors}}, $name);
504
if ( $genres ne "<>" ) {
505
push(@{$results->{genres}}, split('\|', $genres));
507
$results->{ratingDist}=$ratingDist if ( $ratingDist ne "<>" );
508
$results->{ratingVotes}=$ratingVotes if ( $ratingVotes ne "<>" );
509
$results->{ratingRank}=$ratingRank if ( $ratingRank ne "<>" );
512
warn "lookup of movie (id=$id) resulted in garbage ($_)";
515
if ( !defined($results) ) {
516
# some movies we don't have any details for
517
$results->{noDetails}=1;
519
#print "MovieDetails($id) = ".Dumper($results)."\n";
524
# FUTURE - close hit could be just missing or extra
526
# "Run Silent, Run Deep" for imdb's "Run Silent Run Deep"
527
# "Cherry, Harry and Raquel" for imdb's "Cherry, Harry and Raquel!"
528
# "Cat Women of the Moon" for imdb's "Cat-Women of the Moon"
529
# "Baywatch Hawaiian Wedding" for imdb's "Baywatch: Hawaiian Wedding" :)
531
# FIXED - "Victoria and Albert" appears for imdb's "Victoria & Albert" (and -> &)
532
# FIXED - "Columbo Cries Wolf" appears instead of "Columbo:Columbo Cries Wolf"
533
# FIXED - Place the article last, for multiple languages. For instance
534
# Los amantes del c�rculo polar -> amantes del c�rculo polar, Los
535
# FIXED - common international vowel changes. For instance
536
# "Anna Kar�nin" (�->e)
538
sub alternativeTitles($)
543
push(@titles, $title);
544
# try the & -> and conversion
545
if ( $title=~m/\&/o ) {
547
while ( $t=~s/(\s)\&(\s)/$1and$2/o ) {
551
# try the and -> & conversion
552
if ( $title=~m/\sand\s/io ) {
554
while ( $t=~s/(\s)and(\s)/$1\&$2/io ) {
559
# try the "Columbo: Columbo cries Wolf" -> "Columbo cries Wolf" conversion
561
if ( m/^[^:]+:.+$/io ) {
563
while ( $t=~s/^[^:]+:\s*(.+)\s*$/$1/io ) {
569
# Place the articles last
571
if ( m/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/io ) {
573
$t=~s/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/iog;
578
# convert all the special language characters
580
if ( m/[����������������������������������������������������������]/io ) {
582
$t=~s/[������������]/a/gio;
583
$t=~s/[��������]/e/gio;
584
$t=~s/[��������]/i/gio;
585
$t=~s/[������������]/o/gio;
586
$t=~s/[��������]/u/gio;
600
sub findMovieInfo($$$$)
602
my ($self, $title, $year, $exact)=@_;
604
my @titles=@{alternativeTitles($title)};
607
# try an exact match first :)
608
for my $mytitle ( @titles ) {
609
my $info=$self->getMovieExactMatch($mytitle, $year);
610
if ( defined($info) ) {
611
if ( $info->{qualifier} eq "movie" ) {
612
$self->status("perfect hit on movie \"$info->{key}\"");
613
$info->{matchLevel}="perfect";
616
elsif ( $info->{qualifier} eq "tv_movie" ) {
617
$self->status("perfect hit on made-for-tv-movie \"$info->{key}\"");
618
$info->{matchLevel}="perfect";
621
elsif ( $info->{qualifier} eq "video_movie" ) {
622
$self->status("perfect hit on made-for-video-movie \"$info->{key}\"");
623
$info->{matchLevel}="perfect";
626
elsif ( $info->{qualifier} eq "video_game" ) {
628
elsif ( $info->{qualifier} eq "tv_series" ) {
630
elsif ( $info->{qualifier} eq "tv_mini_series" ) {
633
$self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
634
$self->error("weird trailing qualifier \"$info->{qualifier}\"");
635
$self->error("submit bug report to xmltv-devel\@lists.sf.net");
638
$self->debug("no exact title/year hit on \"$mytitle ($year)\"");
642
elsif ( $exact == 2 ) {
643
# looking for first exact match on the title, don't have a year to compare
645
for my $mytitle ( @titles ) {
646
# try close hit if only one :)
648
my @closeMatches=$self->getMovieCloseMatches("$mytitle");
650
# we traverse the hits twice, first looking for success,
651
# then again to produce warnings about missed close matches
652
for my $info (@closeMatches) {
653
next if ( !defined($info) );
656
# within one year with exact match good enough
657
if ( lc($mytitle) eq lc($info->{title}) ) {
659
if ( $info->{qualifier} eq "movie" ) {
660
$self->status("close enough hit on movie \"$info->{key}\" (since no 'date' field present)");
661
$info->{matchLevel}="close";
664
elsif ( $info->{qualifier} eq "tv_movie" ) {
665
$self->status("close enough hit on made-for-tv-movie \"$info->{key}\" (since no 'date' field present)");
666
$info->{matchLevel}="close";
669
elsif ( $info->{qualifier} eq "video_movie" ) {
670
$self->status("close enough hit on made-for-video-movie \"$info->{key}\" (since no 'date' field present)");
671
$info->{matchLevel}="close";
674
elsif ( $info->{qualifier} eq "video_game" ) {
676
elsif ( $info->{qualifier} eq "tv_series" ) {
678
elsif ( $info->{qualifier} eq "tv_mini_series" ) {
681
$self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
682
$self->error("weird trailing qualifier \"$info->{qualifier}\"");
683
$self->error("submit bug report to xmltv-devel\@lists.sf.net");
692
# otherwise we're looking for a title match with a close year
693
for my $mytitle ( @titles ) {
694
# try close hit if only one :)
696
my @closeMatches=$self->getMovieCloseMatches("$mytitle");
698
# we traverse the hits twice, first looking for success,
699
# then again to produce warnings about missed close matches
700
for my $info (@closeMatches) {
701
next if ( !defined($info) );
704
# within one year with exact match good enough
705
if ( lc($mytitle) eq lc($info->{title}) ) {
706
my $yearsOff=abs(int($info->{year})-$year);
708
$info->{matchLevel}="close";
710
if ( $yearsOff <= 2 ) {
711
my $showYear=int($info->{year});
713
if ( $info->{qualifier} eq "movie" ) {
714
$self->status("close enough hit on movie \"$info->{key}\" (off by $yearsOff years)");
717
elsif ( $info->{qualifier} eq "tv_movie" ) {
718
$self->status("close enough hit on made-for-tv-movie \"$info->{key}\" (off by $yearsOff years)");
721
elsif ( $info->{qualifier} eq "video_movie" ) {
722
$self->status("close enough hit on made-for-video-movie \"$info->{key}\" (off by $yearsOff years)");
725
elsif ( $info->{qualifier} eq "video_game" ) {
726
$self->status("ignoring close hit on video-game \"$info->{key}\"");
728
elsif ( $info->{qualifier} eq "tv_series" ) {
729
$self->status("ignoring close hit on tv series \"$info->{key}\"");
730
#$self->status("close enough hit on tv series \"$info->{key}\" (off by $yearsOff years)");
732
elsif ( $info->{qualifier} eq "tv_mini_series" ) {
733
$self->status("ignoring close hit on tv mini-series \"$info->{key}\"");
734
#$self->status("close enough hit on tv mini-series \"$info->{key}\" (off by $yearsOff years)");
737
$self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
738
$self->error("weird trailing qualifier \"$info->{qualifier}\"");
739
$self->error("submit bug report to xmltv-devel\@lists.sf.net");
745
# if we found at least something, but nothing matched
746
# produce warnings about missed, but close matches
747
for my $info (@closeMatches) {
748
next if ( !defined($info) );
750
# within one year with exact match good enough
751
if ( lc($mytitle) eq lc($info->{title}) ) {
752
my $yearsOff=abs(int($info->{year})-$year);
753
if ( $yearsOff <= 2 ) {
754
#die "internal error: key \"$info->{key}\" failed to be processed properly";
756
elsif ( $yearsOff <= 5 ) {
757
# report these as status
758
$self->status("ignoring close, but not good enough hit on \"$info->{key}\" (off by $yearsOff years)");
761
# report these as debug messages
762
$self->debug("ignoring close hit on \"$info->{key}\" (off by $yearsOff years)");
766
$self->debug("ignoring close hit on \"$info->{key}\" (title did not match)");
770
#$self->status("failed to lookup \"$title ($year)\"");
774
sub findTVSeriesInfo($$)
776
my ($self, $title)=@_;
778
if ( $self->{cacheLookups} ) {
779
my $id=$self->{cachedLookups}->{tv_series}->{$title};
781
if ( defined($id) ) {
782
#print STDERR "REF= (".ref($id).")\n";
790
my @titles=@{alternativeTitles($title)};
792
# try an exact match first :)
795
for my $mytitle ( @titles ) {
796
# try close hit if only one :)
798
my @closeMatches=$self->getMovieCloseMatches("$mytitle");
800
for my $info (@closeMatches) {
801
next if ( !defined($info) );
804
if ( lc($mytitle) eq lc($info->{title}) ) {
806
$info->{matchLevel}="perfect";
808
if ( $info->{qualifier} eq "movie" ) {
809
#$self->status("ignoring close hit on movie \"$info->{key}\"");
811
elsif ( $info->{qualifier} eq "tv_movie" ) {
812
#$self->status("ignoring close hit on tv movie \"$info->{key}\"");
814
elsif ( $info->{qualifier} eq "video_movie" ) {
815
#$self->status("ignoring close hit on made-for-video-movie \"$info->{key}\"");
817
elsif ( $info->{qualifier} eq "video_game" ) {
818
#$self->status("ignoring close hit on made-for-video-movie \"$info->{key}\"");
820
elsif ( $info->{qualifier} eq "tv_series" ) {
822
$self->status("perfect hit on tv series \"$info->{key}\"");
825
elsif ( $info->{qualifier} eq "tv_mini_series" ) {
827
$self->status("perfect hit on tv mini-series \"$info->{key}\"");
831
$self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
832
$self->error("weird trailing qualifier \"$info->{qualifier}\"");
833
$self->error("submit bug report to xmltv-devel\@lists.sf.net");
837
last if ( defined($idInfo) );
840
if ( $self->{cacheLookups} ) {
841
# flush cache after this lookup if its gotten too big
842
if ( $self->{cachedLookups}->{tv_series}->{_cacheSize_} >
843
$self->{cacheLookupSize} ) {
844
delete($self->{cachedLookups}->{tv_series});
845
$self->{cachedLookups}->{tv_series}->{_cacheSize_}=0;
847
if ( defined($idInfo) ) {
848
$self->{cachedLookups}->{tv_series}->{$title}=$idInfo;
851
$self->{cachedLookups}->{tv_series}->{$title}="";
853
$self->{cachedLookups}->{tv_series}->{_cacheSize_}++;
855
if ( defined($idInfo) ) {
859
#$self->status("failed to lookup tv series \"$title\"");
865
# todo - add country of origin
866
# todo - video (colour/aspect etc) details
867
# todo - audio (stereo) details
868
# todo - ratings ? - use certificates.list
869
# todo - add description - plot summaries ? - which one do we choose ?
872
# todo - running time (duration)
873
# todo - identify 'Host' and 'Narrator's and put them in as
874
# credits:presenter and credits:commentator resp.
875
# todo - check program length - probably a warning if longer ?
876
# can we update length (separate from runnning time in the output ?)
877
# todo - icon - url from www.imdb.com of programme image ?
878
# this could be done by scraping for the hyper linked poster
879
# <a name="poster"><img src="http://ia.imdb.com/media/imdb/01/I/60/69/80m.jpg" height="139" width="99" border="0"></a>
880
# and grabbin' out the img entry. (BTW ..../npa.jpg seems to line up with no poster available)
885
my ($self, $prog, $idInfo)=@_;
887
my $title=$prog->{title}->[0]->[0];
889
if ( $self->{updateDates} ) {
892
# don't add dates only fix them for tv_series
893
if ( $idInfo->{qualifier} eq "movie" ||
894
$idInfo->{qualifier} eq "video_movie" ||
895
$idInfo->{qualifier} eq "tv_movie" ) {
896
#$self->debug("adding 'date' field (\"$idInfo->{year}\") on \"$title\"");
897
$date=int($idInfo->{year});
900
#$self->debug("not adding 'date' field to $idInfo->{qualifier} \"$title\"");
904
if ( $self->{replaceDates} ) {
905
if ( defined($prog->{date}) && defined($date) ) {
906
$self->debug("replacing 'date' field");
907
delete($prog->{date});
912
# only set date if not already defined
913
if ( !defined($prog->{date}) && defined($date) ) {
919
if ( $self->{updateTitles} ) {
920
if ( $idInfo->{title} ne $title ) {
921
if ( $self->{replaceTitles} ) {
922
$self->debug("replacing (all) 'title' from \"$title\" to \"$idInfo->{title}\"");
923
delete($prog->{title});
928
push(@list, [$idInfo->{title}, undef]);
930
if ( defined($prog->{title}) ) {
931
my $name=$idInfo->{title};
933
for my $v (@{$prog->{title}}) {
934
if ( lc($v->[0]) eq lc($name) ) {
942
$prog->{title}=\@list;
946
if ( $self->{updateURLs} ) {
947
if ( $self->{replaceURLs} ) {
948
if ( defined($prog->{url}) ) {
949
$self->debug("replacing (all) 'url'");
950
delete($prog->{url});
954
# add url to programme on www.imdb.com
955
my $url=$idInfo->{key};
957
$url=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg;
958
$url="http://us.imdb.com/M/title-exact?".$url;
960
if ( defined($prog->{url}) ) {
963
for (@{$prog->{url}}) {
964
# skip urls for imdb.com that we're probably safe to replace
965
if ( !m;^http://us.imdb.com/M/title-exact;o ) {
972
push(@{$prog->{url}}, $url);
976
# squirrel away movie qualifier so its first on the list of replacements
978
push(@categories, [$self->{categories}->{$idInfo->{qualifier}}, 'en']);
979
if ( !defined($self->{categories}->{$idInfo->{qualifier}}) ) {
980
die "how did we get here with an invalid qualifier '$idInfo->{qualifier}'";
983
my $details=$self->getMovieIdDetails($idInfo->{id});
984
if ( $details->{noDetails} ) {
985
# we don't have any details on this movie
989
if ( $self->{updateDirectors} && defined($details->{directors}) ) {
990
# only update directors if we have exactly one or if
991
# its a movie of some kind, add more than one.
992
if ( scalar(@{$details->{directors}}) == 1 ||
993
$idInfo->{qualifier} eq "movie" ||
994
$idInfo->{qualifier} eq "video_movie" ||
995
$idInfo->{qualifier} eq "tv_movie" ) {
997
if ( $self->{replaceDirectors} ) {
998
if ( defined($prog->{credits}->{director}) ) {
999
$self->debug("replacing director(s)");
1000
delete($prog->{credits}->{director});
1005
# add top 3 billing directors list form www.imdb.com
1006
for my $name (splice(@{$details->{directors}},0,3)) {
1010
# preserve all existing directors listed if we did't already have them.
1011
if ( defined($prog->{credits}->{director}) ) {
1012
for my $name (@{$prog->{credits}->{director}}) {
1015
if ( lc eq lc($name) ) {
1024
$prog->{credits}->{director}=\@list;
1027
$self->debug("not adding 'director' field to $idInfo->{qualifier} \"$title\"");
1031
if ( $self->{updateActors} && defined($details->{actors}) ) {
1032
if ( $self->{replaceActors} ) {
1033
if ( defined($prog->{credits}->{actor}) ) {
1034
$self->debug("replacing actor(s) on $idInfo->{qualifier} \"$idInfo->{key}\"");
1035
delete($prog->{credits}->{actor});
1040
# add top 3 billing actors list form www.imdb.com
1041
for my $name (splice(@{$details->{actors}},0,3)) {
1044
# preserve all existing actors listed if we did't already have them.
1045
if ( defined($prog->{credits}->{actor}) ) {
1046
for my $name (@{$prog->{credits}->{actor}}) {
1049
if ( lc eq lc($name) ) {
1058
$prog->{credits}->{actor}=\@list;
1061
if ( $self->{updatePresentors} && defined($details->{presenter}) ) {
1062
if ( $self->{replacePresentors} ) {
1063
if ( defined($prog->{credits}->{presenter}) ) {
1064
$self->debug("replacing presentor");
1065
delete($prog->{credits}->{presenter});
1068
$prog->{credits}->{presenter}=$details->{presenter};
1070
if ( $self->{updateCommentators} && defined($details->{commentator}) ) {
1071
if ( $self->{replaceCommentators} ) {
1072
if ( defined($prog->{credits}->{commentator}) ) {
1073
$self->debug("replacing commentator");
1074
delete($prog->{credits}->{commentator});
1077
$prog->{credits}->{commentator}=$details->{commentator};
1080
# push genres as categories
1081
if ( $self->{updateCategoriesWithGenres} ) {
1082
if ( defined($details->{genres}) ) {
1083
for (@{$details->{genres}}) {
1084
push(@categories, [$_, 'en']);
1089
# current xmltv 0.5 doens't support more than one star rating,
1090
# so we deal with this slightly different
1091
if ( $self->{updateStarRatings} && defined($details->{ratingRank}) ) {
1092
if ( $self->{replaceStarRatings} ) {
1093
if ( defined($prog->{'star-rating'}) ) {
1094
$self->debug("replacing 'star-rating'");
1095
delete($prog->{'star-rating'});
1097
$prog->{'star-rating'}=["$details->{ratingRank}/10", undef];
1100
# if a star rating exists, then we leave it in place
1101
if ( !defined($prog->{'star-rating'}) ) {
1102
$prog->{'star-rating'}=["$details->{ratingRank}/10", undef];
1108
if ( $self->{updateCategories} ) {
1109
if ( $self->{replaceCategories} ) {
1110
if ( defined($prog->{category}) ) {
1111
$self->debug("replacing (all) 'category'");
1112
delete($prog->{category});
1115
if ( defined($prog->{category}) ) {
1117
for my $value (@{$prog->{category}}) {
1119
#print "checking category $value->[0] with $mycategory\n";
1120
for my $c (@categories) {
1121
if ( lc($c->[0]) eq lc($value->[0]) ) {
1126
push(@categories, $value);
1130
$prog->{category}=\@categories;
1136
sub augmentProgram($$$)
1138
my ($self, $prog, $movies_only)=@_;
1140
$self->{stats}->{programCount}++;
1142
# assume first title in first language is the one we want.
1143
my $title=$prog->{title}->[0]->[0];
1145
if ( defined($prog->{date}) && $prog->{date}=~m/^\d\d\d\d$/o ) {
1147
# for programs with dates we try:
1148
# - exact matches on movies
1149
# - exact matches on tv series
1150
# - close matches on movies
1151
my $id=$self->findMovieInfo($title, $prog->{date}, 1); # exact match
1152
if ( !defined($id) ) {
1153
$id=$self->findTVSeriesInfo($title);
1154
if ( !defined($id) ) {
1155
$id=$self->findMovieInfo($title, $prog->{date}, 0); # close match
1158
if ( defined($id) ) {
1159
$self->{stats}->{$id->{matchLevel}."Matches"}++;
1160
$self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++;
1161
return($self->applyFound($prog, $id));
1163
$self->status("failed to find a match for movie \"$title ($prog->{date})\"");
1165
# fall through and try again as a tv series
1168
if ( !$movies_only ) {
1169
my $id=$self->findTVSeriesInfo($title);
1170
if ( defined($id) ) {
1171
$self->{stats}->{$id->{matchLevel}."Matches"}++;
1172
$self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++;
1173
return($self->applyFound($prog, $id));
1177
# this has hard to support 'close' results, unless we know
1178
# for certain we're looking for a movie (ie duration etc)
1179
# this is a bad idea.
1180
my $id=$self->findMovieInfo($title, undef, 2); # any title match
1181
if ( defined($id) ) {
1182
$self->{stats}->{$id->{matchLevel}."Matches"}++;
1183
$self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++;
1184
return($self->applyFound($prog, $id));
1187
$self->status("failed to find a match for show \"$title\"");
1193
# todo - add in stats on other things added (urls ?, actors, directors,categories)
1194
# separate out from what was added or updated
1196
sub getStatsLines($)
1199
my $totalChannelsParsed=shift;
1202
my %stats=%{$self->{stats}};
1204
my $ret=sprintf("Checked %d programs, on %d channels\n", $stats{programCount}, $totalChannelsParsed);
1206
for my $cat (sort keys %{$self->{categories}}) {
1207
$ret.=sprintf(" found %d %s titles", $stats{perfect}->{$cat}+$stats{close}->{$cat},
1208
$self->{categories}->{$cat});
1209
if ( $stats{close}->{$cat} != 0 ) {
1210
if ( $stats{close}->{$cat} == 1 ) {
1211
$ret.=sprintf(" (%d was not perfect)", $stats{close}->{$cat});
1214
$ret.=sprintf(" (%d were not perfect)", $stats{close}->{$cat});
1220
$ret.=sprintf(" augmented %.2f%% of the programs, parsing %.2f programs/sec\n",
1221
($stats{programCount}!=0)?(($stats{perfectMatches}+$stats{closeMatches})*100)/$stats{programCount}:0,
1222
($endTime!=$stats{startTime} && $stats{programCount} != 0)?
1223
$stats{programCount}/($endTime-$stats{startTime}):0);
1230
package XMLTV::IMDB::Crunch;
1233
# Use Term::ProgressBar if installed.
1234
use constant Have_bar => eval {
1235
require Term::ProgressBar;
1236
$Term::ProgressBar::VERSION >= 2;
1240
# This package parses and manages to index imdb plain text files from
1241
# ftp.imdb.com/interfaces. (see http://www.imdb.com/interfaces for
1244
# I might, given time build a download manager that:
1245
# - downloads the latest plain text files
1246
# - understands how to download each week's diffs and apply them
1247
# Currently, the 'downloadMissingFiles' flag in the hash of attributes
1248
# passed triggers a simple-minded downloader.
1250
# I may also roll this project into a xmltv-free imdb-specific
1251
# perl interface that just supports callbacks and understands more of
1252
# the imdb file formats.
1258
my $self={ @_ }; # remaining args become attributes
1259
for ($self->{downloadMissingFiles}) {
1260
$_=0 if not defined; # default
1263
for ('imdbDir', 'verbose') {
1264
die "invalid usage - no $_" if ( !defined($self->{$_}));
1266
if ( ! -d "$self->{imdbDir}" ) {
1267
if ( $self->{downloadMissingFiles} ) {
1268
warn "creating directory $self->{imdbDir}\n";
1269
mkdir $self->{imdbDir}, 0777
1270
or die "cannot mkdir $self->{imdbDir}: $!";
1273
die "$self->{imdbDir}:does not exist";
1276
my $listsDir = "$self->{imdbDir}/lists";
1277
if ( ! -d $listsDir ) {
1278
mkdir $listsDir, 0777 or die "cannot mkdir $listsDir: $!";
1281
my %missingListFiles; # maps 'movies' to filename ...movies.gz
1282
for ('movies', 'actors', 'actresses', 'directors', 'genres', 'ratings') {
1283
my $filename="$listsDir/$_.list";
1284
my $filenameGz="$filename.gz";
1285
my $filenameExists = -f $filename;
1286
my $filenameSize = -s $filename;
1287
my $filenameGzExists = -f $filenameGz;
1288
my $filenameGzSize = -s $filenameGz;
1290
if ( $filenameExists and not $filenameSize ) {
1291
warn "removing zero-length $filename\n";
1292
unlink $filename or die "cannot unlink $filename: $!";
1293
$filenameExists = 0;
1295
if ( $filenameGzExists and not $filenameGzSize ) {
1296
warn "removing zero-length $filenameGz\n";
1297
unlink $filenameGz or die "cannot unlink $filenameGz: $!";
1298
$filenameGzExists = 0;
1301
if ( not $filenameExists and not $filenameGzExists ) {
1302
# Just report one of the filenames, keep the message simple.
1303
warn "$filenameGz does not exist\n";
1304
$missingListFiles{$_}=$filenameGz;
1306
elsif ( not $filenameExists and $filenameGzExists ) {
1307
$self->{imdbListFiles}->{$_}=$filenameGz;
1309
elsif ( $filenameExists and not $filenameGzExists ) {
1310
$self->{imdbListFiles}->{$_}=$filename;
1312
elsif ( $filenameExists and $filenameGzExists ) {
1313
die "both $filename and $filenameGz exist, remove one of them\n";
1317
if ( $self->{downloadMissingFiles} ) {
1318
my $baseUrl = 'ftp://ftp.fu-berlin.de/pub/misc/movies/database/';
1319
foreach ( sort keys %missingListFiles ) {
1320
my $url = "$baseUrl/$_.list.gz";
1321
my $filename = delete $missingListFiles{$_};
1322
my $partial = "$filename.partial";
1324
if (not -s $partial) {
1325
print STDERR "removing empty $partial\n";
1326
unlink $partial or die "cannot unlink $partial: $!";
1330
$partial already exists, remove it or try renaming to $filename and
1331
resuming the download of <$url> by hand.
1339
Trying to download <$url>.
1340
With a slow network link this could fail; it might be better to
1341
download the file by hand and save it as
1346
# For downloading we use LWP::Simple::getstore() to write
1349
my $resp = getstore($url, $filename);
1350
my $got_size = -s $filename;
1351
if (defined $resp and is_success($resp)) {
1352
die if not $got_size;
1353
print STDERR "<$url>\n\t-> $filename, success\n\n";
1356
my $msg = "failed to download $url to $filename";
1357
$msg .= ", http response code: $resp" if defined $resp;
1360
warn "renaming $filename -> $partial\n";
1361
rename $filename, $partial
1362
or die "cannot rename $filename to $partial: $!";
1363
warn "You might try continuing the download of <$url> manually.\n";
1368
$self->{downloadMissingFiles} = 0;
1372
if ( %missingListFiles ) {
1373
print STDERR "tv_imdb: requires you to download the above files from ftp.imdb.com\n";
1374
print STDERR " see http://www.imdb.com/interfaces for details\n";
1375
print STDERR " or try the --download option\n";
1379
$self->{moviedbIndex}="$self->{imdbDir}/moviedb.idx";
1380
$self->{moviedbData}="$self->{imdbDir}/moviedb.dat";
1381
$self->{moviedbInfo}="$self->{imdbDir}/moviedb.info";
1382
$self->{moviedbOffline}="$self->{imdbDir}/moviedb.offline";
1384
bless($self, $type);
1390
my ($self, $file)=@_;
1392
if ( defined($file) ) {
1393
if ( !open($self->{logfd}, "> $file") ) {
1394
print STDERR "$file:$!\n";
1397
$self->{errorCountInLog}=0;
1400
close($self->{logfd});
1401
$self->{logfd}=undef;
1409
if ( defined($self->{logfd}) ) {
1410
print {$self->{logfd}} $_[0]."\n";
1411
$self->{errorCountInLog}++;
1414
print STDERR $_[0]."\n";
1422
if ( $self->{verbose} ) {
1423
print STDERR $_[0]."\n";
1430
sub openMaybeGunzip($)
1433
return gunzip_open($_) if m/\.gz$/;
1434
return new IO::File("< $_");
1438
sub closeMaybeGunzip($$)
1440
if ( $_[0]=~m/\.gz$/o ) {
1441
# Would close($fh) but that causes segfaults on my system.
1442
# Investigating, but in the meantime just leave it open.
1444
#return gunzip_close($_[1]);
1447
# Apparently this can also segfault (wtf?).
1448
#return close($_[1]);
1451
sub readMoviesOrGenres($$$$)
1453
my ($self, $whichMoviesOrGenres, $countEstimate, $file)=@_;
1454
my $startTime=time();
1456
my $whatAreWeParsing;
1459
if ( $whichMoviesOrGenres eq "Movies" ) {
1460
$header="MOVIES LIST";
1461
$whatAreWeParsing=1;
1463
elsif ( $whichMoviesOrGenres eq "Genres" ) {
1464
$header="8: THE GENRES LIST";
1465
$whatAreWeParsing=2;
1467
my $fh = openMaybeGunzip($file) || return(-2);
1470
if ( m/^$header/ ) {
1471
if ( !($_=<$fh>) || !m/^===========/o ) {
1472
$self->error("missing ======= after $header at line $lineCount");
1473
closeMaybeGunzip($file, $fh);
1476
if ( !($_=<$fh>) || !m/^\s*$/o ) {
1477
$self->error("missing empty line after ======= at line $lineCount");
1478
closeMaybeGunzip($file, $fh);
1483
elsif ( $lineCount > 1000 ) {
1484
$self->error("$file: stopping at line $lineCount, didn't see \"$header\" line");
1485
closeMaybeGunzip($file, $fh);
1490
my $progress=Term::ProgressBar->new({name => "parsing $whichMoviesOrGenres",
1491
count => $countEstimate,
1495
$progress->minor(0) if Have_bar;
1496
$progress->max_update_rate(1) if Have_bar;
1503
#print "read line $lineCount:$line\n";
1505
# end is line consisting of only '-'
1506
last if ( $line=~m/^\-\-\-\-\-\-\-+/o );
1510
my $tab=index($line, "\t");
1512
my $mkey=substr($line, 0, $tab);
1514
next if ($mkey=~m/\s*\{\{SUSPENDED\}\}/o);
1516
if ( $whatAreWeParsing == 2 ) {
1517
# don't see what these are...?
1518
# ignore {{SUSPENDED}}
1519
$mkey=~s/\s*\{\{SUSPENDED\}\}//o;
1521
# ignore {Twelve Angry Men (1954)}
1522
$mkey=~s/\s*\{[^\}]+\}//go;
1524
# skip enties that have {} in them since they're tv episodes
1525
#next if ( $mkey=~s/\s*\{[^\}]+\}$//o );
1527
my $genre=substr($line, $tab);
1529
# genres sometimes has more than one tab
1531
if ( defined($self->{movies}{$mkey}) ) {
1532
$self->{movies}{$mkey}.="|".$genre;
1535
$self->{movies}{$mkey}=$genre;
1536
# returned count is number of unique titles found
1541
push(@{$self->{movies}}, $mkey);
1542
# returned count is number of titles found
1547
# re-adjust target so progress bar doesn't seem too wonky
1548
if ( $count > $countEstimate ) {
1549
$countEstimate = $progress->target($count+1000);
1550
$next_update=$progress->update($count);
1552
elsif ( $count > $next_update ) {
1553
$next_update=$progress->update($count);
1558
$self->error("$file:$lineCount: unrecognized format (missing tab)");
1559
$next_update=$progress->update($count) if Have_bar;
1562
$progress->update($countEstimate) if Have_bar;
1564
$self->status(sprintf("parsing $whichMoviesOrGenres found $count titles and ".
1565
"$lineCount lines in %d seconds",time()-$startTime));
1567
closeMaybeGunzip($file, $fh);
1571
sub readCastOrDirectors($$$)
1573
my ($self, $whichCastOrDirector, $castCountEstimate, $file)=@_;
1574
my $startTime=time();
1577
my $whatAreWeParsing;
1580
if ( $whichCastOrDirector eq "Actors" ) {
1581
$header="THE ACTORS LIST";
1582
$whatAreWeParsing=1;
1584
elsif ( $whichCastOrDirector eq "Actresses" ) {
1585
$header="THE ACTRESSES LIST";
1586
$whatAreWeParsing=2;
1588
elsif ( $whichCastOrDirector eq "Directors" ) {
1589
$header="THE DIRECTORS LIST";
1590
$whatAreWeParsing=3;
1593
die "why are we here ?";
1596
my $fh = openMaybeGunzip($file) || return(-2);
1597
my $progress=Term::ProgressBar->new({name => "parsing $whichCastOrDirector",
1598
count => $castCountEstimate,
1601
$progress->minor(0) if Have_bar;
1602
$progress->max_update_rate(1) if Have_bar;
1606
if ( m/^$header/ ) {
1607
if ( !($_=<$fh>) || !m/^===========/o ) {
1608
$self->error("missing ======= after $header at line $lineCount");
1609
closeMaybeGunzip($file, $fh);
1612
if ( !($_=<$fh>) || !m/^\s*$/o ) {
1613
$self->error("missing empty line after ======= at line $lineCount");
1614
closeMaybeGunzip($file, $fh);
1617
if ( !($_=<$fh>) || !m/^Name\s+Titles\s*$/o ) {
1618
$self->error("missing name/titles line after ======= at line $lineCount");
1619
closeMaybeGunzip($file, $fh);
1622
if ( !($_=<$fh>) || !m/^[\s\-]+$/o ) {
1623
$self->error("missing name/titles suffix line after ======= at line $lineCount");
1624
closeMaybeGunzip($file, $fh);
1629
elsif ( $lineCount > 1000 ) {
1630
$self->error("$file: stopping at line $lineCount, didn't see \"$header\" line");
1631
closeMaybeGunzip($file, $fh);
1643
#$self->status("read line $lineCount:$line");
1645
# end is line consisting of only '-'
1646
last if ( $line=~m/^\-\-\-\-\-\-\-+/o );
1648
next if ( length($line) == 0 );
1650
if ( $line=~s/^([^\t]+)\t+//o ) {
1655
# re-adjust target so progress bar doesn't seem too wonky
1656
if ( $castNames > $castCountEstimate ) {
1657
$castCountEstimate = $progress->target($castNames+100);
1658
$next_update=$progress->update($castNames);
1660
elsif ( $castNames > $next_update ) {
1661
$next_update=$progress->update($castNames);
1667
my $HostNarrator="";
1668
if ( $whatAreWeParsing < 3 ) {
1669
# actors or actresses
1671
if ( $line=~s/\s*<(\d+)>//o ) {
1672
$billing=sprintf("%04d", int($1));
1675
if ( (my $start=index($line, " [")) != -1 ) {
1676
#my $end=rindex($line, "]");
1677
my $ex=substr($line, $start+1);
1679
if ( $ex=~s/Host//o ) {
1680
if ( length($HostNarrator) ) {
1683
$HostNarrator.="Host";
1685
if ( $ex=~s/Narrator//o ) {
1686
if ( length($HostNarrator) ) {
1689
$HostNarrator.="Narrator";
1691
$line=substr($line, 0, $start);
1692
# ignore character name
1695
# try ignoring these
1696
next if ($line=~m/\s*\{\{SUSPENDED\}\}/o);
1698
# don't see what these are...?
1699
# ignore {{SUSPENDED}}
1700
$line=~s/\s*\{\{SUSPENDED\}\}//o;
1702
# ignore {Twelve Angry Men (1954)}
1703
$line=~s/\s*\{[^\}]+\}//o;
1705
if ( $whatAreWeParsing < 3 ) {
1706
if ( $line=~s/\s*\(aka ([^\)]+)\).*$//o ) {
1710
if ( $line=~s/ (\(.*)$//o ) {
1716
if ( $whatAreWeParsing < 3 ) {
1717
if ( $line=~s/\s+Narrator$//o ) {
1722
my $val=$self->{movies}{$line};
1724
if ( length($HostNarrator) ) {
1725
$name.="[$HostNarrator]";
1727
if ( defined($billing) ) {
1728
if ( defined($val) ) {
1729
$self->{movies}{$line}=$val."|$billing:$name";
1732
$self->{movies}{$line}="$billing:$name";
1736
if ( defined($val) ) {
1737
$self->{movies}{$line}=$val."|$name";
1740
$self->{movies}{$line}=$name;
1745
$progress->update($castCountEstimate) if Have_bar;
1747
$self->status(sprintf("parsing $whichCastOrDirector found $castNames names, ".
1748
"$count titles and $lineCount lines in %d seconds",time()-$startTime));
1749
closeMaybeGunzip($file, $fh);
1754
sub readRatings($$$$)
1756
my ($self, $countEstimate, $file)=@_;
1757
my $startTime=time();
1760
my $fh = openMaybeGunzip($file) || return(-2);
1763
if ( m/^MOVIE RATINGS REPORT/o ) {
1764
if ( !($_=<$fh>) || !m/^\s*$/o) {
1765
$self->error("missing empty line after \"MOVIE RATINGS REPORT\" at line $lineCount");
1766
closeMaybeGunzip($file, $fh);
1769
if ( !($_=<$fh>) || !m/^New Distribution Votes Rank Title/o ) {
1770
$self->error("missing \"New Distribution Votes Rank Title\" at line $lineCount");
1771
closeMaybeGunzip($file, $fh);
1776
elsif ( $lineCount > 1000 ) {
1777
$self->error("$file: stopping at line $lineCount, didn't see \"MOVIE RATINGS REPORT\" line");
1778
closeMaybeGunzip($file, $fh);
1783
my $progress=Term::ProgressBar->new({name => "parsing Ratings",
1784
count => $countEstimate,
1788
$progress->minor(0) if Have_bar;
1789
$progress->max_update_rate(1) if Have_bar;
1796
#print "read line $lineCount:$line";
1800
# skip empty lines (only really appear right before last line ending with ----
1801
next if ( $line=~m/^\s*$/o );
1802
# end is line consisting of only '-'
1803
last if ( $line=~m/^\-\-\-\-\-\-\-+/o );
1805
if ( $line=~s/^\s+([\.|\*|\d]+)\s+(\d+)\s+(\d+)\.(\d+)\s+//o ) {
1806
$self->{movies}{$line}=[$1,$2,"$3.$4"];
1809
# re-adjust target so progress bar doesn't seem too wonky
1810
if ( $count > $countEstimate ) {
1811
$countEstimate = $progress->target($count+1000);
1812
$next_update=$progress->update($count);
1814
elsif ( $count > $next_update ) {
1815
$next_update=$progress->update($count);
1820
$self->error("$file:$lineCount: unrecognized format");
1821
$next_update=$progress->update($count) if Have_bar;
1824
$progress->update($countEstimate) if Have_bar;
1826
$self->status(sprintf("parsing Ratings found $count titles and ".
1827
"$lineCount lines in %d seconds",time()-$startTime));
1828
closeMaybeGunzip($file, $fh);
1832
sub stageComplete($)
1834
my ($self, $stage)=@_;
1836
if ( -f "$self->{imdbDir}/stage$stage.data" ) {
1846
my $ret=XMLTV::IMDB::loadDBInfo($self->{moviedbInfo});
1847
if ( ref $ret eq 'SCALAR' ) {
1850
$self->{dbinfo}=$ret;
1856
my ($self, $key, $value)=@_;
1857
$self->{dbinfo}->{$key}=$value;
1862
my ($self, $key, $defaultValue)=@_;
1863
if ( defined($self->{dbinfo}->{$key}) ) {
1864
return($self->{dbinfo}->{$key});
1866
return($defaultValue);
1872
open(INFO, "> $self->{moviedbInfo}") || return(1);
1873
for (sort keys %{$self->{dbinfo}}) {
1874
print INFO "".$_.":".$self->{dbinfo}->{$_}."\n";
1880
sub dbinfoCalcEstimate($$$)
1882
my ($self, $key, $estimateSizePerEntry)=@_;
1884
if ( !defined($self->{imdbListFiles}->{$key}) ) {
1885
die ("invalid call");
1887
my $fileSize=int(-s "$self->{imdbListFiles}->{$key}");
1889
# if compressed, then attempt to run gzip -l
1890
if ( $self->{imdbListFiles}->{$key}=~m/.gz$/) {
1891
if ( open(my $fd, "gzip -l ".$self->{imdbListFiles}->{$key}."|") ) {
1892
# if parse fails, then defalt to wild ass guess of compression of 65%
1893
$fileSize=int($fileSize*100)/(100-65);
1896
if ( m/^\s*\d+\s+(\d+)/ ) {
1903
# wild ass guess of compression of 65%
1904
$fileSize=int($fileSize*100)/(100-65);
1907
my $countEstimate=int($fileSize/$estimateSizePerEntry);
1909
$self->dbinfoAdd($key."_list_file", $self->{imdbListFiles}->{$key});
1910
$self->dbinfoAdd($key."_list_file_size", $fileSize);
1911
$self->dbinfoAdd($key."_list_count_estimate", $countEstimate);
1912
return($countEstimate);
1917
my ($self, $stage)=@_;
1919
my $startTime=time();
1920
if ( $stage == 1 ) {
1921
$self->status("parsing Movies list for stage $stage..");
1922
my $countEstimate=$self->dbinfoCalcEstimate("movies", 43);
1924
my $num=$self->readMoviesOrGenres("Movies", $countEstimate, "$self->{imdbListFiles}->{movies}");
1927
$self->error("you need to download $self->{imdbListFiles}->{movies} from ftp.imdb.com");
1931
elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
1932
$self->status("ARG estimate of $countEstimate for movies needs updating, found $num");
1934
$self->dbinfoAdd("db_stat_movie_count", "$num");
1936
$self->status("writing stage1 data ..");
1938
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
1939
my $progress=Term::ProgressBar->new({name => "writing titles",
1940
count => $countEstimate,
1943
$progress->minor(0) if Have_bar;
1944
$progress->max_update_rate(1) if Have_bar;
1947
open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
1949
for my $movie (@{$self->{movies}}) {
1950
print OUT "$movie\n";
1954
# re-adjust target so progress bar doesn't seem too wonky
1955
if ( $count > $countEstimate ) {
1956
$countEstimate = $progress->target($count+100);
1957
$next_update=$progress->update($count);
1959
elsif ( $count > $next_update ) {
1960
$next_update=$progress->update($count);
1964
$progress->update($countEstimate) if Have_bar;
1966
delete($self->{movies});
1969
elsif ( $stage == 2 ) {
1970
$self->status("parsing Directors list for stage $stage..");
1972
my $countEstimate=$self->dbinfoCalcEstimate("directors", 184);
1974
my $num=$self->readCastOrDirectors("Directors", $countEstimate, "$self->{imdbListFiles}->{directors}");
1977
$self->error("you need to download $self->{imdbListFiles}->{directors} from ftp.imdb.com (see http://www.imdb.com/interfaces)");
1981
elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
1982
$self->status("ARG estimate of $countEstimate for directors needs updating, found $num");
1984
$self->dbinfoAdd("db_stat_director_count", "$num");
1986
$self->status("writing stage2 data ..");
1988
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
1989
my $progress=Term::ProgressBar->new({name => "writing directors",
1990
count => $countEstimate,
1993
$progress->minor(0) if Have_bar;
1994
$progress->max_update_rate(1) if Have_bar;
1998
open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
1999
for my $key (keys %{$self->{movies}}) {
2001
for (split('\|', $self->{movies}{$key})) {
2006
push(@list, sprintf("%03d:%s", $dir{$_}, $_));
2009
for my $c (reverse sort {$a cmp $b} @list) {
2010
my ($num, $name)=split(':', $c);
2014
print OUT "$key\t$value\n";
2018
# re-adjust target so progress bar doesn't seem too wonky
2019
if ( $count > $countEstimate ) {
2020
$countEstimate = $progress->target($count+100);
2021
$next_update=$progress->update($count);
2023
elsif ( $count > $next_update ) {
2024
$next_update=$progress->update($count);
2028
$progress->update($countEstimate) if Have_bar;
2030
delete($self->{movies});
2032
#unlink("$self->{imdbDir}/stage1.data");
2034
elsif ( $stage == 3 ) {
2035
$self->status("parsing Actors list for stage $stage..");
2037
#print "re-reading movies into memory for reverse lookup..\n";
2038
my $countEstimate=$self->dbinfoCalcEstimate("actors", 349);
2040
my $num=$self->readCastOrDirectors("Actors", $countEstimate, "$self->{imdbListFiles}->{actors}");
2043
$self->error("you need to download $self->{imdbListFiles}->{actors} from ftp.imdb.com (see http://www.imdb.com/interfaces)");
2047
elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
2048
$self->status("ARG estimate of $countEstimate for actors needs updating, found $num");
2050
$self->dbinfoAdd("db_stat_actor_count", "$num");
2052
$self->status("writing stage3 data ..");
2054
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2055
my $progress=Term::ProgressBar->new({name => "writing actors",
2056
count => $countEstimate,
2059
$progress->minor(0) if Have_bar;
2060
$progress->max_update_rate(1) if Have_bar;
2064
open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2065
for my $key (keys %{$self->{movies}}) {
2066
print OUT "$key\t$self->{movies}{$key}\n";
2070
# re-adjust target so progress bar doesn't seem too wonky
2071
if ( $count > $countEstimate ) {
2072
$countEstimate = $progress->target($count+100);
2073
$next_update=$progress->update($count);
2075
elsif ( $count > $next_update ) {
2076
$next_update=$progress->update($count);
2080
$progress->update($countEstimate) if Have_bar;
2082
delete($self->{movies});
2085
elsif ( $stage == 4 ) {
2086
$self->status("parsing Actresses list for stage $stage..");
2088
my $countEstimate=$self->dbinfoCalcEstimate("actresses", 311);
2089
my $num=$self->readCastOrDirectors("Actresses", $countEstimate, "$self->{imdbListFiles}->{actresses}");
2092
$self->error("you need to download $self->{imdbListFiles}->{actresses} from ftp.imdb.com (see http://www.imdb.com/interfaces)");
2096
elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
2097
$self->status("ARG estimate of $countEstimate for actresses needs updating, found $num");
2099
$self->dbinfoAdd("db_stat_actress_count", "$num");
2101
$self->status("writing stage4 data ..");
2103
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2104
my $progress=Term::ProgressBar->new({name => "writing actresses",
2105
count => $countEstimate,
2108
$progress->minor(0) if Have_bar;
2109
$progress->max_update_rate(1) if Have_bar;
2113
open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2114
for my $key (keys %{$self->{movies}}) {
2115
print OUT "$key\t$self->{movies}{$key}\n";
2118
# re-adjust target so progress bar doesn't seem too wonky
2119
if ( $count > $countEstimate ) {
2120
$countEstimate = $progress->target($count+100);
2121
$next_update=$progress->update($count);
2123
elsif ( $count > $next_update ) {
2124
$next_update=$progress->update($count);
2128
$progress->update($countEstimate) if Have_bar;
2130
delete($self->{movies});
2132
#unlink("$self->{imdbDir}/stage3.data");
2134
elsif ( $stage == 5 ) {
2135
$self->status("parsing Genres list for stage $stage..");
2136
my $countEstimate=$self->dbinfoCalcEstimate("genres", 61);
2138
my $num=$self->readMoviesOrGenres("Genres", $countEstimate, "$self->{imdbListFiles}->{genres}");
2141
$self->error("you need to download $self->{imdbListFiles}->{genres} from ftp.imdb.com");
2145
elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
2146
$self->status("ARG estimate of $countEstimate for genres needs updating, found $num");
2148
$self->dbinfoAdd("db_stat_genres_count", "$num");
2150
$self->status("writing stage5 data ..");
2152
my $countEstimate=$self->dbinfoGet("db_stat_genres_count", 0);
2153
my $progress=Term::ProgressBar->new({name => "writing genres",
2154
count => $countEstimate,
2157
$progress->minor(0) if Have_bar;
2158
$progress->max_update_rate(1) if Have_bar;
2161
open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2163
for my $movie (keys %{$self->{movies}}) {
2164
print OUT "$movie\t$self->{movies}->{$movie}\n";
2168
# re-adjust target so progress bar doesn't seem too wonky
2169
if ( $count > $countEstimate ) {
2170
$countEstimate = $progress->target($count+100);
2171
$next_update=$progress->update($count);
2173
elsif ( $count > $next_update ) {
2174
$next_update=$progress->update($count);
2178
$progress->update($countEstimate) if Have_bar;
2180
delete($self->{movies});
2183
elsif ( $stage == 6 ) {
2184
$self->status("parsing Ratings list for stage $stage..");
2185
my $countEstimate=$self->dbinfoCalcEstimate("ratings", 63);
2187
my $num=$self->readRatings($countEstimate, "$self->{imdbListFiles}->{ratings}");
2190
$self->error("you need to download $self->{imdbListFiles}->{ratings} from ftp.imdb.com");
2194
elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
2195
$self->status("ARG estimate of $countEstimate for ratings needs updating, found $num");
2197
$self->dbinfoAdd("db_stat_ratings_count", "$num");
2199
$self->status("writing stage6 data ..");
2201
my $countEstimate=$self->dbinfoGet("db_stat_ratings_count", 0);
2202
my $progress=Term::ProgressBar->new({name => "writing ratings",
2203
count => $countEstimate,
2206
$progress->minor(0) if Have_bar;
2207
$progress->max_update_rate(1) if Have_bar;
2210
open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2212
for my $movie (keys %{$self->{movies}}) {
2213
my @value=@{$self->{movies}->{$movie}};
2214
print OUT "$movie\t$value[0]\t$value[1]\t$value[2]\n";
2218
# re-adjust target so progress bar doesn't seem too wonky
2219
if ( $count > $countEstimate ) {
2220
$countEstimate = $progress->target($count+100);
2221
$next_update=$progress->update($count);
2223
elsif ( $count > $next_update ) {
2224
$next_update=$progress->update($count);
2228
$progress->update($countEstimate) if Have_bar;
2230
delete($self->{movies});
2233
elsif ( $stage == 7 ) {
2234
my $tab=sprintf("\t");
2236
$self->status("indexing all previous stage's data for stage 7..");
2238
$self->status("parsing stage 1 data (movie list)..");
2241
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2242
my $progress=Term::ProgressBar->new({name => "reading titles",
2243
count => $countEstimate,
2246
$progress->minor(0) if Have_bar;
2247
$progress->max_update_rate(1) if Have_bar;
2250
open(IN, "< $self->{imdbDir}/stage1.data") || die "$self->{imdbDir}/stage1.data:$!";
2256
# re-adjust target so progress bar doesn't seem too wonky
2257
if ( $. > $countEstimate ) {
2258
$countEstimate = $progress->target($.+100);
2259
$next_update=$progress->update($.);
2261
elsif ( $. > $next_update ) {
2262
$next_update=$progress->update($.);
2267
$progress->update($countEstimate) if Have_bar;
2270
$self->status("merging in stage 2 data (directors)..");
2272
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2273
my $progress=Term::ProgressBar->new({name => "merging directors",
2274
count => $countEstimate,
2277
$progress->minor(0) if Have_bar;
2278
$progress->max_update_rate(1) if Have_bar;
2281
open(IN, "< $self->{imdbDir}/stage2.data") || die "$self->{imdbDir}/stage2.data:$!";
2285
if ( !defined($movies{$1}) ) {
2286
$self->error("directors list references unidentified title '$1'");
2292
# re-adjust target so progress bar doesn't seem too wonky
2293
if ( $. > $countEstimate ) {
2294
$countEstimate = $progress->target($.+100);
2295
$next_update=$progress->update($.);
2297
elsif ( $. > $next_update ) {
2298
$next_update=$progress->update($.);
2302
$progress->update($countEstimate) if Have_bar;
2307
# fill in default for movies we didn't have a director for
2308
for my $key (keys %movies) {
2309
if ( !length($movies{$key})) {
2315
$self->status("merging in stage 3 data (actors)..");
2317
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2318
my $progress=Term::ProgressBar->new({name => "merging actors",
2319
count => $countEstimate,
2322
$progress->minor(0) if Have_bar;
2323
$progress->max_update_rate(1) if Have_bar;
2326
open(IN, "< $self->{imdbDir}/stage3.data") || die "$self->{imdbDir}/stage3.data:$!";
2331
my $val=$movies{$dbkey};
2332
if ( !defined($val) ) {
2333
$self->error("actors list references unidentified title '$dbkey'");
2336
if ( $val=~m/$tab/o ) {
2337
$movies{$dbkey}=$val."|".$_;
2340
$movies{$dbkey}=$val.$tab.$_;
2343
# re-adjust target so progress bar doesn't seem too wonky
2344
if ( $. > $countEstimate ) {
2345
$countEstimate = $progress->target($.+100);
2346
$next_update=$progress->update($.);
2348
elsif ( $. > $next_update ) {
2349
$next_update=$progress->update($.);
2353
$progress->update($countEstimate) if Have_bar;
2357
$self->status("merging in stage 4 data (actresses)..");
2359
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2360
my $progress=Term::ProgressBar->new({name => "merging actresses",
2361
count => $countEstimate,
2364
$progress->minor(0) if Have_bar;
2365
$progress->max_update_rate(1) if Have_bar;
2368
open(IN, "< $self->{imdbDir}/stage4.data") || die "$self->{imdbDir}/stage4.data:$!";
2373
my $val=$movies{$dbkey};
2374
if ( !defined($val) ) {
2375
$self->error("actresses list references unidentified title '$dbkey'");
2378
if ( $val=~m/$tab/o ) {
2379
$movies{$dbkey}=$val."|".$_;
2382
$movies{$dbkey}=$val.$tab.$_;
2385
# re-adjust target so progress bar doesn't seem too wonky
2386
if ( $. > $countEstimate ) {
2387
$countEstimate = $progress->target($.+100);
2388
$next_update=$progress->update($.);
2390
elsif ( $. > $next_update ) {
2391
$next_update=$progress->update($.);
2395
$progress->update($countEstimate) if Have_bar;
2399
# fill in placeholder if no actors were found
2400
for my $key (keys %movies) {
2401
if ( !($movies{$key}=~m/$tab/o) ) {
2402
$movies{$key}.=$tab."<>";
2407
$self->status("merging in stage 5 data (genres)..");
2409
my $countEstimate=$self->dbinfoGet("db_stat_genres_count", 0);
2410
my $progress=Term::ProgressBar->new({name => "merging genres",
2411
count => $countEstimate,
2414
$progress->minor(0) if Have_bar;
2415
$progress->max_update_rate(1) if Have_bar;
2418
open(IN, "< $self->{imdbDir}/stage5.data") || die "$self->{imdbDir}/stage5.data:$!";
2424
my $val=$movies{$dbkey};
2425
if ( !defined($val) ) {
2426
$self->error("genres list references unidentified title '$1'");
2429
$movies{$dbkey}.=$tab.$genres;
2432
# re-adjust target so progress bar doesn't seem too wonky
2433
if ( $. > $countEstimate ) {
2434
$countEstimate = $progress->target($.+100);
2435
$next_update=$progress->update($.);
2437
elsif ( $. > $next_update ) {
2438
$next_update=$progress->update($.);
2442
$progress->update($countEstimate) if Have_bar;
2447
# fill in placeholder if no genres were found
2448
for my $key (keys %movies) {
2449
my $val=$movies{$key};
2450
my $t=index($val, $tab);
2452
die "corrupt entry '$key' '$val'";
2454
if ( index($val, $tab, $t+1) == -1 ) {
2455
$movies{$key}.=$tab."<>";
2460
$self->status("merging in stage 6 data (ratings)..");
2462
my $countEstimate=$self->dbinfoGet("db_stat_ratings_count", 0);
2463
my $progress=Term::ProgressBar->new({name => "merging ratings",
2464
count => $countEstimate,
2467
$progress->minor(0) if Have_bar;
2468
$progress->max_update_rate(1) if Have_bar;
2471
open(IN, "< $self->{imdbDir}/stage6.data") || die "$self->{imdbDir}/stage6.data:$!";
2474
s/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$//o;
2476
my ($ratingDist, $ratingVotes, $ratingRank)=($2,$3,$4);
2478
my $val=$movies{$dbkey};
2479
if ( !defined($val) ) {
2480
$self->error("ratings list references unidentified title '$1'");
2483
$movies{$dbkey}.=$tab.$ratingDist.$tab.$ratingVotes.$tab.$ratingRank;
2486
# re-adjust target so progress bar doesn't seem too wonky
2487
if ( $. > $countEstimate ) {
2488
$countEstimate = $progress->target($.+100);
2489
$next_update=$progress->update($.);
2491
elsif ( $. > $next_update ) {
2492
$next_update=$progress->update($.);
2496
$progress->update($countEstimate) if Have_bar;
2500
# fill in placeholder if no genres were found
2501
for my $key (keys %movies) {
2502
my $val=$movies{$key};
2504
my $t=index($val, $tab);
2506
die "corrupt entry '$key' '$val'";
2508
my $j=index($val, $tab, $t+1);
2510
die "corrupt entry '$key' '$val'";
2512
if ( index($val, $tab, $j+1) == -1 ) {
2513
$movies{$key}.=$tab."<>".$tab."<>".$tab."<>";
2518
#unlink("$self->{imdbDir}/stage1.data");
2519
#unlink("$self->{imdbDir}/stage2.data");
2520
#unlink("$self->{imdbDir}/stage3.data");
2523
# note: not all movies end up with a cast, but we include them anyway.
2528
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2529
my $progress=Term::ProgressBar->new({name => "computing index",
2530
count => $countEstimate,
2533
$progress->minor(0) if Have_bar;
2534
$progress->max_update_rate(1) if Have_bar;
2538
for my $key (keys %movies) {
2541
# drop episode information - ex: {Twelve Angry Men (1954)}
2542
$dbkey=~s/\s*\{[^\}]+\}//go;
2544
# todo - this would make things easier
2545
# change double-quotes around title to be (made-for-tv) suffix instead
2546
if ( $dbkey=~m/^\"/o && #"
2547
$dbkey=~m/\"\s*\(/o ) { #"
2548
$dbkey.=" (tv_series)";
2550
# how rude, some entries have (TV) appearing more than once.
2551
$dbkey=~s/\(TV\)\s*\(TV\)$/(TV)/o;
2554
if ( $dbkey=~s/\s+\(TV\)$//o ) {
2555
$qualifier="tv_movie";
2557
elsif ( $dbkey=~s/\s+\(mini\) \(tv_series\)$// ) {
2558
$qualifier="tv_mini_series";
2560
elsif ( $dbkey=~s/\s+\(tv_series\)$// ) {
2561
$qualifier="tv_series";
2563
elsif ( $dbkey=~s/\s+\(mini\)$//o ) {
2564
$qualifier="tv_mini_series";
2566
elsif ( $dbkey=~s/\s+\(V\)$//o ) {
2567
$qualifier="video_movie";
2569
elsif ( $dbkey=~s/\s+\(VG\)$//o ) {
2570
$qualifier="video_game";
2575
#if ( $dbkey=~s/\s+\((tv_series|tv_mini_series|tv_movie|video_movie|video_game)\)$//o ) {
2581
if ( $title=~m/^\"/o && $title=~m/\"\s*\(/o ) { #"
2582
$title=~s/^\"//o; #"
2583
$title=~s/\"(\s*\()/$1/o; #"
2586
if ( $title=~s/\s+\((\d\d\d\d)\)$//o ||
2587
$title=~s/\s+\((\d\d\d\d)\/[IVX]+\)$//o ) {
2590
elsif ( $title=~s/\s+\((\?\?\?\?)\)$//o ||
2591
$title=~s/\s+\((\?\?\?\?)\/[IVX]+\)$//o ) {
2595
$self->error("movie list format failed to decode year from title '$title'");
2598
$title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
2600
my $hashkey=lc("$title ($year)");
2601
$hashkey=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg;
2603
if ( defined($movies{$hashkey}) ) {
2604
die "unable to place moviedb key for $key, report to xmltv-devel\@lists.sf.net";
2606
die "title \"$title\" contains a tab" if ( $title=~m/\t/o );
2607
#print "key:$dbkey\n\ttitle=$title\n\tyear=$year\n\tqualifier=$qualifier\n";
2608
#print "key $key: value=\"$movies{$key}\"\n";
2610
$nmovies{$hashkey}=$dbkey.$tab.$year.$tab.$qualifier.$tab.delete($movies{$key});
2614
# re-adjust target so progress bar doesn't seem too wonky
2615
if ( $count > $countEstimate ) {
2616
$countEstimate = $progress->target($count+100);
2617
$next_update=$progress->update($count);
2619
elsif ( $count > $next_update ) {
2620
$next_update=$progress->update($count);
2624
$progress->update($countEstimate) if Have_bar;
2626
if ( scalar(keys %movies) != 0 ) {
2627
die "what happened, we have keys left ?";
2633
my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2634
my $progress=Term::ProgressBar->new({name => "writing database",
2635
count => $countEstimate,
2638
$progress->minor(0) if Have_bar;
2639
$progress->max_update_rate(1) if Have_bar;
2642
open(IDX, "> $self->{moviedbIndex}") || die "$self->{moviedbIndex}:$!";
2643
open(DAT, "> $self->{moviedbData}") || die "$self->{moviedbData}:$!";
2645
for my $key (sort {$a cmp $b} keys %nmovies) {
2646
my $val=delete($nmovies{$key});
2647
#print "movie $key: $val\n";
2648
#$val=~s/^([^\t]+)\t([^\t]+)\t([^\t]+)\t//o || die "internal failure ($key:$val)";
2649
my ($dbkey, $year, $qualifier,$directors,$actors,@rest)=split('\t', $val);
2650
#die ("no 1") if ( !defined($dbkey));
2651
#die ("no 2") if ( !defined($year));
2652
#die ("no 3") if ( !defined($qualifier));
2653
#die ("no 4") if ( !defined($directors));
2654
#die ("no 5") if ( !defined($actors));
2655
#print "key:$key\n\ttitle=$dbkey\n\tyear=$year\n\tqualifier=$qualifier\n";
2657
#my ($directors, $actors)=split('\t', $val);
2661
if ( $directors eq "<>" ) {
2665
# sort directors by last name
2666
for my $name (sort {$a cmp $b} split('\|', $directors)) {
2672
#print " $dbkey: $val\n";
2673
if ( $actors eq "<>" ) {
2674
$details.=$tab."<>";
2679
# sort actors by billing
2680
# be warned, two actors may have the same billing level
2681
for my $c (sort {$a cmp $b} split('\|', $actors)) {
2682
my ($billing, $name)=split(':', $c);
2683
# remove Host/Narrators from end
2684
# BUG - should remove (I)'s from actors/actresses names when details are generated
2685
$name=~s/\s\([IVX]+\)\[/\[/o;
2686
$name=~s/\s\([IVX]+\)$//o;
2689
#print " $c: split gives'$billing' and '$name'\n";
2694
my $lineno=sprintf("%07d", $count);
2695
print IDX $key."\t".$dbkey."\t".$year."\t".$qualifier."\t".$lineno."\n";
2696
print DAT $lineno.":".$details."\t".join($tab, @rest)."\n";
2699
# re-adjust target so progress bar doesn't seem too wonky
2700
if ( $count > $countEstimate ) {
2701
$countEstimate = $progress->target($count+100);
2702
$next_update=$progress->update($count);
2704
elsif ( $count > $next_update ) {
2705
$next_update=$progress->update($count);
2709
$progress->update($countEstimate) if Have_bar;
2714
$self->dbinfoAdd("db_version", $XMLTV::IMDB::VERSION);
2716
if ( $self->dbinfoSave() ) {
2717
$self->error("$self->{moviedbInfo}:$!");
2721
$self->status("running quick sanity check on database indexes...");
2722
my $imdb=new XMLTV::IMDB('imdbDir' => $self->{imdbDir},
2723
'verbose' => $self->{verbose});
2725
if ( -e "$self->{moviedbOffline}" ) {
2726
unlink("$self->{moviedbOffline}");
2729
if ( my $errline=$imdb->sanityCheckDatabase() ) {
2730
open(OFF, "> $self->{moviedbOffline}") || die "$self->{moviedbOffline}:$!";
2731
print OFF $errline."\n";
2732
print OFF "one of the prep stages' must have produced corrupt data\n";
2733
print OFF "report the following details to xmltv-devel\@lists.sf.net\n";
2735
my $info=XMLTV::IMDB::loadDBInfo($self->{moviedbInfo});
2736
if ( ref $info eq 'SCALAR' ) {
2737
print OFF "\tdbinfo file corrupt\n";
2738
print OFF "\t$info";
2741
for my $key (sort keys %{$info}) {
2742
print OFF "\t$key:$info->{$key}\n";
2745
print OFF "database taken offline\n";
2747
open(OFF, "< $self->{moviedbOffline}") || die "$self->{moviedbOffline}:$!";
2755
$self->status("sanity intact :)");
2758
$self->error("tv_imdb: invalid stage $stage: only 1-5 are valid");
2762
$self->dbinfoAdd("seconds_to_complete_prep_stage_$stage", (time()-$startTime));
2763
if ( $self->dbinfoSave() ) {
2764
$self->error("$self->{moviedbInfo}:$!");
2772
my ($self, $stage)=@_;
2774
for (my $st=1 ; $st < $stage ; $st++ ) {
2775
if ( !$self->stageComplete($st) ) {
2776
$self->error("prep stages must be run in sequence..");
2777
$self->error("prepStage $st either has never been run or failed");
2778
$self->error("rerun tv_imdb with --prepStage=$st");
2783
if ( -f "$self->{moviedbInfo}" && $stage != 1 ) {
2784
my $ret=$self->dbinfoLoad();
2791
$self->redirect("$self->{imdbDir}/stage$stage.log") || return(1);
2792
my $ret=$self->invokeStage($stage);
2793
$self->redirect(undef);
2796
if ( $self->{errorCountInLog} == 0 ) {
2797
$self->status("prep stage $stage succeeded with no errors");
2800
$self->status("prep stage $stage succeeded with $self->{errorCountInLog} errors in $self->{imdbDir}/stage$stage.log");
2801
if ( $stage == 7 && $self->{errorCountInLog} > 30 && $self->{errorCountInLog} < 80 ) {
2802
$self->status("this stage commonly produces around 60 (or so) warnings because of imdb");
2803
$self->status("list file inconsistancies, they can usually be safely ignored");
2808
if ( $self->{errorCountInLog} == 0 ) {
2809
$self->status("prep stage $stage failed (with no logged errors)");
2812
$self->status("prep stage $stage failed with $self->{errorCountInLog} errors in $self->{imdbDir}/stage$stage.log");