69
71
The Mail::SpamAssassin::ArchiveIterator module will go through a set
70
72
of mbox files, mbx files, and directories (with a single message per
71
file) and generate a list of messages. It will then call the wanted
72
and results functions appropriately per message.
73
file) and generate a list of messages. It will then call the C<wanted_sub>
74
and C<result_sub> functions appropriately per message.
123
125
it's a good idea to set this to 0 if you can, as it imposes a performance
128
=item opt_skip_empty_messages
130
Set to 1 if you want to skip corrupt, 0-byte messages. The default is 0.
128
134
Set to 0 (default) if you don't want to use cached information to help speed
327
if (! $self->{opt_all} && -s INPUT > BIG_BYTES) {
332
my $stat_errn = stat(INPUT) ? 0 : 0+$!;
333
if ($stat_errn == ENOENT) {
334
dbg("archive-iterator: no such input ($where)");
337
elsif ($stat_errn != 0) {
338
warn "archive-iterator: no access to input ($where): $!";
341
elsif (!-f _ && !-c _ && !-p _) {
342
warn "archive-iterator: not a plain file (or char.spec. or pipe) ($where)";
346
if ($self->{opt_all}) {
349
# must check size while reading
350
} elsif (-s _ > BIG_BYTES) {
352
# note that -s can only deal with files, it returns 0 on char.spec. STDIN
328
353
info("archive-iterator: skipping large message\n");
354
close INPUT or die "error closing input file: $!";
336
if (!defined $header && /^\015?$/) {
363
while ( $nread=read(INPUT,$inbuf,16384) ) {
365
if (($len > BIG_BYTES) && !$self->{opt_all}) {
366
info("archive-iterator: skipping large message\n");
367
close INPUT or die "error closing input file: $!";
372
defined $nread or die "error reading: $!";
374
@msg = split(/^/m, $str, -1); undef $str;
375
for my $j (0..$#msg) {
376
if ($msg[$j] =~ /^\015?$/) { $header = $j; last }
378
close INPUT or die "error closing input file: $!";
342
380
if ($date == AI_TIME_UNKNOWN && $self->{determine_receive_date}) {
343
381
$date = Mail::SpamAssassin::Util::receive_date(join('', splice(@msg, 0, $header)));
356
394
$self->{access_problem} = 1;
359
seek(INPUT,$offset,0);
361
last if (substr($_,0,5) eq "From " && @msg);
397
seek(INPUT,$offset,0) or die "cannot reposition file to $offset: $!";
398
for ($!=0; <INPUT>; $!=0) {
399
last if (substr($_,0,5) eq "From " && @msg && /^From \S+ ?\S\S\S \S\S\S .\d .\d:\d\d:\d\d \d{4}/);
364
402
# skip too-big mails
365
403
if (! $self->{opt_all} && @msg > BIG_LINES) {
366
404
info("archive-iterator: skipping large message\n");
405
close INPUT or die "error closing input file: $!";
413
defined $_ || $!==0 or
414
$!==EBADF ? dbg("archive-iterator: error reading: $!")
415
: die "error reading: $!";
416
close INPUT or die "error closing input file: $!";
377
418
if ($date == AI_TIME_UNKNOWN && $self->{determine_receive_date}) {
378
419
$date = Mail::SpamAssassin::Util::receive_date(join('', splice(@msg, 0, $header)));
396
seek(INPUT, $offset, 0);
437
seek(INPUT,$offset,0) or die "cannot reposition file to $offset: $!";
439
for ($!=0; <INPUT>; $!=0) {
399
440
last if ($_ =~ MBX_SEPARATOR);
402
443
# skip mails that are too big
403
444
if (! $self->{opt_all} && @msg > BIG_LINES) {
404
445
info("archive-iterator: skipping large message\n");
446
close INPUT or die "error closing input file: $!";
454
defined $_ || $!==0 or
455
$!==EBADF ? dbg("archive-iterator: error reading: $!")
456
: die "error reading: $!";
457
close INPUT or die "error closing input file: $!";
415
459
if ($date == AI_TIME_UNKNOWN && $self->{determine_receive_date}) {
416
460
$date = Mail::SpamAssassin::Util::receive_date(join('', splice(@msg, 0, $header)));
440
484
if (ref $target eq 'HASH') {
441
485
# e.g. { target => $target, opt_foo => 1, opt_bar => 0.4 ... }
442
486
foreach my $k (keys %{$target}) {
443
next unless ($k =~ /^opt_/);
444
my $v = $target->{$k};
445
next unless defined $v;
488
$opts{$k} = $target->{$k};
448
491
$target = $target->{target};
486
529
# for this location only; 'detect' means they can differ for each location
487
my $thisformat = $format;
530
my $thisformat = $format;
489
532
if ($format eq 'detect') {
490
533
# detect the format
491
if (!-d $location && $location =~ /\.mbox/i) {
534
my $stat_errn = stat($location) ? 0 : 0+$!;
535
if ($stat_errn == ENOENT) {
536
$thisformat = 'file'; # actually, no file - to be detected later
538
elsif ($stat_errn != 0) {
539
warn "archive-iterator: no access to $location: $!";
540
$thisformat = 'file';
546
elsif ($location =~ /\.mbox/i) {
492
547
# filename indicates mbox
493
548
$thisformat = 'mbox';
495
elsif (!(-d $location)) {
496
$thisformat = 'file';
551
$thisformat = 'file';
636
687
sub _scan_directory {
637
688
my ($self, $class, $folder, $bkfunc) = @_;
641
opendir(DIR, $folder) || die "archive-iterator: can't open '$folder' dir: $!\n";
642
if (-f "$folder/cyrus.header") {
692
if (-d "$folder/new" && -d "$folder/cur" && -d "$folder/tmp") {
693
# Maildir format: bug 3003
694
for my $sub ("new", "cur") {
695
opendir (DIR, "$folder/$sub")
696
or die "Can't open '$folder/$sub' dir: $!\n";
697
# Don't learn from messages marked as deleted
698
# Or files starting with a leading dot
699
push @files, map { "$sub/$_" } grep { !/^\.|:2,.*T/ } readdir(DIR);
700
closedir(DIR) or die "error closing directory $folder: $!";
703
elsif (-f "$folder/cyrus.header") {
704
opendir(DIR, $folder)
705
or die "archive-iterator: can't open '$folder' dir: $!\n";
643
707
# cyrus metadata: http://unix.lsa.umich.edu/docs/imap/imap-lsa-srv_3.html
644
@files = grep { /^\S+$/ && !/^cyrus\.(?:index|header|cache|seen)/ }
708
@files = grep { $_ ne '.' && $_ ne '..' &&
709
/^\S+$/ && !/^cyrus\.(?:index|header|cache|seen)/ }
711
closedir(DIR) or die "error closing directory $folder: $!";
714
opendir(DIR, $folder)
715
or die "archive-iterator: can't open '$folder' dir: $!\n";
648
717
# ignore ,234 (deleted or refiled messages) and MH metadata dotfiles
649
718
@files = grep { !/^[,.]/ } readdir(DIR);
719
closedir(DIR) or die "error closing directory $folder: $!";
653
@files = grep { -f } map { "$folder/$_" } @files;
722
$_ = "$folder/$_" for @files;
656
725
# this is not a problem; no need to warn about it
661
730
$self->_create_cache('dir', $folder);
663
foreach my $mail (@files) {
664
$self->_scan_file($class, $mail, $bkfunc);
732
foreach my $file (@files) {
733
my $stat_errn = stat($file) ? 0 : 0+$!;
734
if ($stat_errn == ENOENT) {
737
elsif ($stat_errn != 0) {
738
warn "archive-iterator: no access to $file: $!";
740
elsif (-f _ || -c _ || -p _) {
741
$self->_scan_file($class, $file, $bkfunc);
744
push(@subdirs, $file);
747
warn "archive-iterator: $file is not a plain file or directory: $!";
750
@files = (); # release storage
752
# recurse into directories
753
foreach my $dir (@subdirs) {
754
$self->_scan_directory($class, $dir, $bkfunc);
667
757
if (defined $AICache) {
675
765
$self->_bump_scan_progress();
678
return unless $self->_message_is_useful_by_file_modtime($s[9]);
767
# only perform these stat() operations if we're not using a cache;
768
# it's faster to perform lookups in the cache, and more accurate
769
if (!defined $AICache) {
771
@s or warn "archive-iterator: no access to $mail: $!";
772
return unless $self->_message_is_useful_by_file_modtime($s[9]);
680
775
my $date = AI_TIME_UNKNOWN;
682
776
if ($self->{determine_receive_date}) {
683
777
unless (defined $AICache and $date = $AICache->check($mail)) {
778
# silently skip directories/non-files; some folders may
779
# contain extraneous dirs etc.
780
my $stat_errn = stat($mail) ? 0 : 0+$!;
781
if ($stat_errn != 0) {
782
warn "archive-iterator: no access to $mail: $!";
685
790
if (!_mail_open($mail)) {
686
791
$self->{access_problem} = 1;
794
for ($!=0; <INPUT>; $!=0) {
690
795
last if /^\015?$/s;
798
defined $_ || $!==0 or
799
$!==EBADF ? dbg("archive-iterator: error reading: $!")
800
: die "error reading: $!";
801
close INPUT or die "error closing input file: $!";
803
return if ($self->{opt_skip_empty_messages} && $header eq '');
694
805
$date = Mail::SpamAssassin::Util::receive_date($header);
695
806
if (defined $AICache) {
696
807
$AICache->update($mail, $date);
700
811
return if !$self->_message_is_useful_by_date($date);
701
812
return if !$self->_scanprob_says_scan();
815
return if ($self->{opt_skip_empty_messages} && (-z $mail));
704
818
&{$bkfunc}($self, $date, $class, 'f', $mail);
710
824
my ($self, $class, $folder, $bkfunc) = @_;
827
my $stat_errn = stat($folder) ? 0 : 0+$!;
828
if ($stat_errn == ENOENT) {
831
elsif ($stat_errn != 0) {
832
warn "archive-iterator: no access to $folder: $!";
835
push(@files, $folder);
714
838
# passed a directory of mboxes
715
839
$folder =~ s/\/\s*$//; #Remove trailing slash, if there
716
840
if (!opendir(DIR, $folder)) {
718
842
$self->{access_problem} = 1;
722
845
while ($_ = readdir(DIR)) {
723
if(/^[^\.]\S*$/ && ! -d "$folder/$_") {
846
next if $_ eq '.' || $_ eq '..' || !/^[^\.]\S*$/;
847
# hmmm, ignores folders with spaces in the name???
848
$stat_errn = stat("$folder/$_") ? 0 : 0+$!;
849
if ($stat_errn == ENOENT) {
852
elsif ($stat_errn != 0) {
853
warn "archive-iterator: no access to $folder/$_: $!";
724
856
push(@files, "$folder/$_");
859
closedir(DIR) or die "error closing directory $folder: $!";
730
push(@files, $folder);
862
warn "archive-iterator: $folder is not a plain file or directory: $!";
733
865
foreach my $file (@files) {
762
895
my $where = 0; # current byte offset
763
896
my $first = ''; # first line of message
764
897
my $header = ''; # header text
765
my $in_header = 0; # are in we a header?
898
my $in_header = 0; # are in we a header?
766
899
while (!eof INPUT) {
767
900
my $offset = $start; # byte offset of this message
768
901
my $header = $first; # remember first line
902
for ($!=0; <INPUT>; $!=0) {
770
903
if ($in_header) {
771
904
if (/^\015?$/s) {
778
if (substr($_,0,5) eq "From ") {
911
if (substr($_,0,5) eq "From " &&
912
/^From \S+ ?\S\S\S \S\S\S .\d .\d:\d\d:\d\d \d{4}/) {
782
916
$where = tell INPUT;
917
$where >= 0 or die "cannot obtain file position: $!";
785
920
$where = tell INPUT;
921
$where >= 0 or die "cannot obtain file position: $!";
923
defined $_ || $!==0 or
924
$!==EBADF ? dbg("archive-iterator: error reading: $!")
925
: die "error reading: $!";
927
# next if ($self->{opt_skip_empty_messages} && $header eq '');
788
928
$self->_bump_scan_progress();
789
929
$info->{$offset} = Mail::SpamAssassin::Util::receive_date($header);
932
close INPUT or die "error closing input file: $!";
795
935
while(my($k,$v) = each %{$info}) {
815
955
my ($self, $class, $folder, $bkfunc) = @_;
816
956
my (@files, $fp);
958
my $stat_errn = stat($folder) ? 0 : 0+$!;
959
if ($stat_errn == ENOENT) {
962
elsif ($stat_errn != 0) {
963
warn "archive-iterator: no access to $folder: $!";
966
push(@files, $folder);
819
969
# got passed a directory full of mbx folders.
820
970
$folder =~ s/\/\s*$//; # remove trailing slash, if there is one
821
971
if (!opendir(DIR, $folder)) {
823
973
$self->{access_problem} = 1;
827
976
while ($_ = readdir(DIR)) {
828
if(/^[^\.]\S*$/ && ! -d "$folder/$_") {
977
next if $_ eq '.' || $_ eq '..' || !/^[^\.]\S*$/;
978
# hmmm, ignores folders with spaces in the name???
979
$stat_errn = stat("$folder/$_") ? 0 : 0+$!;
980
if ($stat_errn == ENOENT) {
983
elsif ($stat_errn != 0) {
984
warn "archive-iterator: no access to $folder/$_: $!";
829
987
push(@files, "$folder/$_");
990
closedir(DIR) or die "error closing directory $folder: $!";
835
push(@files, $folder);
993
warn "archive-iterator: $folder is not a plain file or directory: $!";
838
996
foreach my $file (@files) {
867
1026
# check the mailbox is in mbx format
869
if ($fp !~ /\*mbx\*/) {
1027
$! = 0; $fp = <INPUT>;
1028
defined $fp || $!==0 or
1029
$!==EBADF ? dbg("archive-iterator: error reading: $!")
1030
: die "error reading: $!";
1032
die "archive-iterator: error: mailbox not in mbx format - empty!\n";
1033
} elsif ($fp !~ /\*mbx\*/) {
870
1034
die "archive-iterator: error: mailbox not in mbx format!\n";
873
1037
# skip mbx headers to the first email...
874
seek(INPUT, 2048, 0);
1038
seek(INPUT,2048,0) or die "cannot reposition file to 2048: $!";
876
1039
my $sep = MBX_SEPARATOR;
1041
for ($!=0; <INPUT>; $!=0) {
879
1042
if ($_ =~ /$sep/) {
880
1043
my $offset = tell INPUT;
1044
$offset >= 0 or die "cannot obtain file position: $!";
883
1047
# gather up the headers...
884
1048
my $header = '';
1049
for ($!=0; <INPUT>; $!=0) {
886
1050
last if (/^\015?$/s);
890
$self->_bump_scan_progress();
891
$info->{$offset} = Mail::SpamAssassin::Util::receive_date($header);
1053
defined $_ || $!==0 or
1054
$!==EBADF ? dbg("archive-iterator: error reading: $!")
1055
: die "error reading: $!";
1056
if (!($self->{opt_skip_empty_messages} && $header eq '')) {
1057
$self->_bump_scan_progress();
1058
$info->{$offset} = Mail::SpamAssassin::Util::receive_date($header);
893
1061
# go onto the next message
894
seek(INPUT, $offset + $size, 0);
1062
seek(INPUT, $offset + $size, 0)
1063
or die "cannot reposition file to $offset + $size: $!";
897
1066
die "archive-iterator: error: failure to read message body!\n";
1069
defined $_ || $!==0 or
1070
$!==EBADF ? dbg("archive-iterator: error reading: $!")
1071
: die "error reading: $!";
1072
close INPUT or die "error closing input file: $!";
903
1075
while(my($k,$v) = each %{$info}) {