2
# Licensed to the Apache Software Foundation (ASF) under one or more
3
# contributor license agreements. See the NOTICE file distributed with
4
# this work for additional information regarding copyright ownership.
5
# The ASF licenses this file to you under the Apache License, Version 2.0
6
# (the "License"); you may not use this file except in compliance with
7
# the License. You may obtain a copy of the License at:
9
# http://www.apache.org/licenses/LICENSE-2.0
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
20
Mail::SpamAssassin::Message - decode, render, and hold an RFC-2822 message
24
This module encapsulates an email message and allows access to the various MIME
25
message parts and message metadata.
27
The message structure, after initiating a parse() cycle, looks like this:
29
Message object, also top-level node in Message::Node tree
31
+---> Message::Node for other parts in MIME structure
32
| |---> [ more Message::Node parts ... ]
35
+---> Message::Metadata object to hold metadata
43
package Mail::SpamAssassin::Message;
49
use Mail::SpamAssassin;
50
use Mail::SpamAssassin::Message::Node;
51
use Mail::SpamAssassin::Message::Metadata;
52
use Mail::SpamAssassin::Constants qw(:sa);
53
use Mail::SpamAssassin::Logger;
57
@ISA = qw(Mail::SpamAssassin::Message::Node);
59
# ---------------------------------------------------------------------------
63
Creates a Mail::SpamAssassin::Message object. Takes a hash reference
64
as a parameter. The used hash key/value pairs are as follows:
66
C<message> is either undef (which will use STDIN), a scalar of the
67
entire message, an array reference of the message with 1 line per array
68
element, and either a file glob or IO::File object which holds the entire
69
contents of the message.
71
Note: The message is expected to generally be in RFC 2822 format, optionally
72
including an mbox message separator line (the "From " line) as the first line.
74
C<parse_now> specifies whether or not to create the MIME tree
75
at object-creation time or later as necessary.
77
The I<parse_now> option, by default, is set to false (0).
78
This allows SpamAssassin to not have to generate the tree of
79
Mail::SpamAssassin::Message::Node objects and their related data if the
80
tree is not going to be used. This is handy, for instance, when running
81
C<spamassassin -d>, which only needs the pristine header and body which
82
is always handled when the object is created.
84
C<subparse> specifies how many MIME recursion levels should be parsed.
89
# month mappings (ripped from Util.pm)
90
my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
91
jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);
93
# day of week mapping (starting from zero)
94
my @DAY_OF_WEEK = qw/Sun Mon Tue Wed Thu Fri Sat/ ;
98
$class = ref($class) || $class;
101
my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
102
my $parsenow = $opts->{'parsenow'} || 0;
103
my $normalize = $opts->{'normalize'} || 0;
105
# Specifies whether or not to parse message/rfc822 parts into its own tree.
106
# If the # > 0, it'll subparse, otherwise it won't. By default, do twenty
108
my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;
110
my $self = $class->SUPER::new({normalize=>$normalize});
112
$self->{tmpfiles} = [];
113
$self->{pristine_headers} = '';
114
$self->{pristine_body} = '';
115
$self->{mime_boundary_state} = {};
116
$self->{line_ending} = "\012";
117
$self->{master_deadline} = $opts->{'master_deadline'};
118
$self->{suppl_attrib} = $opts->{'suppl_attrib'};
122
# create the metadata holder class
123
$self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);
125
# Ok, go ahead and do the message "parsing"
127
# protect it from abuse ...
130
# Figure out how the message was passed to us, and deal with it.
132
if (ref $message eq 'ARRAY') {
133
@message = @{$message};
135
elsif (ref($message) eq 'GLOB' || ref($message) =~ /^IO::/) {
136
if (defined fileno $message) {
138
# sysread+split avoids a Perl I/O bug (Bug 5985)
139
# and is faster than (<$message>) by 10..25 %
140
# (a drawback is a short-term double storage of a text in $raw_str)
142
my($inbuf,$nread,$raw_str); $raw_str = '';
143
while ( $nread=sysread($message,$inbuf,16384) ) { $raw_str .= $inbuf }
144
defined $nread or die "error reading: $!";
145
@message = split(/^/m, $raw_str, -1);
147
dbg("message: empty message read") if $raw_str eq '';
150
elsif (ref $message) {
151
dbg("message: Input is a reference of unknown type!");
153
elsif (defined $message) {
154
@message = split(/^/m, $message, -1);
157
# Pull off mbox and mbx separators
158
# also deal with null messages
161
# if we get here, it means that the input was null, so fake the message
162
# content as a single newline...
164
} elsif ($message[0] =~ /^From\s+(?!:)/) {
165
# careful not to confuse with obsolete syntax which allowed WSP before ':'
166
# mbox formated mailbox
167
$self->{'mbox_sep'} = shift @message;
168
} elsif ($message[0] =~ MBX_SEPARATOR) {
171
# Munge the mbx message separator into mbox format as a sort of
172
# de facto portability standard in SA's internals. We need to
173
# to this so that Mail::SpamAssassin::Util::parse_rfc822_date
174
# can parse the date string...
175
if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
182
my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
185
if (/^From:[^<]*<([^>]+)>/) {
188
} elsif (/^From:\s*([^<> ]+)/) {
193
$self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
198
# Check to see if we should do CRLF instead of just LF
199
# For now, just check the first header and do whatever it does
200
if (@message && $message[0] =~ /\015\012/) {
201
$self->{line_ending} = "\015\012";
202
dbg("message: line ending changed to CRLF");
205
# Go through all the header fields of the message
209
# make sure not to lose the last header field when there is no body
211
my $current = $eof ? "\n" : shift @message;
213
if ( $current =~ /^[ \t]/ ) {
214
# This wasn't useful in terms of a rule, but we may want to treat it
215
# specially at some point. Perhaps ignore it?
216
#unless ($current =~ /\S/) {
217
# $self->{'obsolete_folding_whitespace'} = 1;
220
$header = '' if !defined $header; # header starts with a continuation!?
221
$header .= $current; # append continuations, no matter what
222
$self->{'pristine_headers'} .= $current;
224
else { # not a continuation
225
# Ok, there's a header here, let's go ahead and add it in.
226
if (defined $header) { # deal with a previous header field
227
my ($key, $value) = split (/:/s, $header, 2);
229
# If it's not a valid header (aka: not in the form "foo:bar"), skip it.
230
if (defined $value) {
231
$key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn
232
# limit the length of the pairs we store
233
if (length($key) > MAX_HEADER_KEY_LENGTH) {
234
$key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
235
$self->{'truncated_header'} = 1;
237
if (length($value) > MAX_HEADER_VALUE_LENGTH) {
238
$value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
239
$self->{'truncated_header'} = 1;
241
$self->header($key, $value);
245
if ($current =~ /^\r?$/) { # a regular end of a header section
247
$self->{'missing_head_body_separator'} = 1;
249
$self->{'pristine_headers'} .= $current;
253
elsif ($current =~ /^--/) { # mime boundary encountered, bail out
254
$self->{'missing_head_body_separator'} = 1;
255
unshift(@message, $current);
258
# should we assume entering a body on encountering invalid header field?
259
elsif ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
260
# A field name MUST be composed of printable US-ASCII characters
261
# (i.e., characters that have values between 33 (041) and 126 (176),
262
# inclusive), except colon (072). Obsolete header field syntax
263
# allowed WSP before a colon.
264
if (++$hdr_errors <= 3) {
265
# just consume but ignore a few invalid header fields
266
} else { # enough is enough...
267
$self->{'missing_head_body_separator'} = 1;
268
unshift(@message, $current);
273
# start collecting a new header field
275
$self->{'pristine_headers'} .= $current;
280
# Store the pristine body for later -- store as a copy since @message
281
# will get modified below
282
$self->{'pristine_body'} = join('', @message);
285
# also merge multiple blank lines into a single one
287
# iterate over lines in reverse order
288
for (my $cnt=$#message; $cnt>=0; $cnt--) {
289
$message[$cnt] =~ s/\015\012/\012/;
292
if ($message[$cnt] !~ /\S/) {
293
if (!defined $start) {
296
next unless $cnt == 0;
299
# line is not blank, or we've reached the beginning
301
# if we've got a series of blank lines, get rid of them
302
if (defined $start) {
303
my $num = $start-$cnt;
305
splice @message, $cnt+2, $num-1;
311
# Figure out the boundary
313
($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
314
dbg("message: main message type: ".$self->{'type'});
316
# parse queue, simple array of parts to parse:
317
# 0: part object, already in the tree
318
# 1: boundary used to focus body parsing
320
# 3: how many MIME subparts to parse down
322
$self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];
324
# If the message does need to get parsed, save off a copy of the body
325
# in a format we can easily parse later so we don't have to rip from
326
# pristine_body ... If we do want to parse now, go ahead and do so ...
335
# ---------------------------------------------------------------------------
339
Used to search the tree for specific MIME parts. See
340
I<Mail::SpamAssassin::Message::Node> for more details.
344
# Used to find any MIME parts whose simple content-type matches a given regexp
345
# Searches it's own and any children parts. Returns an array of MIME
346
# objects which match.
351
# ok, we need to do the parsing now...
352
$self->parse_body() if (exists $self->{'parse_queue'});
354
# and pass through to the Message::Node version of the method
355
return $self->SUPER::find_parts(@_);
358
# ---------------------------------------------------------------------------
360
=item get_pristine_header()
362
Returns pristine headers of the message. If no specific header name
363
is given as a parameter (case-insensitive), then all headers will be
364
returned as a scalar, including the blank line at the end of the headers.
366
If called in an array context, an array will be returned with each
367
specific header in a different element. In a scalar context, the last
368
specific header is returned.
370
ie: If 'Subject' is specified as the header, and there are 2 Subject
371
headers in a message, the last/bottom one in the message is returned in
372
scalar context or both are returned in array context.
374
Btw, returning the last header field (not the first) happens to be consistent
375
with DKIM signatures, which search for and cover multiple header fields
376
bottom-up according to the 'h' tag. Let's keep it this way.
378
Note: the returned header will include the ending newline and any embedded
383
sub get_pristine_header {
384
my ($self, $hdr) = @_;
386
return $self->{pristine_headers} if !defined $hdr || $hdr eq '';
388
$self->{pristine_headers} =~ /^\Q$hdr\E[ \t]*:[ \t]*(.*?\n(?![ \t]))/smgi;
389
# taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)
391
return $self->get_header($hdr);
392
} elsif (wantarray) {
399
=item get_mbox_separator()
401
Returns the mbox separator found in the message, or undef if there
406
sub get_mbox_separator {
407
return $_[0]->{mbox_sep};
412
Returns an array of the pristine message body, one line per array element.
418
my @ret = split(/^/m, $self->{pristine_body});
422
# ---------------------------------------------------------------------------
426
Returns a scalar of the entire pristine message.
432
return $self->{pristine_headers} . $self->{pristine_body};
435
=item get_pristine_body()
437
Returns a scalar of the pristine message body.
441
sub get_pristine_body {
443
return $self->{pristine_body};
446
# ---------------------------------------------------------------------------
448
=item extract_message_metadata($permsgstatus)
452
sub extract_message_metadata {
453
my ($self, $permsgstatus) = @_;
455
# do this only once per message, it can be expensive
456
if ($self->{already_extracted_metadata}) { return; }
457
$self->{already_extracted_metadata} = 1;
459
$self->{metadata}->extract ($self, $permsgstatus);
462
# ---------------------------------------------------------------------------
464
=item $str = get_metadata($hdr)
469
my ($self, $hdr) = @_;
470
if (!$self->{metadata}) {
471
warn "metadata: oops! get_metadata() called after finish_metadata()"; return;
473
# dbg("message: get_metadata - %s: %s", $hdr, defined $_ ? $_ : '<undef>')
474
# for $self->{metadata}->{strings}->{lc $hdr};
476
$self->{metadata}->{strings}->{lc $hdr};
479
=item put_metadata($hdr, $text)
484
my ($self, $hdr, $text) = @_;
485
if (!$self->{metadata}) {
486
warn "metadata: oops! put_metadata() called after finish_metadata()"; return;
488
# dbg("message: put_metadata - %s: %s", $hdr, $text);
489
$self->{metadata}->{strings}->{lc $hdr} = $text;
492
=item delete_metadata($hdr)
496
sub delete_metadata {
497
my ($self, $hdr) = @_;
498
if (!$self->{metadata}) {
499
warn "metadata: oops! delete_metadata() called after finish_metadata()"; return;
501
delete $self->{metadata}->{strings}->{lc $hdr};
504
=item $str = get_all_metadata()
508
sub get_all_metadata {
511
if (!$self->{metadata}) {
512
warn "metadata: oops! get_all_metadata() called after finish_metadata()"; return;
515
my $keys_ref = $self->{metadata}->{strings};
516
foreach my $key (sort keys %$keys_ref) {
517
my $val = $keys_ref->{$key};
518
$val = '' if !defined $val;
519
push (@ret, "$key: $val\n");
521
return (wantarray ? @ret : join('', @ret));
524
# ---------------------------------------------------------------------------
526
=item finish_metadata()
528
Destroys the metadata for this message. Once a message has been
529
scanned fully, the metadata is no longer required. Destroying
530
this will free up some memory.
534
sub finish_metadata {
536
if (defined ($self->{metadata})) {
537
$self->{metadata}->finish();
538
delete $self->{metadata};
544
Clean up an object so that it can be destroyed.
552
$self->finish_metadata();
554
# These will only be in the root Message node
555
delete $self->{'mime_boundary_state'};
556
delete $self->{'mbox_sep'};
557
delete $self->{'normalize'};
558
delete $self->{'pristine_body'};
559
delete $self->{'pristine_headers'};
560
delete $self->{'line_ending'};
561
delete $self->{'missing_head_body_separator'};
563
my @toclean = ( $self );
565
# Go ahead and clean up all of the Message::Node parts
566
while (my $part = shift @toclean) {
567
# bug 5557: windows requires tmp file be closed before it can be rm'd
568
if (ref $part->{'raw'} eq 'GLOB') {
569
close($part->{'raw'}) or die "error closing input file: $!";
572
# bug 5858: avoid memory leak with deep MIME structure
573
if (defined ($part->{metadata})) {
574
$part->{metadata}->finish();
575
delete $part->{metadata};
578
delete $part->{'headers'};
579
delete $part->{'raw_headers'};
580
delete $part->{'header_order'};
581
delete $part->{'raw'};
582
delete $part->{'decoded'};
583
delete $part->{'rendered'};
584
delete $part->{'visible_rendered'};
585
delete $part->{'invisible_rendered'};
586
delete $part->{'type'};
587
delete $part->{'rendered_type'};
589
# if there are children nodes, add them to the queue of nodes to clean up
590
if (exists $part->{'body_parts'}) {
591
push(@toclean, @{$part->{'body_parts'}});
592
delete $part->{'body_parts'};
596
# delete temporary files
597
if ($self->{'tmpfiles'}) {
598
for my $fn (@{$self->{'tmpfiles'}}) {
599
unlink($fn) or warn "cannot unlink $fn: $!";
601
delete $self->{'tmpfiles'};
605
# also use a DESTROY method, just to ensure (as much as possible) that
606
# temporary files are deleted even if the finish() method is omitted
609
# best practices: prevent potential calls to eval and to system routines
610
# in code of a DESTROY method from clobbering global variables $@ and $!
611
local($@,$!); # keep outer error handling unaffected by DESTROY
612
if ($self->{'tmpfiles'}) {
613
for my $fn (@{$self->{'tmpfiles'}}) {
614
unlink($fn) or dbg("message: cannot unlink $fn: $!");
619
# ---------------------------------------------------------------------------
623
Return a time_t value with the received date of the current message,
624
or current time if received time couldn't be determined.
631
return Mail::SpamAssassin::Util::receive_date(scalar $self->get_all_headers(0,1));
634
# ---------------------------------------------------------------------------
638
=head1 PARSING METHODS, NON-PUBLIC
640
These methods take a RFC2822-esque formatted message and create a tree
641
with all of the MIME body parts included. Those parts will be decoded
642
as necessary, and text/html parts will be rendered into a standard text
643
format, suitable for use in SpamAssassin.
649
parse_body() passes the body part that was passed in onto the
650
correct part parser, either _parse_multipart() for multipart/* parts,
651
or _parse_normal() for everything else. Multipart sections become the
652
root of sub-trees, while everything else becomes a leaf in the tree.
654
For multipart messages, the first call to parse_body() doesn't create a
655
new sub-tree and just uses the parent node to contain children. All other
656
calls to parse_body() will cause a new sub-tree root to be created and
657
children will exist underneath that root. (this is just so the tree
658
doesn't have a root node which points at the actual root node ...)
665
# This shouldn't happen, but just in case, abort.
666
return unless (exists $self->{'parse_queue'});
668
dbg("message: ---- MIME PARSER START ----");
670
while (my $toparse = shift @{$self->{'parse_queue'}}) {
671
# multipart sections are required to have a boundary set ... If this
672
# one doesn't, assume it's malformed and send it to be parsed as a
673
# non-multipart section
675
if ( $toparse->[0]->{'type'} =~ /^multipart\//i && defined $toparse->[1] && ($toparse->[3] > 0)) {
676
$self->_parse_multipart($toparse);
679
# If it's not multipart, go ahead and just deal with it.
680
$self->_parse_normal($toparse);
682
# bug 5041: exclude message/partial messages, however
683
if ($toparse->[0]->{'type'} =~ /^message\b/i &&
684
$toparse->[0]->{'type'} !~ /^message\/partial$/i &&
687
# Just decode the part, but we don't care about the result here.
688
$toparse->[0]->decode(0);
690
# bug 5051, bug 3748: sometimes message/* parts have no content,
691
# and we get stuck waiting for STDIN, which is bad. :(
692
if (defined $toparse->[0]->{'decoded'} &&
693
$toparse->[0]->{'decoded'} ne '')
695
# Ok, so this part is still semi-recursive, since M::SA::Message
696
# calls M::SA::Message, but we don't subparse the new message,
697
# and pull a sneaky "steal our child's queue" maneuver to deal
698
# with it on our own time. Reference the decoded array directly
701
my $msg_obj = Mail::SpamAssassin::Message->new({
702
message => $toparse->[0]->{'decoded'},
704
normalize => $self->{normalize},
705
subparse => $toparse->[3]-1,
708
# Add the new message to the current node
709
$toparse->[0]->add_body_part($msg_obj);
711
# now this is the sneaky bit ... steal the sub-message's parse_queue
712
# and add it to ours. then we'll handle the sub-message in our
713
# normal loop and get all the glory. muhaha. :)
714
push(@{$self->{'parse_queue'}}, @{$msg_obj->{'parse_queue'}});
715
delete $msg_obj->{'parse_queue'};
717
# Ok, we've subparsed, so go ahead and remove the raw and decoded
718
# data because we won't need them anymore (the tree under this part
719
# will have that data)
720
if (ref $toparse->[0]->{'raw'} eq 'GLOB') {
721
# Make sure we close it if it's a temp file -- Bug 5166
722
close($toparse->[0]->{'raw'})
723
or die "error closing input file: $!";
726
delete $toparse->[0]->{'raw'};
728
delete $toparse->[0]->{'decoded'};
734
dbg("message: ---- MIME PARSER END ----");
736
# we're done parsing, so remove the queue variable
737
delete $self->{'parse_queue'};
740
=item _parse_multipart()
742
Generate a root node, and for each child part call parse_body()
743
to generate the tree.
747
sub _parse_multipart {
748
my($self, $toparse) = @_;
750
my ($msg, $boundary, $body, $subparse) = @{$toparse};
752
# we're not supposed to be a leaf, so prep ourselves
753
$msg->{'body_parts'} = [];
755
# the next set of objects will be one level deeper
758
dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));
760
# NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
761
# compliant, but also allow possible malformations to still work. Please
762
# see Bugzilla bug 3749 for more information before making any changes!
764
# ignore preamble per RFC 1521, unless there's no boundary ...
765
if ( defined $boundary ) {
767
my $tmp_line = @{$body};
768
for ($line=0; $line < $tmp_line; $line++) {
769
# specifically look for an opening boundary
770
if ($body->[$line] =~ /^--\Q$boundary\E\s*$/) {
771
# Make note that we found the opening boundary
772
$self->{mime_boundary_state}->{$boundary} = 1;
774
# if the line after the opening boundary isn't a header, flag it.
775
# we need to make sure that there's actually another line though.
776
if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
777
$self->{'missing_mime_headers'} = 1;
784
# Found a boundary, ignore the preamble
785
if ( $line < $tmp_line ) {
786
splice @{$body}, 0, $line+1;
789
# Else, there's no boundary, so leave the whole part...
792
# prepare a new tree node
793
my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
798
my $line_count = @{$body};
799
foreach ( @{$body} ) {
800
# if we're on the last body line, or we find any boundary marker,
801
# deal with the mime part
802
if ( --$line_count == 0 || (defined $boundary && /^--\Q$boundary\E(?:--)?\s*$/) ) {
803
my $line = $_; # remember the last line
805
# per rfc 1521, the CRLF before the boundary is part of the boundary:
806
# NOTE: The CRLF preceding the encapsulation line is conceptually
807
# attached to the boundary so that it is possible to have a part
808
# that does not end with a CRLF (line break). Body parts that must
809
# be considered to end with line breaks, therefore, must have two
810
# CRLFs preceding the encapsulation line, the first of which is part
811
# of the preceding body part, and the second of which is part of the
812
# encapsulation boundary.
814
chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
815
splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
818
# Invalid parts can have no body, so fake in a blank body
824
($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
825
$p_boundary ||= $boundary;
826
dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
828
# we've created a new node object, so add it to the queue along with the
829
# text that belongs to that part, then add the new part to the current
830
# node to create the tree.
831
push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
832
$msg->add_body_part($part_msg);
834
# rfc 1521 says /^--boundary--$/, some MUAs may just require /^--boundary--/
835
# but this causes problems with horizontal lines when the boundary is
836
# made up of dashes as well, etc.
837
if (defined $boundary) {
838
if ($line =~ /^--\Q${boundary}\E--\s*$/) {
839
# Make a note that we've seen the end boundary
840
$self->{mime_boundary_state}->{$boundary}--;
843
elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
844
# if we aren't on an end boundary and there are still lines left, it
845
# means we hit a new start boundary. therefore, the next line ought
846
# to be a mime header. if it's not, mark it.
847
$self->{'missing_mime_headers'} = 1;
851
# make sure we start with a new clean node
853
$part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
861
# s/\s+$//; # bug 5127: don't clean this up (yet)
862
if (/^[\041-\071\073-\176]+[ \t]*:/) {
864
my ( $key, $value ) = split ( /:\s*/, $header, 2 );
865
$part_msg->header( $key, $value );
870
elsif (/^[ \t]/ && $header) {
871
# $_ =~ s/^\s*//; # bug 5127, again
877
my ( $key, $value ) = split ( /:\s*/, $header, 2 );
878
$part_msg->header( $key, $value );
882
# if there's a blank line separator, that's good. if there isn't,
883
# it's a body line, so drop through.
888
$self->{'missing_mime_head_body_separator'} = 1;
893
# we run into a perl bug if the lines are astronomically long (probably
894
# due to lots of regexp backtracking); so split any individual line
895
# over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML
896
# totally -- but IMHO the only reason a luser would use
897
# MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
898
while (length ($_) > MAX_BODY_LINE_LENGTH) {
899
push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
900
substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
902
push ( @{$part_array}, $_ );
905
# Look for a message epilogue
906
# originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB
907
# ham FPs were all "." on a line by itself.
908
# spams seem to only have NULL chars afterwards ?
910
for(; $line_count > 0; $line_count--) {
911
if ($body->[-$line_count] =~ /[^\s.]/) {
912
$self->{mime_epilogue_exists} = 1;
920
=item _parse_normal()
922
Generate a leaf node and add it to the parent.
927
my($self, $toparse) = @_;
929
my ($msg, $boundary, $body) = @{$toparse};
931
dbg("message: parsing normal part");
933
# 0: content-type, 1: boundary, 2: charset, 3: filename
934
my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
936
# multipart sections are required to have a boundary set ... If this
937
# one doesn't, assume it's malformed and revert to text/plain
938
$msg->{'type'} = ($ct[0] !~ m@^multipart/@i || defined $boundary ) ? $ct[0] : 'text/plain';
939
$msg->{'charset'} = $ct[2];
941
# attempt to figure out a name for this attachment if there is one ...
942
my $disp = $msg->header('content-disposition') || '';
943
if ($disp =~ /name="?([^\";]+)"?/i) {
947
$msg->{'name'} = $ct[3];
950
$msg->{'boundary'} = $boundary;
952
# If the part type is not one that we're likely to want to use, go
953
# ahead and write the part data out to a temp file -- why keep sucking
954
# up RAM with something we're not going to use?
956
if ($msg->{'type'} !~ m@^(?:text/(?:plain|html)$|message\b)@) {
958
($filepath, $msg->{'raw'}) = Mail::SpamAssassin::Util::secure_tmpfile();
961
# The temp file was created, add it to the list of pending deletions
962
# we cannot just delete immediately in the POSIX idiom, as this is
963
# unportable (to win32 at least)
964
push @{$self->{tmpfiles}}, $filepath;
965
$msg->{'raw'}->print(@{$body}) or die "error writing to $filepath: $!";
969
# if the part didn't get a temp file, go ahead and store the data in memory
970
if (!exists $msg->{'raw'}) {
971
$msg->{'raw'} = $body;
975
# ---------------------------------------------------------------------------
977
sub get_rendered_body_text_array {
980
if (exists $self->{text_rendered}) { return $self->{text_rendered}; }
982
$self->{text_rendered} = [];
984
# Find all parts which are leaves
985
my @parts = $self->find_parts(qr/./,1);
986
return $self->{text_rendered} unless @parts;
988
# the html metadata may have already been set, so let's not bother if it's
990
my $html_needs_setting = !exists $self->{metadata}->{html};
992
# Go through each part
993
my $text = $self->get_header ('subject') || "\n";
994
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
997
# put a blank line between parts ...
1000
my($type, $rnd) = $p->rendered(); # decode this part
1001
if ( defined $rnd ) {
1002
# Only text/* types are rendered ...
1005
# TVD - if there are multiple parts, what should we do?
1006
# right now, just use the last one. we may need to give some priority
1007
# at some point, ie: use text/html rendered if it exists, or
1008
# text/plain rendered as html otherwise.
1009
if ($html_needs_setting && $type eq 'text/html') {
1010
$self->{metadata}->{html} = $p->{html_results};
1015
# whitespace handling (warning: small changes have large effects!)
1016
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1017
$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
1018
$text =~ tr/\f/\n/; # form feeds => newline
1020
# warn "message: $text";
1022
my @textary = split_into_array_of_short_lines ($text);
1023
$self->{text_rendered} = \@textary;
1025
return $self->{text_rendered};
1028
# ---------------------------------------------------------------------------
1030
# TODO: possibly this should just replace get_rendered_body_text_array().
1031
# (although watch out, this one doesn't copy {html} to metadata)
1032
sub get_visible_rendered_body_text_array {
1035
if (exists $self->{text_visible_rendered}) {
1036
return $self->{text_visible_rendered};
1039
$self->{text_visible_rendered} = [];
1041
# Find all parts which are leaves
1042
my @parts = $self->find_parts(qr/./,1);
1043
return $self->{text_visible_rendered} unless @parts;
1045
# the html metadata may have already been set, so let's not bother if it's
1046
# already been done.
1047
my $html_needs_setting = !exists $self->{metadata}->{html};
1049
# Go through each part
1050
my $text = $self->get_header ('subject') || "\n";
1051
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1052
my $p = $parts[$pt];
1054
# put a blank line between parts ...
1057
my($type, $rnd) = $p->visible_rendered(); # decode this part
1058
if ( defined $rnd ) {
1059
# Only text/* types are rendered ...
1062
# TVD - if there are multiple parts, what should we do?
1063
# right now, just use the last one. we may need to give some priority
1064
# at some point, ie: use text/html rendered if it exists, or
1065
# text/plain rendered as html otherwise.
1066
if ($html_needs_setting && $type eq 'text/html') {
1067
$self->{metadata}->{html} = $p->{html_results};
1072
# whitespace handling (warning: small changes have large effects!)
1073
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1074
$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
1075
$text =~ tr/\f/\n/; # form feeds => newline
1077
my @textary = split_into_array_of_short_lines ($text);
1078
$self->{text_visible_rendered} = \@textary;
1080
return $self->{text_visible_rendered};
1083
sub get_invisible_rendered_body_text_array {
1086
if (exists $self->{text_invisible_rendered}) {
1087
return $self->{text_invisible_rendered};
1090
$self->{text_invisible_rendered} = [];
1092
# Find all parts which are leaves
1093
my @parts = $self->find_parts(qr/./,1);
1094
return $self->{text_invisible_rendered} unless @parts;
1096
# the html metadata may have already been set, so let's not bother if it's
1097
# already been done.
1098
my $html_needs_setting = !exists $self->{metadata}->{html};
1100
# Go through each part
1102
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1103
my $p = $parts[$pt];
1105
# put a blank line between parts ...
1106
$text .= "\n" if ( $text );
1108
my($type, $rnd) = $p->invisible_rendered(); # decode this part
1109
if ( defined $rnd ) {
1110
# Only text/* types are rendered ...
1113
# TVD - if there are multiple parts, what should we do?
1114
# right now, just use the last one. we may need to give some priority
1115
# at some point, ie: use text/html rendered if it exists, or
1116
# text/plain rendered as html otherwise.
1117
if ($html_needs_setting && $type eq 'text/html') {
1118
$self->{metadata}->{html} = $p->{html_results};
1123
# whitespace handling (warning: small changes have large effects!)
1124
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1125
$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
1126
$text =~ tr/\f/\n/; # form feeds => newline
1128
my @textary = split_into_array_of_short_lines ($text);
1129
$self->{text_invisible_rendered} = \@textary;
1131
return $self->{text_invisible_rendered};
1134
# ---------------------------------------------------------------------------
1136
sub get_decoded_body_text_array {
1139
if (defined $self->{text_decoded}) { return $self->{text_decoded}; }
1140
$self->{text_decoded} = [ ];
1142
# Find all parts which are leaves
1143
my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
1144
return $self->{text_decoded} unless @parts;
1146
# Go through each part
1147
for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1148
# bug 4843: skip text/calendar parts since they're usually an attachment
1150
next if ($parts[$pt]->{'type'} eq 'text/calendar');
1152
push(@{$self->{text_decoded}}, "\n") if ( @{$self->{text_decoded}} );
1153
push(@{$self->{text_decoded}},
1154
split_into_array_of_short_paragraphs($parts[$pt]->decode()));
1157
return $self->{text_decoded};
1160
# ---------------------------------------------------------------------------
1162
sub split_into_array_of_short_lines {
1164
foreach my $line (split (/^/m, $_[0])) {
1165
while (length ($line) > MAX_BODY_LINE_LENGTH) {
1166
# try splitting "nicely" so that we don't chop an url in half or
1167
# something. if there's no space, then just split at max length.
1168
my $length = rindex($line, ' ', MAX_BODY_LINE_LENGTH) + 1;
1169
$length ||= MAX_BODY_LINE_LENGTH;
1170
push (@result, substr($line, 0, $length, ''));
1172
push (@result, $line);
1177
# ---------------------------------------------------------------------------
1179
# split a text into array of paragraphs of sizes between
1180
# $chunk_size and 2 * $chunk_size, returning the resulting array
1182
sub split_into_array_of_short_paragraphs {
1184
my $chunk_size = 1024;
1185
my $text_l = length($_[0]);
1187
for ($ofs = 0; $text_l - $ofs > 2 * $chunk_size; $ofs = $j+1) {
1188
$j = index($_[0], "\n", $ofs+$chunk_size);
1190
$j = index($_[0], " ", $ofs+$chunk_size);
1191
if ($j < 0) { $j = $ofs+$chunk_size }
1193
push(@result, substr($_[0], $ofs, $j-$ofs+1));
1195
push(@result, substr($_[0], $ofs)) if $ofs < $text_l;
1199
# ---------------------------------------------------------------------------