~ubuntu-branches/ubuntu/utopic/spamassassin/utopic-proposed

Viewing changes to .pc/60_bug_684709/lib/Mail/SpamAssassin/Message.pm

Committer: Package Import Robot
Author(s): Felix Geyer
Date: 2012-09-07 12:05:50 UTC
mfrom: (5.1.16 sid)
Revision ID: package-import@ubuntu.com-20120907120550-vkk59m0zpbzsyzjs

Tags: 3.3.2-4ubuntu1

* Merge from Debian unstable (LP: #1040274), remaining changes:
  - Add debian/patches/90_missing_tld to restore missing tld that was
    inadvertently dropped from the list upstream
* debian/spamassassin.postinst: change upgrade version check from
  "<= 3.3.2-2" to "< 3.3.2-3~". This is necessary for upgrades from
  precise.

files added:
.pc/.quilt_patches

.pc/.quilt_series

.pc/55_disable_nagios_epm

.pc/55_disable_nagios_epm/sa-check_spamd.raw

.pc/60_bug_684709

.pc/60_bug_684709/lib

.pc/60_bug_684709/lib/Mail

.pc/60_bug_684709/lib/Mail/SpamAssassin

.pc/60_bug_684709/lib/Mail/SpamAssassin/Message.pm

.pc/90_pod_cleanup

.pc/90_pod_cleanup/lib

.pc/90_pod_cleanup/lib/Mail

.pc/90_pod_cleanup/lib/Mail/SpamAssassin

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Conf.pm

debian/patches/55_disable_nagios_epm

debian/patches/60_bug_684709

debian/patches/90_pod_cleanup

files modified:
.pc/applied-patches

debian/changelog

debian/control

debian/patches/series

debian/rules

debian/spamassassin.cron.daily

debian/spamassassin.postinst

lib/Mail/SpamAssassin/Conf.pm

lib/Mail/SpamAssassin/Message.pm

sa-check_spamd.raw

Show diffs side-by-side

added added

removed removed

.pc/60_bug_684709/lib/Mail/SpamAssassin/Message.pm

# <@LICENSE>

# Licensed to the Apache Software Foundation (ASF) under one or more

# contributor license agreements. See the NOTICE file distributed with

# this work for additional information regarding copyright ownership.

# The ASF licenses this file to you under the Apache License, Version 2.0

# (the "License"); you may not use this file except in compliance with

# the License. You may obtain a copy of the License at:

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Message - decode, render, and hold an RFC-2822 message

=head1 DESCRIPTION

This module encapsulates an email message and allows access to the various MIME

message parts and message metadata.

The message structure, after initiating a parse() cycle, looks like this:

Message object, also top-level node in Message::Node tree

+---> Message::Node for other parts in MIME structure

| |---> [ more Message::Node parts ... ]

| [ others ... ]

+---> Message::Metadata object to hold metadata

=head1 PUBLIC METHODS

=over 4

=cut

package Mail::SpamAssassin::Message;

use strict;

use warnings;

use re 'taint';

use Mail::SpamAssassin;

use Mail::SpamAssassin::Message::Node;

use Mail::SpamAssassin::Message::Metadata;

use Mail::SpamAssassin::Constants qw(:sa);

use Mail::SpamAssassin::Logger;

use vars qw(@ISA);

@ISA = qw(Mail::SpamAssassin::Message::Node);

# ---------------------------------------------------------------------------

=item new()

Creates a Mail::SpamAssassin::Message object. Takes a hash reference

as a parameter. The used hash key/value pairs are as follows:

C<message> is either undef (which will use STDIN), a scalar of the

entire message, an array reference of the message with 1 line per array

element, and either a file glob or IO::File object which holds the entire

contents of the message.

Note: The message is expected to generally be in RFC 2822 format, optionally

including an mbox message separator line (the "From " line) as the first line.

C<parse_now> specifies whether or not to create the MIME tree

at object-creation time or later as necessary.

The I<parse_now> option, by default, is set to false (0).

This allows SpamAssassin to not have to generate the tree of

Mail::SpamAssassin::Message::Node objects and their related data if the

tree is not going to be used. This is handy, for instance, when running

C<spamassassin -d>, which only needs the pristine header and body which

is always handled when the object is created.

C<subparse> specifies how many MIME recursion levels should be parsed.

Defaults to 20.

=cut

# month mappings (ripped from Util.pm)

my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,

jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);

# day of week mapping (starting from zero)

my @DAY_OF_WEEK = qw/Sun Mon Tue Wed Thu Fri Sat/ ;

sub new {

my $class = shift;

$class = ref($class) || $class;

100

my($opts) = @_;

101

my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;

102

my $parsenow = $opts->{'parsenow'} || 0;

103

my $normalize = $opts->{'normalize'} || 0;

104

105

# Specifies whether or not to parse message/rfc822 parts into its own tree.

106

# If the # > 0, it'll subparse, otherwise it won't. By default, do twenty

107

# levels deep.

108

my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;

109

110

my $self = $class->SUPER::new({normalize=>$normalize});

111

112

$self->{tmpfiles} = [];

113

$self->{pristine_headers} = '';

114

$self->{pristine_body} = '';

115

$self->{mime_boundary_state} = {};

116

$self->{line_ending} = "\012";

117

$self->{master_deadline} = $opts->{'master_deadline'};

118

$self->{suppl_attrib} = $opts->{'suppl_attrib'};

119

120

bless($self,$class);

121

122

# create the metadata holder class

123

$self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);

124

125

# Ok, go ahead and do the message "parsing"

126

127

# protect it from abuse ...

128

local $_;

129

130

# Figure out how the message was passed to us, and deal with it.

131

my @message;

132

if (ref $message eq 'ARRAY') {

133

@message = @{$message};

134

}

135

elsif (ref($message) eq 'GLOB' || ref($message) =~ /^IO::/) {

136

if (defined fileno $message) {

137

138

# sysread+split avoids a Perl I/O bug (Bug 5985)

139

# and is faster than (<$message>) by 10..25 %

140

# (a drawback is a short-term double storage of a text in $raw_str)

141

142

my($inbuf,$nread,$raw_str); $raw_str = '';

143

while ( $nread=sysread($message,$inbuf,16384) ) { $raw_str .= $inbuf }

144

defined $nread or die "error reading: $!";

145

@message = split(/^/m, $raw_str, -1);

146

147

dbg("message: empty message read") if $raw_str eq '';

148

}

149

}

150

elsif (ref $message) {

151

dbg("message: Input is a reference of unknown type!");

152

}

153

elsif (defined $message) {

154

@message = split(/^/m, $message, -1);

155

}

156

157

# Pull off mbox and mbx separators

158

# also deal with null messages

159

if (!@message) {

160

# bug 4884:

161

# if we get here, it means that the input was null, so fake the message

162

# content as a single newline...

163

@message = ("\n");

164

} elsif ($message[0] =~ /^From\s+(?!:)/) {

165

# careful not to confuse with obsolete syntax which allowed WSP before ':'

166

# mbox formated mailbox

167

$self->{'mbox_sep'} = shift @message;

168

} elsif ($message[0] =~ MBX_SEPARATOR) {

169

$_ = shift @message;

170

171

# Munge the mbx message separator into mbox format as a sort of

172

# de facto portability standard in SA's internals. We need to

173

# to this so that Mail::SpamAssassin::Util::parse_rfc822_date

174

# can parse the date string...

175

if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {

176

# $1 = day of month

177

# $2 = month (text)

178

# $3 = year

179

# $4 = hour

180

# $5 = min

181

# $6 = sec

182

my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));

183

my $address;

184

foreach (@message) {

185

if (/^From:[^<]*<([^>]+)>/) {

186

$address = $1;

187

last;

188

} elsif (/^From:\s*([^<> ]+)/) {

189

$address = $1;

190

last;

191

}

192

}

193

$self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";

194

}

195

}

196

197

# bug 4363

198

# Check to see if we should do CRLF instead of just LF

199

# For now, just check the first header and do whatever it does

200

if (@message && $message[0] =~ /\015\012/) {

201

$self->{line_ending} = "\015\012";

202

dbg("message: line ending changed to CRLF");

203

}

204

205

# Go through all the header fields of the message

206

my $hdr_errors = 0;

207

my $header;

208

for (;;) {

209

# make sure not to lose the last header field when there is no body

210

my $eof = !@message;

211

my $current = $eof ? "\n" : shift @message;

212

213

if ( $current =~ /^[ \t]/ ) {

214

# This wasn't useful in terms of a rule, but we may want to treat it

215

# specially at some point. Perhaps ignore it?

216

#unless ($current =~ /\S/) {

217

# $self->{'obsolete_folding_whitespace'} = 1;

218

219

220

$header = '' if !defined $header; # header starts with a continuation!?

221

$header .= $current; # append continuations, no matter what

222

$self->{'pristine_headers'} .= $current;

223

}

224

else { # not a continuation

225

# Ok, there's a header here, let's go ahead and add it in.

226

if (defined $header) { # deal with a previous header field

227

my ($key, $value) = split (/:/s, $header, 2);

228

229

# If it's not a valid header (aka: not in the form "foo:bar"), skip it.

230

if (defined $value) {

231

$key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn

232

# limit the length of the pairs we store

233

if (length($key) > MAX_HEADER_KEY_LENGTH) {

234

$key = substr($key, 0, MAX_HEADER_KEY_LENGTH);

235

$self->{'truncated_header'} = 1;

236

}

237

if (length($value) > MAX_HEADER_VALUE_LENGTH) {

238

$value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);

239

$self->{'truncated_header'} = 1;

240

}

241

$self->header($key, $value);

242

}

243

}

244

245

if ($current =~ /^\r?$/) { # a regular end of a header section

246

if ($eof) {

247

$self->{'missing_head_body_separator'} = 1;

248

} else {

249

$self->{'pristine_headers'} .= $current;

250

}

251

last;

252

}

253

elsif ($current =~ /^--/) { # mime boundary encountered, bail out

254

$self->{'missing_head_body_separator'} = 1;

255

unshift(@message, $current);

256

last;

257

}

258

# should we assume entering a body on encountering invalid header field?

259

elsif ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {

260

# A field name MUST be composed of printable US-ASCII characters

261

# (i.e., characters that have values between 33 (041) and 126 (176),

262

# inclusive), except colon (072). Obsolete header field syntax

263

# allowed WSP before a colon.

264

if (++$hdr_errors <= 3) {

265

# just consume but ignore a few invalid header fields

266

} else { # enough is enough...

267

$self->{'missing_head_body_separator'} = 1;

268

unshift(@message, $current);

269

last;

270

}

271

}

272

273

# start collecting a new header field

274

$header = $current;

275

$self->{'pristine_headers'} .= $current;

276

}

277

}

278

undef $header;

279

280

# Store the pristine body for later -- store as a copy since @message

281

# will get modified below

282

$self->{'pristine_body'} = join('', @message);

283

284

# CRLF -> LF

285

# also merge multiple blank lines into a single one

286

my $start;

287

# iterate over lines in reverse order

288

for (my $cnt=$#message; $cnt>=0; $cnt--) {

289

$message[$cnt] =~ s/\015\012/\012/;

290

291

# line is blank

292

if ($message[$cnt] !~ /\S/) {

293

if (!defined $start) {

294

$start=$cnt;

295

}

296

next unless $cnt == 0;

297

}

298

299

# line is not blank, or we've reached the beginning

300

301

# if we've got a series of blank lines, get rid of them

302

if (defined $start) {

303

my $num = $start-$cnt;

304

if ($num > 10) {

305

splice @message, $cnt+2, $num-1;

306

}

307

undef $start;

308

}

309

}

310

311

# Figure out the boundary

312

my ($boundary);

313

($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));

314

dbg("message: main message type: ".$self->{'type'});

315

316

# parse queue, simple array of parts to parse:

317

# 0: part object, already in the tree

318

# 1: boundary used to focus body parsing

319

# 2: message content

320

# 3: how many MIME subparts to parse down

321

322

$self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];

323

324

# If the message does need to get parsed, save off a copy of the body

325

# in a format we can easily parse later so we don't have to rip from

326

# pristine_body ... If we do want to parse now, go ahead and do so ...

327

328

if ($parsenow) {

329

$self->parse_body();

330

}

331

332

$self;

333

}

334

335

# ---------------------------------------------------------------------------

336

337

=item find_parts()

338

339

Used to search the tree for specific MIME parts. See

340

I<Mail::SpamAssassin::Message::Node> for more details.

341

342

=cut

343

344

# Used to find any MIME parts whose simple content-type matches a given regexp

345

# Searches it's own and any children parts. Returns an array of MIME

346

# objects which match.

347

348

sub find_parts {

349

my $self = shift;

350

351

# ok, we need to do the parsing now...

352

$self->parse_body() if (exists $self->{'parse_queue'});

353

354

# and pass through to the Message::Node version of the method

355

return $self->SUPER::find_parts(@_);

356

}

357

358

# ---------------------------------------------------------------------------

359

360

=item get_pristine_header()

361

362

Returns pristine headers of the message. If no specific header name

363

is given as a parameter (case-insensitive), then all headers will be

364

returned as a scalar, including the blank line at the end of the headers.

365

366

If called in an array context, an array will be returned with each

367

specific header in a different element. In a scalar context, the last

368

specific header is returned.

369

370

ie: If 'Subject' is specified as the header, and there are 2 Subject

371

headers in a message, the last/bottom one in the message is returned in

372

scalar context or both are returned in array context.

373

374

Btw, returning the last header field (not the first) happens to be consistent

375

with DKIM signatures, which search for and cover multiple header fields

376

bottom-up according to the 'h' tag. Let's keep it this way.

377

378

Note: the returned header will include the ending newline and any embedded

379

whitespace folding.

380

381

=cut

382

383

sub get_pristine_header {

384

my ($self, $hdr) = @_;

385

386

return $self->{pristine_headers} if !defined $hdr || $hdr eq '';

387

my(@ret) =

388

$self->{pristine_headers} =~ /^\Q$hdr\E[ \t]*:[ \t]*(.*?\n(?![ \t]))/smgi;

389

# taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)

390

if (!@ret) {

391

return $self->get_header($hdr);

392

} elsif (wantarray) {

393

return @ret;

394

} else {

395

return $ret[-1];

396

}

397

}

398

399

=item get_mbox_separator()

400

401

Returns the mbox separator found in the message, or undef if there

402

wasn't one.

403

404

=cut

405

406

sub get_mbox_separator {

407

return $_[0]->{mbox_sep};

408

}

409

410

=item get_body()

411

412

Returns an array of the pristine message body, one line per array element.

413

414

=cut

415

416

sub get_body {

417

my ($self) = @_;

418

my @ret = split(/^/m, $self->{pristine_body});

419

return \@ret;

420

}

421

422

# ---------------------------------------------------------------------------

423

424

=item get_pristine()

425

426

Returns a scalar of the entire pristine message.

427

428

=cut

429

430

sub get_pristine {

431

my ($self) = @_;

432

return $self->{pristine_headers} . $self->{pristine_body};

433

}

434

435

=item get_pristine_body()

436

437

Returns a scalar of the pristine message body.

438

439

=cut

440

441

sub get_pristine_body {

442

my ($self) = @_;

443

return $self->{pristine_body};

444

}

445

446

# ---------------------------------------------------------------------------

447

448

=item extract_message_metadata($permsgstatus)

449

450

=cut

451

452

sub extract_message_metadata {

453

my ($self, $permsgstatus) = @_;

454

455

# do this only once per message, it can be expensive

456

if ($self->{already_extracted_metadata}) { return; }

457

$self->{already_extracted_metadata} = 1;

458

459

$self->{metadata}->extract ($self, $permsgstatus);

460

}

461

462

# ---------------------------------------------------------------------------

463

464

=item $str = get_metadata($hdr)

465

466

=cut

467

468

sub get_metadata {

469

my ($self, $hdr) = @_;

470

if (!$self->{metadata}) {

471

warn "metadata: oops! get_metadata() called after finish_metadata()"; return;

472

}

473

# dbg("message: get_metadata - %s: %s", $hdr, defined $_ ? $_ : '<undef>')

474

# for $self->{metadata}->{strings}->{lc $hdr};

475

476

$self->{metadata}->{strings}->{lc $hdr};

477

}

478

479

=item put_metadata($hdr, $text)

480

481

=cut

482

483

sub put_metadata {

484

my ($self, $hdr, $text) = @_;

485

if (!$self->{metadata}) {

486

warn "metadata: oops! put_metadata() called after finish_metadata()"; return;

487

}

488

# dbg("message: put_metadata - %s: %s", $hdr, $text);

489

$self->{metadata}->{strings}->{lc $hdr} = $text;

490

}

491

492

=item delete_metadata($hdr)

493

494

=cut

495

496

sub delete_metadata {

497

my ($self, $hdr) = @_;

498

if (!$self->{metadata}) {

499

warn "metadata: oops! delete_metadata() called after finish_metadata()"; return;

500

}

501

delete $self->{metadata}->{strings}->{lc $hdr};

502

}

503

504

=item $str = get_all_metadata()

505

506

=cut

507

508

sub get_all_metadata {

509

my ($self) = @_;

510

511

if (!$self->{metadata}) {

512

warn "metadata: oops! get_all_metadata() called after finish_metadata()"; return;

513

}

514

my @ret;

515

my $keys_ref = $self->{metadata}->{strings};

516

foreach my $key (sort keys %$keys_ref) {

517

my $val = $keys_ref->{$key};

518

$val = '' if !defined $val;

519

push (@ret, "$key: $val\n");

520

}

521

return (wantarray ? @ret : join('', @ret));

522

}

523

524

# ---------------------------------------------------------------------------

525

526

=item finish_metadata()

527

528

Destroys the metadata for this message. Once a message has been

529

scanned fully, the metadata is no longer required. Destroying

530

this will free up some memory.

531

532

=cut

533

534

sub finish_metadata {

535

my ($self) = @_;

536

if (defined ($self->{metadata})) {

537

$self->{metadata}->finish();

538

delete $self->{metadata};

539

}

540

}

541

542

=item finish()

543

544

Clean up an object so that it can be destroyed.

545

546

=cut

547

548

sub finish {

549

my ($self) = @_;

550

551

# Clean ourself up

552

$self->finish_metadata();

553

554

# These will only be in the root Message node

555

delete $self->{'mime_boundary_state'};

556

delete $self->{'mbox_sep'};

557

delete $self->{'normalize'};

558

delete $self->{'pristine_body'};

559

delete $self->{'pristine_headers'};

560

delete $self->{'line_ending'};

561

delete $self->{'missing_head_body_separator'};

562

563

my @toclean = ( $self );

564

565

# Go ahead and clean up all of the Message::Node parts

566

while (my $part = shift @toclean) {

567

# bug 5557: windows requires tmp file be closed before it can be rm'd

568

if (ref $part->{'raw'} eq 'GLOB') {

569

close($part->{'raw'}) or die "error closing input file: $!";

570

}

571

572

# bug 5858: avoid memory leak with deep MIME structure

573

if (defined ($part->{metadata})) {

574

$part->{metadata}->finish();

575

delete $part->{metadata};

576

}

577

578

delete $part->{'headers'};

579

delete $part->{'raw_headers'};

580

delete $part->{'header_order'};

581

delete $part->{'raw'};

582

delete $part->{'decoded'};

583

delete $part->{'rendered'};

584

delete $part->{'visible_rendered'};

585

delete $part->{'invisible_rendered'};

586

delete $part->{'type'};

587

delete $part->{'rendered_type'};

588

589

# if there are children nodes, add them to the queue of nodes to clean up

590

if (exists $part->{'body_parts'}) {

591

push(@toclean, @{$part->{'body_parts'}});

592

delete $part->{'body_parts'};

593

}

594

}

595

596

# delete temporary files

597

if ($self->{'tmpfiles'}) {

598

for my $fn (@{$self->{'tmpfiles'}}) {

599

unlink($fn) or warn "cannot unlink $fn: $!";

600

}

601

delete $self->{'tmpfiles'};

602

}

603

}

604

605

# also use a DESTROY method, just to ensure (as much as possible) that

606

# temporary files are deleted even if the finish() method is omitted

607

sub DESTROY {

608

my $self = shift;

609

# best practices: prevent potential calls to eval and to system routines

610

# in code of a DESTROY method from clobbering global variables $@ and $!

611

local($@,$!); # keep outer error handling unaffected by DESTROY

612

if ($self->{'tmpfiles'}) {

613

for my $fn (@{$self->{'tmpfiles'}}) {

614

unlink($fn) or dbg("message: cannot unlink $fn: $!");

615

}

616

}

617

}

618

619

# ---------------------------------------------------------------------------

620

621

=item receive_date()

622

623

Return a time_t value with the received date of the current message,

624

or current time if received time couldn't be determined.

625

626

=cut

627

628

sub receive_date {

629

my($self) = @_;

630

631

return Mail::SpamAssassin::Util::receive_date(scalar $self->get_all_headers(0,1));

632

}

633

634

# ---------------------------------------------------------------------------

635

636

=back

637

638

=head1 PARSING METHODS, NON-PUBLIC

639

640

These methods take a RFC2822-esque formatted message and create a tree

641

with all of the MIME body parts included. Those parts will be decoded

642

as necessary, and text/html parts will be rendered into a standard text

643

format, suitable for use in SpamAssassin.

644

645

=over 4

646

647

=item parse_body()

648

649

parse_body() passes the body part that was passed in onto the

650

correct part parser, either _parse_multipart() for multipart/* parts,

651

or _parse_normal() for everything else. Multipart sections become the

652

root of sub-trees, while everything else becomes a leaf in the tree.

653

654

For multipart messages, the first call to parse_body() doesn't create a

655

new sub-tree and just uses the parent node to contain children. All other

656

calls to parse_body() will cause a new sub-tree root to be created and

657

children will exist underneath that root. (this is just so the tree

658

doesn't have a root node which points at the actual root node ...)

659

660

=cut

661

662

sub parse_body {

663

my($self) = @_;

664

665

# This shouldn't happen, but just in case, abort.

666

return unless (exists $self->{'parse_queue'});

667

668

dbg("message: ---- MIME PARSER START ----");

669

670

while (my $toparse = shift @{$self->{'parse_queue'}}) {

671

# multipart sections are required to have a boundary set ... If this

672

# one doesn't, assume it's malformed and send it to be parsed as a

673

# non-multipart section

674

675

if ( $toparse->[0]->{'type'} =~ /^multipart\//i && defined $toparse->[1] && ($toparse->[3] > 0)) {

676

$self->_parse_multipart($toparse);

677

}

678

else {

679

# If it's not multipart, go ahead and just deal with it.

680

$self->_parse_normal($toparse);

681

682

# bug 5041: exclude message/partial messages, however

683

if ($toparse->[0]->{'type'} =~ /^message\b/i &&

684

$toparse->[0]->{'type'} !~ /^message\/partial$/i &&

685

($toparse->[3] > 0))

686

{

687

# Just decode the part, but we don't care about the result here.

688

$toparse->[0]->decode(0);

689

690

# bug 5051, bug 3748: sometimes message/* parts have no content,

691

# and we get stuck waiting for STDIN, which is bad. :(

692

if (defined $toparse->[0]->{'decoded'} &&

693

$toparse->[0]->{'decoded'} ne '')

694

{

695

# Ok, so this part is still semi-recursive, since M::SA::Message

696

# calls M::SA::Message, but we don't subparse the new message,

697

# and pull a sneaky "steal our child's queue" maneuver to deal

698

# with it on our own time. Reference the decoded array directly

699

# since it's faster.

700

701

my $msg_obj = Mail::SpamAssassin::Message->new({

702

message => $toparse->[0]->{'decoded'},

703

parsenow => 0,

704

normalize => $self->{normalize},

705

subparse => $toparse->[3]-1,

706

});

707

708

# Add the new message to the current node

709

$toparse->[0]->add_body_part($msg_obj);

710

711

# now this is the sneaky bit ... steal the sub-message's parse_queue

712

# and add it to ours. then we'll handle the sub-message in our

713

# normal loop and get all the glory. muhaha. :)

714

push(@{$self->{'parse_queue'}}, @{$msg_obj->{'parse_queue'}});

715

delete $msg_obj->{'parse_queue'};

716

717

# Ok, we've subparsed, so go ahead and remove the raw and decoded

718

# data because we won't need them anymore (the tree under this part

719

# will have that data)

720

if (ref $toparse->[0]->{'raw'} eq 'GLOB') {

721

# Make sure we close it if it's a temp file -- Bug 5166

722

close($toparse->[0]->{'raw'})

723

or die "error closing input file: $!";

724

}

725

726

delete $toparse->[0]->{'raw'};

727

728

delete $toparse->[0]->{'decoded'};

729

}

730

}

731

}

732

}

733

734

dbg("message: ---- MIME PARSER END ----");

735

736

# we're done parsing, so remove the queue variable

737

delete $self->{'parse_queue'};

738

}

739

740

=item _parse_multipart()

741

742

Generate a root node, and for each child part call parse_body()

743

to generate the tree.

744

745

=cut

746

747

sub _parse_multipart {

748

my($self, $toparse) = @_;

749

750

my ($msg, $boundary, $body, $subparse) = @{$toparse};

751

752

# we're not supposed to be a leaf, so prep ourselves

753

$msg->{'body_parts'} = [];

754

755

# the next set of objects will be one level deeper

756

$subparse--;

757

758

dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));

759

760

# NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521

761

# compliant, but also allow possible malformations to still work. Please

762

# see Bugzilla bug 3749 for more information before making any changes!

763

764

# ignore preamble per RFC 1521, unless there's no boundary ...

765

if ( defined $boundary ) {

766

my $line;

767

my $tmp_line = @{$body};

768

for ($line=0; $line < $tmp_line; $line++) {

769

# specifically look for an opening boundary

770

if ($body->[$line] =~ /^--\Q$boundary\E\s*$/) {

771

# Make note that we found the opening boundary

772

$self->{mime_boundary_state}->{$boundary} = 1;

773

774

# if the line after the opening boundary isn't a header, flag it.

775

# we need to make sure that there's actually another line though.

776

if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {

777

$self->{'missing_mime_headers'} = 1;

778

}

779

780

last;

781

}

782

}

783

784

# Found a boundary, ignore the preamble

785

if ( $line < $tmp_line ) {

786

splice @{$body}, 0, $line+1;

787

}

788

789

# Else, there's no boundary, so leave the whole part...

790

}

791

792

# prepare a new tree node

793

my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });

794

my $in_body = 0;

795

my $header;

796

my $part_array;

797

798

my $line_count = @{$body};

799

foreach ( @{$body} ) {

800

# if we're on the last body line, or we find any boundary marker,

801

# deal with the mime part

802

if ( --$line_count == 0 || (defined $boundary && /^--\Q$boundary\E(?:--)?\s*$/) ) {

803

my $line = $_; # remember the last line

804

805

# per rfc 1521, the CRLF before the boundary is part of the boundary:

806

# NOTE: The CRLF preceding the encapsulation line is conceptually

807

# attached to the boundary so that it is possible to have a part

808

# that does not end with a CRLF (line break). Body parts that must

809

# be considered to end with line breaks, therefore, must have two

810

# CRLFs preceding the encapsulation line, the first of which is part

811

# of the preceding body part, and the second of which is part of the

812

# encapsulation boundary.

813

if ($part_array) {

814

chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary

815

splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...

816

}

817

else {

818

# Invalid parts can have no body, so fake in a blank body

819

# in that case.

820

$part_array = [];

821

}

822

823

my($p_boundary);

824

($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));

825

$p_boundary ||= $boundary;

826

dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));

827

828

# we've created a new node object, so add it to the queue along with the

829

# text that belongs to that part, then add the new part to the current

830

# node to create the tree.

831

push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);

832

$msg->add_body_part($part_msg);

833

834

# rfc 1521 says /^--boundary--$/, some MUAs may just require /^--boundary--/

835

# but this causes problems with horizontal lines when the boundary is

836

# made up of dashes as well, etc.

837

if (defined $boundary) {

838

if ($line =~ /^--\Q${boundary}\E--\s*$/) {

839

# Make a note that we've seen the end boundary

840

$self->{mime_boundary_state}->{$boundary}--;

841

last;

842

}

843

elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {

844

# if we aren't on an end boundary and there are still lines left, it

845

# means we hit a new start boundary. therefore, the next line ought

846

# to be a mime header. if it's not, mark it.

847

$self->{'missing_mime_headers'} = 1;

848

}

849

}

850

851

# make sure we start with a new clean node

852

$in_body = 0;

853

$part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });

854

undef $part_array;

855

undef $header;

856

857

next;

858

}

859

860

if (!$in_body) {

861

# s/\s+$//; # bug 5127: don't clean this up (yet)

862

if (/^[\041-\071\073-\176]+[ \t]*:/) {

863

if ($header) {

864

my ( $key, $value ) = split ( /:\s*/, $header, 2 );

865

$part_msg->header( $key, $value );

866

}

867

$header = $_;

868

next;

869

}

870

elsif (/^[ \t]/ && $header) {

871

# $_ =~ s/^\s*//; # bug 5127, again

872

$header .= $_;

873

next;

874

}

875

else {

876

if ($header) {

877

my ( $key, $value ) = split ( /:\s*/, $header, 2 );

878

$part_msg->header( $key, $value );

879

}

880

$in_body = 1;

881

882

# if there's a blank line separator, that's good. if there isn't,

883

# it's a body line, so drop through.

884

if (/^\r?$/) {

885

next;

886

}

887

else {

888

$self->{'missing_mime_head_body_separator'} = 1;

889

}

890

}

891

}

892

893

# we run into a perl bug if the lines are astronomically long (probably

894

# due to lots of regexp backtracking); so split any individual line

895

# over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML

896

# totally -- but IMHO the only reason a luser would use

897

# MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.

898

while (length ($_) > MAX_BODY_LINE_LENGTH) {

899

push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");

900

substr($_, 0, MAX_BODY_LINE_LENGTH) = '';

901

}

902

push ( @{$part_array}, $_ );

903

}

904

905

# Look for a message epilogue

906

# originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB

907

# ham FPs were all "." on a line by itself.

908

# spams seem to only have NULL chars afterwards ?

909

if ($line_count) {

910

for(; $line_count > 0; $line_count--) {

911

if ($body->[-$line_count] =~ /[^\s.]/) {

912

$self->{mime_epilogue_exists} = 1;

913

last;

914

}

915

}

916

}

917

918

}

919

920

=item _parse_normal()

921

922

Generate a leaf node and add it to the parent.

923

924

=cut

925

926

sub _parse_normal {

927

my($self, $toparse) = @_;

928

929

my ($msg, $boundary, $body) = @{$toparse};

930

931

dbg("message: parsing normal part");

932

933

# 0: content-type, 1: boundary, 2: charset, 3: filename

934

my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));

935

936

# multipart sections are required to have a boundary set ... If this

937

# one doesn't, assume it's malformed and revert to text/plain

938

$msg->{'type'} = ($ct[0] !~ m@^multipart/@i || defined $boundary ) ? $ct[0] : 'text/plain';

939

$msg->{'charset'} = $ct[2];

940

941

# attempt to figure out a name for this attachment if there is one ...

942

my $disp = $msg->header('content-disposition') || '';

943

if ($disp =~ /name="?([^\";]+)"?/i) {

944

$msg->{'name'} = $1;

945

}

946

elsif ($ct[3]) {

947

$msg->{'name'} = $ct[3];

948

}

949

950

$msg->{'boundary'} = $boundary;

951

952

# If the part type is not one that we're likely to want to use, go

953

# ahead and write the part data out to a temp file -- why keep sucking

954

# up RAM with something we're not going to use?

955

956

if ($msg->{'type'} !~ m@^(?:text/(?:plain|html)$|message\b)@) {

957

my $filepath;

958

($filepath, $msg->{'raw'}) = Mail::SpamAssassin::Util::secure_tmpfile();

959

960

if ($filepath) {

961

# The temp file was created, add it to the list of pending deletions

962

# we cannot just delete immediately in the POSIX idiom, as this is

963

# unportable (to win32 at least)

964

push @{$self->{tmpfiles}}, $filepath;

965

$msg->{'raw'}->print(@{$body}) or die "error writing to $filepath: $!";

966

}

967

}

968

969

# if the part didn't get a temp file, go ahead and store the data in memory

970

if (!exists $msg->{'raw'}) {

971

$msg->{'raw'} = $body;

972

}

973

}

974

975

# ---------------------------------------------------------------------------

976

977

sub get_rendered_body_text_array {

978

my ($self) = @_;

979

980

if (exists $self->{text_rendered}) { return $self->{text_rendered}; }

981

982

$self->{text_rendered} = [];

983

984

# Find all parts which are leaves

985

my @parts = $self->find_parts(qr/./,1);

986

return $self->{text_rendered} unless @parts;

987

988

# the html metadata may have already been set, so let's not bother if it's

989

# already been done.

990

my $html_needs_setting = !exists $self->{metadata}->{html};

991

992

# Go through each part

993

my $text = $self->get_header ('subject') || "\n";

994

for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {

995

my $p = $parts[$pt];

996

997

# put a blank line between parts ...

998

$text .= "\n";

999

1000

my($type, $rnd) = $p->rendered(); # decode this part

1001

if ( defined $rnd ) {

1002

# Only text/* types are rendered ...

1003

$text .= $rnd;

1004

1005

# TVD - if there are multiple parts, what should we do?

1006

# right now, just use the last one. we may need to give some priority

1007

# at some point, ie: use text/html rendered if it exists, or

1008

# text/plain rendered as html otherwise.

1009

if ($html_needs_setting && $type eq 'text/html') {

1010

$self->{metadata}->{html} = $p->{html_results};

1011

}

1012

}

1013

}

1014

1015

# whitespace handling (warning: small changes have large effects!)

1016

$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed

1017

$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space

1018

$text =~ tr/\f/\n/; # form feeds => newline

1019

1020

# warn "message: $text";

1021

1022

my @textary = split_into_array_of_short_lines ($text);

1023

$self->{text_rendered} = \@textary;

1024

1025

return $self->{text_rendered};

1026

}

1027

1028

# ---------------------------------------------------------------------------

1029

1030

# TODO: possibly this should just replace get_rendered_body_text_array().

1031

# (although watch out, this one doesn't copy {html} to metadata)

1032

sub get_visible_rendered_body_text_array {

1033

my ($self) = @_;

1034

1035

if (exists $self->{text_visible_rendered}) {

1036

return $self->{text_visible_rendered};

1037

}

1038

1039

$self->{text_visible_rendered} = [];

1040

1041

# Find all parts which are leaves

1042

my @parts = $self->find_parts(qr/./,1);

1043

return $self->{text_visible_rendered} unless @parts;

1044

1045

# the html metadata may have already been set, so let's not bother if it's

1046

# already been done.

1047

my $html_needs_setting = !exists $self->{metadata}->{html};

1048

1049

# Go through each part

1050

my $text = $self->get_header ('subject') || "\n";

1051

for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {

1052

my $p = $parts[$pt];

1053

1054

# put a blank line between parts ...

1055

$text .= "\n";

1056

1057

my($type, $rnd) = $p->visible_rendered(); # decode this part

1058

if ( defined $rnd ) {

1059

# Only text/* types are rendered ...

1060

$text .= $rnd;

1061

1062

# TVD - if there are multiple parts, what should we do?

1063

# right now, just use the last one. we may need to give some priority

1064

# at some point, ie: use text/html rendered if it exists, or

1065

# text/plain rendered as html otherwise.

1066

if ($html_needs_setting && $type eq 'text/html') {

1067

$self->{metadata}->{html} = $p->{html_results};

1068

}

1069

}

1070

}

1071

1072

# whitespace handling (warning: small changes have large effects!)

1073

$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed

1074

$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space

1075

$text =~ tr/\f/\n/; # form feeds => newline

1076

1077

my @textary = split_into_array_of_short_lines ($text);

1078

$self->{text_visible_rendered} = \@textary;

1079

1080

return $self->{text_visible_rendered};

1081

}

1082

1083

sub get_invisible_rendered_body_text_array {

1084

my ($self) = @_;

1085

1086

if (exists $self->{text_invisible_rendered}) {

1087

return $self->{text_invisible_rendered};

1088

}

1089

1090

$self->{text_invisible_rendered} = [];

1091

1092

# Find all parts which are leaves

1093

my @parts = $self->find_parts(qr/./,1);

1094

return $self->{text_invisible_rendered} unless @parts;

1095

1096

# the html metadata may have already been set, so let's not bother if it's

1097

# already been done.

1098

my $html_needs_setting = !exists $self->{metadata}->{html};

1099

1100

# Go through each part

1101

my $text = '';

1102

for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {

1103

my $p = $parts[$pt];

1104

1105

# put a blank line between parts ...

1106

$text .= "\n" if ( $text );

1107

1108

my($type, $rnd) = $p->invisible_rendered(); # decode this part

1109

if ( defined $rnd ) {

1110

# Only text/* types are rendered ...

1111

$text .= $rnd;

1112

1113

# TVD - if there are multiple parts, what should we do?

1114

# right now, just use the last one. we may need to give some priority

1115

# at some point, ie: use text/html rendered if it exists, or

1116

# text/plain rendered as html otherwise.

1117

if ($html_needs_setting && $type eq 'text/html') {

1118

$self->{metadata}->{html} = $p->{html_results};

1119

}

1120

}

1121

}

1122

1123

# whitespace handling (warning: small changes have large effects!)

1124

$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed

1125

$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space

1126

$text =~ tr/\f/\n/; # form feeds => newline

1127

1128

my @textary = split_into_array_of_short_lines ($text);

1129

$self->{text_invisible_rendered} = \@textary;

1130

1131

return $self->{text_invisible_rendered};

1132

}

1133

1134

# ---------------------------------------------------------------------------

1135

1136

sub get_decoded_body_text_array {

1137

my ($self) = @_;

1138

1139

if (defined $self->{text_decoded}) { return $self->{text_decoded}; }

1140

$self->{text_decoded} = [ ];

1141

1142

# Find all parts which are leaves

1143

my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);

1144

return $self->{text_decoded} unless @parts;

1145

1146

# Go through each part

1147

for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {

1148

# bug 4843: skip text/calendar parts since they're usually an attachment

1149

# and not displayed

1150

next if ($parts[$pt]->{'type'} eq 'text/calendar');

1151

1152

push(@{$self->{text_decoded}}, "\n") if ( @{$self->{text_decoded}} );

1153

push(@{$self->{text_decoded}},

1154

split_into_array_of_short_paragraphs($parts[$pt]->decode()));

1155

}

1156

1157

return $self->{text_decoded};

1158

}

1159

1160

# ---------------------------------------------------------------------------

1161

1162

sub split_into_array_of_short_lines {

1163

my @result;

1164

foreach my $line (split (/^/m, $_[0])) {

1165

while (length ($line) > MAX_BODY_LINE_LENGTH) {

1166

# try splitting "nicely" so that we don't chop an url in half or

1167

# something. if there's no space, then just split at max length.

1168

my $length = rindex($line, ' ', MAX_BODY_LINE_LENGTH) + 1;

1169

$length ||= MAX_BODY_LINE_LENGTH;

1170

push (@result, substr($line, 0, $length, ''));

1171

}

1172

push (@result, $line);

1173

}

1174

@result;

1175

}

1176

1177

# ---------------------------------------------------------------------------

1178

1179

# split a text into array of paragraphs of sizes between

1180

# $chunk_size and 2 * $chunk_size, returning the resulting array

1181

1182

sub split_into_array_of_short_paragraphs {

1183

my @result;

1184

my $chunk_size = 1024;

1185

my $text_l = length($_[0]);

1186

my($j,$ofs);

1187

for ($ofs = 0; $text_l - $ofs > 2 * $chunk_size; $ofs = $j+1) {

1188

$j = index($_[0], "\n", $ofs+$chunk_size);

1189

if ($j < 0) {

1190

$j = index($_[0], " ", $ofs+$chunk_size);

1191

if ($j < 0) { $j = $ofs+$chunk_size }

1192

}

1193

push(@result, substr($_[0], $ofs, $j-$ofs+1));

1194

}

1195

push(@result, substr($_[0], $ofs)) if $ofs < $text_l;

1196

@result;

1197

}

1198

1199

# ---------------------------------------------------------------------------

1200

1201

1202

1203

=back

1204

1205

=cut

Older »