~ubuntu-branches/ubuntu/utopic/spamassassin/utopic-updates

« back to all changes in this revision

Viewing changes to .pc/98_sa-compile-quiet/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Committer: Package Import Robot
Author(s): Noah Meyerhans
Date: 2014-02-14 22:45:15 UTC
mfrom: (0.8.1) (0.6.2) (5.1.22 sid)
Revision ID: package-import@ubuntu.com-20140214224515-z1es2twos8xh7n2y

Tags: 3.4.0-1

http://bugs.debian.org/738963

http://bugs.debian.org/738872

http://bugs.debian.org/738867

http://bugs.debian.org/738951

http://bugs.debian.org/738974

* New upstream version! (Closes: 738963, 738872, 738867)
* Scrub the environment when switching to the debian-spamd user in
  postinst and cron.daily. (Closes: 738951)
* Enhancements to postinst to better manage ownership of
  /var/lib/spamassassin, via Iain Lane <iain.lane@canonical.com>
  (Closes: 738974)

files added:
.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Plugin

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Plugin/DCC.pm

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Plugin/DNSEval.pm

.pc/98_sa-compile-quiet

.pc/98_sa-compile-quiet/lib

.pc/98_sa-compile-quiet/lib/Mail

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin/Plugin

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

META.json

debian/patches/98_sa-compile-quiet

lib/Mail/SpamAssassin/BayesStore/Redis.pm

lib/Mail/SpamAssassin/Plugin/AskDNS.pm

lib/Mail/SpamAssassin/Util/TinyRedis.pm

pkgrules/10_hasbase.cf

pkgrules/20_mailspike.cf

rules/v340.pre

t/autolearn.t

t/autolearn_force.t

t/autolearn_force_fail.t

t/basic_lint_without_sandbox.t

t/data/spam/badmime3.txt

t/dnsbl_subtests.t

files removed:
.pc/50_sa-learn_fix_empty_list_handling

.pc/50_sa-learn_fix_empty_list_handling/sa-learn.raw

.pc/60_bug_684709

.pc/60_bug_684709/lib

.pc/60_bug_684709/lib/Mail

.pc/60_bug_684709/lib/Mail/SpamAssassin

.pc/60_bug_684709/lib/Mail/SpamAssassin/Message.pm

.pc/85_disable_SSLv2

.pc/85_disable_SSLv2/spamc

.pc/85_disable_SSLv2/spamc/libspamc.c

.pc/85_disable_SSLv2/spamc/libspamc.h

.pc/85_disable_SSLv2/spamc/spamc.c

.pc/85_disable_SSLv2/spamc/spamc.pod

.pc/85_disable_SSLv2/spamd

.pc/85_disable_SSLv2/spamd/spamd.raw

.pc/90_missing_tld

.pc/90_missing_tld/lib

.pc/90_missing_tld/lib/Mail

.pc/90_missing_tld/lib/Mail/SpamAssassin

.pc/90_missing_tld/lib/Mail/SpamAssassin/Util

.pc/90_missing_tld/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm

.pc/90_missing_tld/t

.pc/90_missing_tld/t/uri_text.t

.pc/91_no_rfc_ignorant

.pc/91_no_rfc_ignorant/pkgrules

.pc/91_no_rfc_ignorant/pkgrules/20_dnsbl_tests.cf

.pc/91_no_rfc_ignorant/pkgrules/30_text_de.cf

.pc/91_no_rfc_ignorant/pkgrules/50_scores.cf

.pc/91_no_rfc_ignorant/pkgrules/local.cf

.pc/91_no_rfc_ignorant/rules

.pc/91_no_rfc_ignorant/rules/STATISTICS-set1.txt

.pc/91_no_rfc_ignorant/rules/STATISTICS-set3.txt

.pc/91_no_rfc_ignorant/rules/active.list

.pc/95_bug694504-spamdforkscaling-crash

.pc/95_bug694504-spamdforkscaling-crash/lib

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail/SpamAssassin

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail/SpamAssassin/Logger

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail/SpamAssassin/Logger/Syslog.pm

.pc/95_bug694504-spamdforkscaling-crash/spamd

.pc/95_bug694504-spamdforkscaling-crash/spamd/spamd.raw

.pc/96_disable_njabl

.pc/96_disable_njabl/pkgrules

.pc/96_disable_njabl/pkgrules/20_dnsbl_tests.cf

.pc/96_disable_njabl/pkgrules/30_text_de.cf

.pc/96_disable_njabl/pkgrules/30_text_fr.cf

.pc/96_disable_njabl/pkgrules/30_text_nl.cf

.pc/96_disable_njabl/pkgrules/30_text_pl.cf

.pc/96_disable_njabl/pkgrules/50_scores.cf

.pc/98_bug721565-syntax-5.18

.pc/98_bug721565-syntax-5.18/lib

.pc/98_bug721565-syntax-5.18/lib/Mail

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/AsyncLoop.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Conf

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Conf/Parser.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/DnsResolver.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Message.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/PerMsgStatus.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Util.pm

debian/patches/50_sa-learn_fix_empty_list_handling

debian/patches/60_bug_684709

debian/patches/85_disable_SSLv2

debian/patches/90_missing_tld

debian/patches/91_no_rfc_ignorant

debian/patches/95_bug694504-spamdforkscaling-crash

debian/patches/96_disable_njabl

debian/patches/98_bug721565-syntax-5.18

rules/STATISTICS-set0.txt

rules/STATISTICS-set1.txt

rules/STATISTICS-set2.txt

rules/STATISTICS-set3.txt

t/dkim2.t

t/spamd_ssl_v2.t

t/spamd_ssl_v23.t

files modified:
.pc/10_change_config_paths/INSTALL

.pc/10_change_config_paths/README

.pc/10_change_config_paths/UPGRADE

.pc/10_change_config_paths/lib/Mail/SpamAssassin/Conf.pm

.pc/10_change_config_paths/lib/Mail/SpamAssassin/Plugin/Test.pm

.pc/10_change_config_paths/lib/spamassassin-run.pod

.pc/10_change_config_paths/sa-compile.raw

.pc/10_change_config_paths/sa-learn.raw

.pc/10_change_config_paths/spamc/spamc.pod

.pc/10_change_config_paths/spamd/spamd.raw

.pc/10_change_config_paths/sql/README

.pc/20_edit_spamc_pod/spamc/spamc.pod

.pc/30_edit_README/README

.pc/55_disable_nagios_epm/sa-check_spamd.raw

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Conf.pm

.pc/97_bug720499-pod-5.18/sa-check_spamd.raw

.pc/applied-patches

CREDITS

Changes

INSTALL

MANIFEST

MANIFEST.SKIP

META.yml

Makefile.PL

README

UPGRADE

build/mkrules

build/parse-rules-for-masses

debian/changelog

debian/control

debian/patches/10_change_config_paths

debian/patches/20_edit_spamc_pod

debian/patches/90_pod_cleanup

debian/patches/series

debian/sa-compile.postinst

debian/spamassassin.cron.daily

debian/spamassassin.postinst

debian/tests/daemon *

debian/tests/get_spam_points.py *

lib/Mail/SpamAssassin.pm

lib/Mail/SpamAssassin/AICache.pm

lib/Mail/SpamAssassin/ArchiveIterator.pm

lib/Mail/SpamAssassin/AsyncLoop.pm

lib/Mail/SpamAssassin/AutoWhitelist.pm

lib/Mail/SpamAssassin/Bayes/CombineChi.pm

lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm

lib/Mail/SpamAssassin/BayesStore.pm

lib/Mail/SpamAssassin/BayesStore/BDB.pm

lib/Mail/SpamAssassin/BayesStore/DBM.pm

lib/Mail/SpamAssassin/BayesStore/MySQL.pm

lib/Mail/SpamAssassin/BayesStore/PgSQL.pm

lib/Mail/SpamAssassin/BayesStore/SQL.pm

lib/Mail/SpamAssassin/Client.pm

lib/Mail/SpamAssassin/Conf.pm

lib/Mail/SpamAssassin/Conf/LDAP.pm

lib/Mail/SpamAssassin/Conf/Parser.pm

lib/Mail/SpamAssassin/Conf/SQL.pm

lib/Mail/SpamAssassin/Constants.pm

lib/Mail/SpamAssassin/Dns.pm

lib/Mail/SpamAssassin/DnsResolver.pm

lib/Mail/SpamAssassin/HTML.pm

lib/Mail/SpamAssassin/Locales.pm

lib/Mail/SpamAssassin/Logger.pm

lib/Mail/SpamAssassin/Logger/File.pm

lib/Mail/SpamAssassin/Logger/Syslog.pm

lib/Mail/SpamAssassin/MailingList.pm

lib/Mail/SpamAssassin/Message.pm

lib/Mail/SpamAssassin/Message/Metadata.pm

lib/Mail/SpamAssassin/Message/Metadata/Received.pm

lib/Mail/SpamAssassin/Message/Node.pm

lib/Mail/SpamAssassin/NetSet.pm

lib/Mail/SpamAssassin/PerMsgStatus.pm

lib/Mail/SpamAssassin/Plugin.pm

lib/Mail/SpamAssassin/Plugin/ASN.pm

lib/Mail/SpamAssassin/Plugin/AWL.pm

lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm

lib/Mail/SpamAssassin/Plugin/Bayes.pm

lib/Mail/SpamAssassin/Plugin/BodyEval.pm

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

lib/Mail/SpamAssassin/Plugin/Check.pm

lib/Mail/SpamAssassin/Plugin/DCC.pm

lib/Mail/SpamAssassin/Plugin/DKIM.pm

lib/Mail/SpamAssassin/Plugin/DNSEval.pm

lib/Mail/SpamAssassin/Plugin/FreeMail.pm

lib/Mail/SpamAssassin/Plugin/Hashcash.pm

lib/Mail/SpamAssassin/Plugin/HeaderEval.pm

lib/Mail/SpamAssassin/Plugin/ImageInfo.pm

lib/Mail/SpamAssassin/Plugin/MIMEEval.pm

lib/Mail/SpamAssassin/Plugin/PhishTag.pm

lib/Mail/SpamAssassin/Plugin/Pyzor.pm

lib/Mail/SpamAssassin/Plugin/Razor2.pm

lib/Mail/SpamAssassin/Plugin/RelayCountry.pm

lib/Mail/SpamAssassin/Plugin/Reuse.pm

lib/Mail/SpamAssassin/Plugin/SPF.pm

lib/Mail/SpamAssassin/Plugin/Test.pm

lib/Mail/SpamAssassin/Plugin/TextCat.pm

lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

lib/Mail/SpamAssassin/Plugin/URIEval.pm

lib/Mail/SpamAssassin/Plugin/VBounce.pm

lib/Mail/SpamAssassin/Plugin/WLBLEval.pm

lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm

lib/Mail/SpamAssassin/PluginHandler.pm

lib/Mail/SpamAssassin/Reporter.pm

lib/Mail/SpamAssassin/SQLBasedAddrList.pm

lib/Mail/SpamAssassin/SpamdForkScaling.pm

lib/Mail/SpamAssassin/Timeout.pm

lib/Mail/SpamAssassin/Util.pm

lib/Mail/SpamAssassin/Util/DependencyInfo.pm

lib/Mail/SpamAssassin/Util/Progress.pm

lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm

lib/Mail/SpamAssassin/Util/ScopedTimer.pm

lib/spamassassin-run.pod

pkgrules/10_default_prefs.cf

pkgrules/20_aux_tlds.cf

pkgrules/20_dnsbl_tests.cf

pkgrules/20_drugs.cf

pkgrules/20_dynrdns.cf

pkgrules/20_fake_helo_tests.cf

pkgrules/20_freemail.cf

pkgrules/20_freemail_domains.cf

pkgrules/20_head_tests.cf

pkgrules/20_html_tests.cf

pkgrules/20_phrases.cf

pkgrules/20_ratware.cf

pkgrules/20_uri_tests.cf

pkgrules/20_vbounce.cf

pkgrules/25_dcc.cf

pkgrules/25_replace.cf

pkgrules/25_spf.cf

pkgrules/25_uribl.cf

pkgrules/30_text_de.cf

pkgrules/30_text_fr.cf

pkgrules/30_text_nl.cf

pkgrules/30_text_pl.cf

pkgrules/30_text_pt_br.cf

pkgrules/50_scores.cf

pkgrules/60_adsp_override_dkim.cf

pkgrules/60_whitelist_dkim.cf

pkgrules/72_active.cf

pkgrules/72_scores.cf

pkgrules/73_sandbox_manual_scores.cf

pkgrules/STATISTICS-set0-72_scores.cf.txt

pkgrules/STATISTICS-set1-72_scores.cf.txt

pkgrules/local.cf

pkgrules/regression_tests.cf

rules/active.list

rules/init.pre

rules/local.cf

rules/regression_tests.cf

sa-awl.raw

sa-check_spamd.raw

sa-compile.raw

sa-learn.raw

sa-update.raw

spamassassin.raw

spamc/libspamc.c

spamc/libspamc.h

spamc/spamc.c

spamc/spamc.pod

spamc/utils.c

spamc/utils.h

spamd/netbsd-rc-script.sh

spamd/spamd.raw

sql/README

sql/README.bayes

sql/awl_mysql.sql

sql/bayes_mysql.sql

sql/userpref_pg.sql

t/SATest.pm

t/bayesbdb.t

t/bayesdbm.t

t/bayesdbm_flock.t

t/bayessdbm.t

t/bayessdbm_seen_delete.t

t/bayessql.t

t/config.dist

t/config_errs.t

t/cross_user_config_leak.t

t/data/01_test_rules.cf

t/dcc.t

t/debug.t

t/dkim.t

t/duplicates.t

t/html_colors.t

t/if_can.t

t/lang_pl_tests.t

t/mimeparse.t

t/mkrules_else.t *

t/priorities.t

t/rcvd_parser.t

t/re_base_extraction.t

t/rule_multiple.t

t/rule_names.t

t/sa_check_spamd.t

t/sa_compile.t

t/spamd_hup.t

t/spamd_maxsize.t

t/spamd_prefork_stress_4.t

t/spamd_protocol_10.t

t/spamd_sql_prefs.t

t/spamd_unix_and_tcp.t

t/spf.t

t/trust_path.t

t/uri_text.t

t/uribl.t

t/uribl_all_types.t

t/uribl_domains_only.t

t/uribl_ips_only.t

t/whitelist_addrs.t

Show diffs side-by-side

added added

removed removed

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

# <@LICENSE>

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset

=head1 SYNOPSIS

This is a plugin to extract "base" strings from SpamAssassin 'body' rules,

suitable for use in Rule2XSBody rules or other parallel matching algorithms.

=cut

package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;

use Mail::SpamAssassin::Plugin;

use Mail::SpamAssassin::Logger;

use Mail::SpamAssassin::Util qw(untaint_var);

use Mail::SpamAssassin::Util::Progress;

use Errno qw(ENOENT EACCES EEXIST);

use Data::Dumper;

use strict;

use warnings;

use bytes;

use re 'taint';

use vars qw(@ISA);

@ISA = qw(Mail::SpamAssassin::Plugin);

use constant DEBUG_RE_PARSING => 0; # noisy!

# a few settings that control what kind of bases are output.

# treat all rules as lowercase for purposes of term extraction?

# $main->{bases_must_be_casei} = 1;

# $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/

# $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/

# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/

# $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]

# $main->{base_quiet} = 0; # silences progress output

# TODO: it would be nice to have a clean API to pass such settings

# through to plugins instead of hanging them off $main

##############################################################################

# testing purposes only

my $fixup_re_test;

#$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die;

#$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;

#$fixup_re_test = 1; fixup_re("\\33\$b"); die;

#$fixup_re_test = 1; fixup_re("[link]"); die;

#$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;

###########################################################################

sub new {

my $class = shift;

my $mailsaobject = shift;

$class = ref($class) || $class;

my $self = $class->SUPER::new($mailsaobject);

bless ($self, $class);

$self->{show_progress} = !$mailsaobject->{base_quiet};

# $self->test(); exit;

return $self;

}

###########################################################################

sub finish_parsing_end {

my ($self, $params) = @_;

my $conf = $params->{conf};

$self->extract_bases($conf);

}

sub extract_bases {

my ($self, $conf) = @_;

my $main = $conf->{main};

if (!$main->{base_extract}) { return; }

100

$self->{show_progress} and

101

info("base extraction starting. this can take a while...");

102

103

$self->extract_set($conf, $conf->{body_tests}, 'body');

104

}

105

106

sub extract_set {

107

my ($self, $conf, $test_set, $ruletype) = @_;

108

109

foreach my $pri (keys %{$test_set}) {

110

my $nicepri = $pri; $nicepri =~ s/-/neg/g;

111

$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);

112

}

113

}

114

115

###########################################################################

116

117

sub extract_set_pri {

118

my ($self, $conf, $rules, $ruletype) = @_;

119

120

my @good_bases;

121

my @failed;

122

my $yes = 0;

123

my $no = 0;

124

my $count = 0;

125

my $start = time;

126

$self->{main} = $conf->{main}; # for use in extract_hints()

127

$self->{show_progress} and info ("extracting from rules of type $ruletype");

128

my $tflags = $conf->{tflags};

129

130

# attempt to find good "base strings" (simplified regexp subsets) for each

131

# regexp. We try looking at the regexp from both ends, since there

132

# may be a good long string of text at the end of the rule.

133

134

# require this many chars in a base string + delimiters for it to be viable

135

my $min_chars = 5;

136

137

my $progress;

138

$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({

139

total => (scalar keys %{$rules} || 1),

140

itemtype => 'rules',

141

});

142

143

my $cached = { };

144

my $cachefile;

145

146

if ($self->{main}->{bases_cache_dir}) {

147

$cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";

148

dbg("zoom: reading cache file $cachefile");

149

$cached = $self->read_cachefile($cachefile);

150

}

151

152

NEXT_RULE:

153

foreach my $name (keys %{$rules}) {

154

$self->{show_progress} and $progress and $progress->update(++$count);

155

156

my $rule = $rules->{$name};

157

my $cachekey = join "#", $name, $rule;

158

159

my $cent = $cached->{rule_bases}->{$cachekey};

160

if (defined $cent) {

161

if (defined $cent->{g}) {

162

dbg("zoom: YES (cached) $rule $name");

163

foreach my $ent (@{$cent->{g}}) {

164

# note: we have to copy these, since otherwise later

165

# modifications corrupt the cached data

166

push @good_bases, {

167

base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}

168

};

169

}

170

$yes++;

171

}

172

else {

173

dbg("zoom: NO (cached) $rule $name");

174

push @failed, { orig => $rule }; # no need to cache this

175

$no++;

176

}

177

next NEXT_RULE;

178

}

179

180

# ignore ReplaceTags rules

181

my $is_a_replacetags_rule = $conf->{rules_to_replace}->{$name};

182

my ($minlen, $lossy, @bases);

183

184

if (!$is_a_replacetags_rule) {

185

eval { # catch die()s

186

my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);

187

($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);

188

# dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));

189

190

} or do {

191

my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;

192

dbg("zoom: giving up on regexp: $eval_stat");

193

};

194

195

if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) {

196

warn "\nzoom: rule $name will loop on SpamAssassin older than 3.3.2 ".

197

"running under Perl 5.12 or older, Bug 6558\n";

198

}

199

200

# if any of the extracted hints in a set are too short, the entire

201

# set is invalid; this is because each set of N hints represents just

202

# 1 regexp.

203

foreach my $str (@bases) {

204

my $len = length fixup_re($str); # bug 6143: count decoded characters

205

if ($len < $min_chars) { $minlen = undef; @bases = (); last; }

206

elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }

207

}

208

}

209

210

if ($is_a_replacetags_rule || !$minlen || !@bases) {

211

dbg("zoom: ignoring rule %s, %s", $name,

212

$is_a_replacetags_rule ? 'is a replace rule'

213

: !@bases ? 'no bases' : 'no minlen');

214

push @failed, { orig => $rule };

215

$cached->{rule_bases}->{$cachekey} = { };

216

$no++;

217

}

218

else {

219

# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");

220

221

# figure out if we have e.g. ["foo", "foob", "foobar"]; in this

222

# case, we only need to track ["foo"].

223

my %subsumed;

224

foreach my $base1 (@bases) {

225

foreach my $base2 (@bases) {

226

if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {

227

$subsumed{$base1} = 1; # base2 is inside base1; discard the longer

228

}

229

}

230

}

231

232

my @forcache;

233

foreach my $base (@bases) {

234

next if $subsumed{$base};

235

push @good_bases, {

236

base => $base, orig => $rule, name => "$name,[l=$lossy]"

237

};

238

# *separate* copies for cache -- we modify the @good_bases entry

239

push @forcache, {

240

base => $base, orig => $rule, name => "$name,[l=$lossy]"

241

};

242

}

243

244

$cached->{rule_bases}->{$cachekey} = { g => \@forcache };

245

$yes++;

246

}

247

}

248

249

$self->{show_progress} and $progress and $progress->final();

250

251

dbg("zoom: $ruletype: found ".(scalar @good_bases).

252

" usable base strings in $yes rules, skipped $no rules");

253

254

# NOTE: re2c will attempt to provide the longest pattern that matched; e.g.

255

# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern

256

# subsumes other patterns, we need to return hits for all of them. We also

257

# need to take care of the case where multiple regexps wind up sharing the

258

# same base.

259

260

# Another gotcha, an exception to the subsumption rule; if one pattern isn't

261

# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be

262

# returned as two hits, correctly. So we only have to be smart about the

263

# full-subsumption case; overlapping is taken care of for us, by re2c.

264

265

# TODO: there's a bug here. Since the code in extract_hints() has been

266

# modified to support more complex regexps, we can no longer simply assume

267

# that if pattern A is not contained in pattern B, that means that pattern B

268

# doesn't subsume it. Consider, for example, A="foo*bar" and

269

# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test

270

# that without running the A RE match itself somehow against B.

271

# same issue remains with:

272

273

# "foo?bar" / "fobar"

274

# "fo(?:o|oo|)bar" / "fobar"

275

# "fo(?:o|oo)?bar" / "fobar"

276

# "fo(?:o*|baz)bar" / "fobar"

277

# "(?:fo(?:o*|baz)bar|blargh)" / "fobar"

278

279

# it's worse with this:

280

281

# "fo(?:o|oo|)bar" / "foo*bar"

282

283

# basically, this is impossible to compute without reimplementing most of

284

# re2c, and it appears the re2c developers don't plan to offer this:

285

# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203

286

287

$conf->{base_orig}->{$ruletype} = { };

288

$conf->{base_string}->{$ruletype} = { };

289

290

$count = 0;

291

$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({

292

total => (scalar @good_bases || 1),

293

itemtype => 'bases',

294

});

295

296

# this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases

297

# array -- into a more efficient format, using arrays and with a little

298

# bit of precomputation, to go (quite a bit) faster

299

300

my @rewritten;

301

foreach my $set1 (@good_bases) {

302

my $base = $set1->{base};

303

next if (!$base || !$set1->{name});

304

push @rewritten, [

305

$base, # 0

306

$set1->{name}, # 1

307

$set1->{orig}, # 2

308

length $base, # 3

309

qr/\Q$base\E/, # 4

310

0 # 5, has_multiple flag

311

];

312

}

313

@good_bases = @rewritten;

314

315

foreach my $set1 (@good_bases) {

316

$self->{show_progress} and $progress and $progress->update(++$count);

317

318

my $base1 = $set1->[0]; next unless $base1;

319

my $name1 = $set1->[1];

320

my $orig1 = $set1->[2];

321

$conf->{base_orig}->{$ruletype}->{$name1} = $orig1;

322

my $len1 = $set1->[3];

323

324

foreach my $set2 (@good_bases) {

325

next if ($set1 == $set2);

326

327

my $base2 = $set2->[0]; next unless $base2;

328

my $name2 = $set2->[1];

329

330

# clobber exact dups; this can happen if a regexp outputs the

331

# same base string multiple times

332

if ($base1 eq $base2 &&

333

$name1 eq $name2 &&

334

$orig1 eq $set2->[2])

335

{

336

$set2->[0] = ''; # clobber

337

next;

338

}

339

340

# skip if it's too short to contain the other base string

341

next if ($len1 < $set2->[3]);

342

343

# skip if either already contains the other rule's name

344

# optimize: this can only happen if the base has more than

345

# one rule already attached, ie [5]

346

next if ($set2->[5] && $name2 =~ /(?: |^)\Q$name1\E(?: |$)/);

347

348

# don't use $name1 here, since another base in the set2 loop

349

# may have added $name2 since we set that

350

next if ($set1->[5] && $set1->[1] =~ /(?: |^)\Q$name2\E(?: |$)/);

351

352

# and finally check to see if it *does* contain the other base string

353

next if ($base1 !~ $set2->[4]);

354

355

# base2 is just a subset of base1

356

# dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");

357

$set1->[1] .= " ".$name2;

358

$set1->[5] = 1;

359

}

360

}

361

362

# we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS

363

# both contain "killed" for example, pointing at different rules, which

364

# the above search hasn't found. Collapse them here with a hash

365

my %bases;

366

foreach my $set (@good_bases) {

367

my $base = $set->[0];

368

next unless $base;

369

370

if (defined $bases{$base}) {

371

$bases{$base} .= " ".$set->[1];

372

} else {

373

$bases{$base} = $set->[1];

374

}

375

}

376

undef @good_bases;

377

378

foreach my $base (keys %bases) {

379

# uniq the list, since there are probably dup rules listed

380

my %u;

381

for my $i (split ' ', $bases{$base}) {

382

next if exists $u{$i}; undef $u{$i};

383

}

384

$conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;

385

}

386

$self->{show_progress} and $progress and $progress->final();

387

388

if ($cachefile) {

389

$self->write_cachefile ($cachefile, $cached);

390

}

391

392

my $elapsed = time - $start;

393

$self->{show_progress} and info ("$ruletype: ".

394

(scalar keys %{$conf->{base_string}->{$ruletype}}).

395

" base strings extracted in $elapsed seconds\n");

396

}

397

398

###########################################################################

399

400

# TODO:

401

# NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i

402

# => should extract 'scription' somehow

403

# /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i

404

# => should understand alternations; tricky

405

406

sub simplify_and_qr_regexp {

407

my $self = shift;

408

my $rule = shift;

409

410

my $main = $self->{main};

411

$rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);

412

413

# remove the regexp modifiers, keep for later

414

my $mods = '';

415

while ($rule =~ s/^$\?([a-z]*)$//) { $mods .= $1; }

416

417

# modifier removal

418

while ($rule =~ s/^$\?-([a-z]*)$//) {

419

foreach my $modchar (split '', $mods) {

420

$mods =~ s/$modchar//g;

421

}

422

}

423

424

my $lossy = 0;

425

426

# now: simplify aspects of the regexp. Bear in mind that we can

427

# simplify as long as we cause the regexp to become more general;

428

# more hits is OK, since false positives will be discarded afterwards

429

# anyway. Simplification that causes the regexp to *not* hit

430

# stuff that the "real" rule would hit, however, is a bad thing.

431

432

if ($main->{bases_must_be_casei}) {

433

$rule = lc $rule;

434

435

$lossy = 1;

436

$mods =~ s/i// and $lossy = 0;

437

438

# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/

439

$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and $lossy++;

440

441

# always case-i: /A(?-i:ct)/ => /Act/

442

$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs and $lossy++;

443

444

# remove (?i)

445

$rule =~ s/$\?i$//gs;

446

}

447

else {

448

die "case-i" if $rule =~ /$\?i$/;

449

die "case-i" if $mods =~ /i/;

450

451

# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/

452

$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and die "case-i";

453

454

# we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/

455

$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs;

456

}

457

458

# remove /m and /s modifiers

459

$mods =~ s/m// and $lossy++;

460

$mods =~ s/s// and $lossy++;

461

462

# remove (^|\b)'s

463

# T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is

464

$rule =~ s/$\^\|\\b$//gs and $lossy++;

465

$rule =~ s/$\$\|\\b$//gs and $lossy++;

466

$rule =~ s/$\\b\|\^$//gs and $lossy++;

467

$rule =~ s/$\\b\|\$$//gs and $lossy++;

468

469

# remove (?!credit)

470

$rule =~ s/$\?\![^$]+\)//gs and $lossy++;

471

472

# remove \b's

473

$rule =~ s/(?<!\\)\\b//gs and $lossy++;

474

475

# remove the "?=" trick

476

477

$rule =~ s/$\?\=\[[^\]]+\]$//gs;

478

479

$mods .= "L" if $lossy;

480

($rule, $mods);

481

}

482

483

sub extract_hints {

484

my $self = shift;

485

my $rawrule = shift;

486

my $rule = shift;

487

my $mods = shift;

488

489

my $main = $self->{main};

490

my $orig = $rule;

491

492

my $lossy = 0;

493

$mods =~ s/L// and $lossy++;

494

495

# if there are anchors, give up; we can't get much

496

# faster than these anyway

497

die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;

498

499

# die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;

500

# just remove end-of-string anchors; they're slow so could gain

501

# from our speedup

502

$rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;

503

504

# simplify (?:..) to (..)

505

$main->{bases_allow_noncapture_groups} or

506

$rule =~ s/\(\?:/\(/g;

507

508

# simplify some grouping arrangements so they're easier for us to parse

509

# (foo)? => (foo|)

510

$rule =~ s/$(.*?)$\?/$$1\|$/gs;

511

# r? => (r|)

512

$rule =~ s/(?<!\\)(\w)\?/$$1\|$/gs;

513

514

my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();

515

$tmpfh or die "failed to create a temporary file";

516

untaint_var(\$tmpf);

517

518

print $tmpfh "use bytes; m{" . $rule . "}" . $mods

519

or die "error writing to $tmpf: $!";

520

close $tmpfh or die "error closing $tmpf: $!";

521

522

my $perl = $self->get_perl();

523

local *IN;

524

open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |")

525

or die "cannot run $perl: ".exit_status_str($?,$!);

526

527

my($inbuf,$nread,$fullstr); $fullstr = '';

528

while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }

529

defined $nread or die "error reading from pipe: $!";

530

531

close IN or die "error closing pipe: $!";

532

unlink $tmpf or die "cannot unlink $tmpf: $!";

533

defined $fullstr or warn "empty result from a pipe";

534

535

# now parse the -Mre=debug output.

536

# perl 5.10 format

537

$fullstr =~ s/^.*\nFinal program:\n//gs;

538

# perl 5.6/5.8 format

539

$fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;

540

$fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;

541

# common to all

542

$fullstr =~ s/\nOffsets:.*$//gs;

543

544

# clean up every other line that doesn't start with a space

545

$fullstr =~ s/^\S.*$//gm;

546

547

if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {

548

die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";

549

}

550

my $opsstr = $1;

551

552

# what's left looks like this:

553

# 1: EXACTF <v>(3)

554

# 3: ANYOF[1ILil](14)

555

# 14: EXACTF <a>(16)

556

# 16: CURLY {2,7}(29)

557

# 18: ANYOF[A-Za-z](0)

558

# 29: SPACE(30)

559

# 30: EXACTF <http://>(33)

560

# 33: END(0)

561

562

DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";

563

564

my @ops;

565

foreach my $op (split(/\n/s, $opsstr)) {

566

next unless $op;

567

568

if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:$\d+$)?$/) {

569

# perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)

570

# perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)

571

push @ops, [ $1, $2, $3 ];

572

}

573

elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) {

574

# 5: TRIE-EXACT[im](44)

575

# <message contained attachments that have been blocked by guin>...

576

my $spcs = $1;

577

# we could use the entire length here, but it's easier to trim to

578

# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test

579

# suite results will match, since the sa-update --list extraction will

580

# be the same for all versions. (The "..." trailer is important btw)

581

my $str = substr ($2, 0, 55);

582

push @ops, [ $spcs, '_moretrie', "<$str...>" ];

583

}

584

elsif ($op =~ /^ (\s*)(<.*>)\s*(?:$\d+$)?$/) {

585

# 5: TRIE-EXACT[am](21)

586

# <am> (21)

587

# <might> (12)

588

push @ops, [ $1, '_moretrie', $2 ];

589

}

590

elsif ($op =~ /^ at .+ line \d+$/) {

591

next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':

592

}

593

else {

594

warn "cannot parse '$op': $opsstr";

595

next;

596

}

597

}

598

599

# unroll the branches; returns a list of versions.

600

# e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]

601

my @unrolled;

602

if ($main->{bases_split_out_alternations}) {

603

@unrolled = $self->unroll_branches(0, \@ops);

604

} else {

605

@unrolled = ( \@ops );

606

}

607

608

# now find the longest DFA-friendly string in each unrolled version

609

my @longests;

610

foreach my $opsarray (@unrolled) {

611

my $longestexact = '';

612

my $buf = '';

613

614

# use a closure to keep the code succinct

615

my $add_candidate = sub {

616

if (length $buf > length $longestexact) { $longestexact = $buf; }

617

$buf = '';

618

};

619

620

my $prevop;

621

foreach my $op (@{$opsarray}) {

622

my ($spcs, $item, $args) = @{$op};

623

624

next if ($item eq 'NOTHING');

625

626

# EXACT == case-sensitive

627

# EXACTF == case-i

628

# we can do both, since we canonicalize to lc.

629

if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)

630

{

631

my $str = $1;

632

$buf .= $str;

633

if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {

634

# a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop

635

$add_candidate->();

636

}

637

if (length $str >= 55 && $buf =~ s/\.\.\.$//) {

638

# perl 5.8.x truncates with a "..." here! cut and stop

639

$add_candidate->();

640

}

641

}

642

# _moretrie == a TRIE-EXACT entry

643

elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)

644

{

645

$buf .= $1;

646

if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {

647

# perl 5.8.x truncates with a "..." here! cut and stop

648

$add_candidate->();

649

}

650

}

651

# /(?:foo|bar|baz){2}/ results in a CURLYX beforehand

652

elsif ($item =~ /^EXACT/ &&

653

$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&

654

$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&

655

$args =~ /<(.*)>/)

656

{

657

$buf .= $1;

658

if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {

659

# perl 5.8.x truncates with a "..." here! cut and stop

660

$add_candidate->();

661

}

662

}

663

# CURLYX, for perl >= 5.9.5

664

elsif ($item =~ /^_moretrie/ &&

665

$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&

666

$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&

667

$args =~ /<(.*)>/)

668

{

669

$buf .= $1;

670

if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {

671

# perl 5.8.x truncates with a "..." here! cut and stop

672

$add_candidate->();

673

}

674

}

675

else {

676

# not an /^EXACT/; clear the buffer

677

$add_candidate->();

678

if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)

679

{

680

$lossy = 1;

681

DEBUG_RE_PARSING and warn "item $item makes regexp lossy";

682

}

683

}

684

$prevop = $op;

685

}

686

$add_candidate->();

687

688

if (!$longestexact) {

689

die "no long-enough string found in $rawrule";

690

# all unrolled versions must have a long string, otherwise

691

# we cannot reliably match all variants of the rule

692

} else {

693

push @longests, ($main->{bases_must_be_casei}) ?

694

lc $longestexact : $longestexact;

695

}

696

}

697

698

DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";

699

return ($lossy, @longests);

700

}

701

702

###########################################################################

703

704

sub unroll_branches {

705

my ($self, $depth, $opslist) = @_;

706

707

die "too deep" if ($depth++ > 5);

708

709

my @ops = (@{$opslist}); # copy

710

my @pre_branch_ops;

711

my $branch_spcs;

712

my $trie_spcs;

713

my $open_spcs;

714

715

# our input looks something like this 2-level structure:

716

# 1: BOUND(2)

717

# 2: EXACT <Dear >(5)

718

# 5: BRANCH(9)

719

# 6: EXACT <IT>(8)

720

# 8: NALNUM(24)

721

# 9: BRANCH(23)

722

# 10: EXACT <Int>(12)

723

# 12: BRANCH(14)

724

# 13: NOTHING(21)

725

# 14: BRANCH(17)

726

# 15: EXACT <a>(21)

727

# 17: BRANCH(20)

728

# 18: EXACT <er>(21)

729

# 20: TAIL(21)

730

# 21: EXACT <net>(24)

731

# 23: TAIL(24)

732

# 24: EXACT < shop>(27)

733

# 27: END(0)

734

735

# or:

736

737

# 1: OPEN1(3)

738

# 3: BRANCH(6)

739

# 4: EXACT <v>(9)

740

# 6: BRANCH(9)

741

# 7: EXACT <\\/>(9)

742

# 9: CLOSE1(11)

743

# 11: CURLY {2,5}(14)

744

# 13: REG_ANY(0)

745

# 14: EXACT < g r a >(17)

746

# 17: ANYOF[a-z](28)

747

# 28: END(0)

748

749

# or:

750

751

# 1: EXACT <i >(3)

752

# 3: OPEN1(5)

753

# 5: TRIE-EXACT[am](21)

754

# <am> (21)

755

# <might> (12)

756

# 12: OPEN2(14)

757

# 14: TRIE-EXACT[ ](19)

758

# < be>

759

# <>

760

# 19: CLOSE2(21)

761

# 21: CLOSE1(23)

762

# 23: EXACT < c>(25)

763

764

DEBUG_RE_PARSING and warn "starting parse";

765

766

# this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform

767

# it into the latter. bit of a kludge to do this before the loop, but hey.

768

# note that it doesn't fix the CLOSE1/END ordering to be correct

769

if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {

770

my @newops = ([ "", "OPEN1", "" ]);

771

foreach my $op (@ops) {

772

push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];

773

}

774

push @newops, [ "", "CLOSE1", "" ];

775

@ops = @newops;

776

}

777

778

# iterate until we start a branch set. using

779

# /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."

780

# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT

781

# or a BRANCH, *then* unroll the most recent OPEN set.

782

while (1) {

783

my $op = shift @ops;

784

last unless defined $op;

785

786

my ($spcs, $item, $args) = @{$op};

787

DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";

788

789

if ($item =~ /^OPEN/) {

790

$open_spcs = $spcs;

791

next; # next will be a BRANCH or TRIE

792

793

} elsif ($item =~ /^TRIE/) {

794

$trie_spcs = $spcs;

795

last;

796

797

} elsif ($item =~ /^BRANCH/) {

798

$branch_spcs = $spcs;

799

last;

800

801

} elsif ($item =~ /^EXACT/ && defined $open_spcs) {

802

# perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT

803

push @pre_branch_ops, [ $open_spcs, $item, $args ];

804

next;

805

806

} elsif (defined $open_spcs) {

807

# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:

808

# ignore this OPEN block entirely and don't try to unroll it

809

undef $open_spcs;

810

811

} else {

812

push @pre_branch_ops, $op;

813

}

814

}

815

816

# no branches found? we're done unrolling on this one!

817

if (scalar @ops == 0) {

818

return [ @pre_branch_ops ];

819

}

820

821

# otherwise we're at the start of a new branch set

822

# /(foo|bar(baz|argh)boo)gab/

823

my @alts;

824

my @in_this_branch;

825

826

DEBUG_RE_PARSING and warn "entering branch: ".

827

"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".

828

"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".

829

"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";

830

831

# indentation level to remove from "normal" ops (using a s///)

832

my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";

833

my $trie_sub_spcs = "";

834

while (1) {

835

my $op = shift @ops;

836

last unless defined $op;

837

my ($spcs, $item, $args) = @{$op};

838

DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";

839

840

if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt

841

push @alts, [ @pre_branch_ops, @in_this_branch ];

842

@in_this_branch = ();

843

$open_sub_spcs = $branch_spcs." ";

844

$trie_sub_spcs = "";

845

next;

846

}

847

elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end

848

push @alts, [ @pre_branch_ops, @in_this_branch ];

849

undef $branch_spcs;

850

$open_sub_spcs = "";

851

$trie_sub_spcs = "";

852

last;

853

}

854

elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {

855

if (scalar @in_this_branch > 0) {

856

push @alts, [ @pre_branch_ops, @in_this_branch ];

857

}

858

# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)

859

@in_this_branch = ( [ $open_spcs, $item, $args ] );

860

$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";

861

$trie_sub_spcs = " ";

862

next;

863

}

864

elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end

865

push @alts, [ @pre_branch_ops, @in_this_branch ];

866

undef $branch_spcs;

867

undef $open_spcs;

868

undef $trie_spcs;

869

$open_sub_spcs = "";

870

$trie_sub_spcs = "";

871

last;

872

}

873

elsif ($item eq 'END') { # of string

874

push @alts, [ @pre_branch_ops, @in_this_branch ];

875

undef $branch_spcs;

876

undef $open_spcs;

877

undef $trie_spcs;

878

$open_sub_spcs = "";

879

$trie_sub_spcs = "";

880

last;

881

}

882

else {

883

if ($open_sub_spcs) {

884

# deindent the space-level to match the opening brace

885

$spcs =~ s/^$open_sub_spcs//;

886

# tries also add one more indent level in

887

$spcs =~ s/^$trie_sub_spcs//;

888

}

889

push @in_this_branch, [ $spcs, $item, $args ];

890

# note that we ignore ops at a deeper $spcs level entirely (until later!)

891

}

892

}

893

894

if (defined $branch_spcs) {

895

die "fell off end of string with a branch open: '$branch_spcs'";

896

}

897

898

# we're now after the branch set: /gab/

899

# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]

900

foreach my $alt (@alts) {

901

push @{$alt}, @ops; # add all remaining ops to each one

902

# note that this could include more (?:...); we don't care, since

903

# those can be handled by recursing

904

}

905

906

# ok, parsed the entire ops list

907

# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]

908

909

if (DEBUG_RE_PARSING) {

910

print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }

911

}

912

913

# now recurse, to unroll the remaining branches (if any exist)

914

my @rets;

915

foreach my $alt (@alts) {

916

push @rets, $self->unroll_branches($depth, $alt);

917

}

918

919

if (DEBUG_RE_PARSING) {

920

print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }

921

}

922

923

return @rets;

924

}

925

926

###########################################################################

927

928

sub test {

929

my ($self) = @_;

930

931

$self->test_split_alt("foo", "/foo/");

932

$self->test_split_alt("(foo)", "/foo/");

933

$self->test_split_alt("foo(bar)baz", "/foobarbaz/");

934

$self->test_split_alt("x(foo|)", "/xfoo/ /x/");

935

$self->test_split_alt("fo(o|)", "/foo/ /fo/");

936

$self->test_split_alt("(foo|bar)", "/foo/ /bar/");

937

$self->test_split_alt("foo|bar", "/foo/ /bar/");

938

$self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");

939

$self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");

940

$self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");

941

942

}

943

944

sub test_split_alt {

945

my ($self, $in, $out) = @_;

946

947

my @got = $self->split_alt($in);

948

$out =~ s/^\///;

949

$out =~ s/\/$//;

950

my @want = split(/\/ \//, $out);

951

952

my $failed = 0;

953

if (scalar @want != scalar @got) {

954

warn "FAIL: results count don't match";

955

$failed++;

956

}

957

else {

958

my %got = map { $_ => 1 } @got;

959

foreach my $w (@want) {

960

if (!$got{$w}) {

961

warn "FAIL: '$w' not found";

962

$failed++;

963

}

964

}

965

}

966

967

if ($failed) {

968

print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";

969

print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";

970

return 0;

971

} else {

972

print "ok\n" or die "error writing: $!";

973

return 1;

974

}

975

}

976

977

###########################################################################

978

979

sub get_perl {

980

my ($self) = @_;

981

my $perl;

982

983

# allow user override of the perl interpreter to use when

984

# extracting base strings.

985

# TODO: expose this via sa-compile command-line option

986

my $fromconf = $self->{main}->{conf}->{re_parser_perl};

987

988

if ($fromconf) {

989

$perl = $fromconf;

990

} elsif ($^X =~ m|^/|) {

991

$perl = $^X;

992

} else {

993

use Config;

994

$perl = $Config{perlpath};

995

$perl =~ s|/[^/]*$|/$^X|;

996

}

997

untaint_var(\$perl);

998

return $perl;

999

}

1000

1001

###########################################################################

1002

1003

sub read_cachefile {

1004

my ($self, $cachefile) = @_;

1005

local *IN;

1006

if (open(IN, "<".$cachefile)) {

1007

my($inbuf,$nread,$str); $str = '';

1008

while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }

1009

defined $nread or die "error reading from $cachefile: $!";

1010

close IN or die "error closing $cachefile: $!";

1011

1012

untaint_var(\$str);

1013

my $VAR1; # Data::Dumper

1014

if (eval $str) {

1015

return $VAR1; # Data::Dumper's naming

1016

}

1017

}

1018

return { };

1019

}

1020

1021

sub write_cachefile {

1022

my ($self, $cachefile, $cached) = @_;

1023

1024

my $dump = Data::Dumper->new ([ $cached ]);

1025

$dump->Deepcopy(1);

1026

$dump->Purity(1);

1027

$dump->Indent(1);

1028

if (mkdir($self->{main}->{bases_cache_dir})) {

1029

# successfully created

1030

} elsif ($! == EEXIST) {

1031

dbg("zoom: ok, cache directory already existed");

1032

} else {

1033

warn "cannot create a directory: $!";

1034

}

1035

open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";

1036

print CACHE ($dump->Dump, ";1;") or die "error writing: $!";

1037

close CACHE or die "error closing $cachefile: $!";

1038

}

1039

1040

=over 4

1041

1042

=item my ($cleanregexp) = fixup_re($regexp);

1043

1044

Converts encoded characters in a regular expression pattern into their

1045

equivalent characters

1046

1047

=back

1048

1049

=cut

1050

1051

sub fixup_re {

1052

my $re = shift;

1053

1054

if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }

1055

1056

my $output = "";

1057

my $TOK = qr([\"\\]);

1058

1059

my $STATE;

1060

local ($1,$2);

1061

while ($re =~ /\G(.*?)($TOK)/gcs) {

1062

my $pre = $1;

1063

my $tok = $2;

1064

1065

if (length($pre)) {

1066

$output .= "\"$pre\"";

1067

}

1068

1069

if ($tok eq '"') {

1070

$output .= '"\\""';

1071

}

1072

elsif ($tok eq '\\') {

1073

$re =~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or die "\\ at end of string!";

1074

my $esc = $1;

1075

if ($esc eq '"') {

1076

$output .= '"\\""';

1077

} elsif ($esc eq '\\') {

1078

$output .= '"**BACKSLASH**"'; # avoid hairy escape-parsing

1079

} elsif ($esc =~ /^x\{(\S+)\}\z/) {

1080

$output .= '"'.chr(hex($1)).'"';

1081

} elsif ($esc =~ /^[0-7]{1,3}\z/) {

1082

$output .= '"'.chr(oct($esc)).'"';

1083

} else {

1084

$output .= "\"$esc\"";

1085

}

1086

}

1087

else {

1088

print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";

1089

}

1090

}

1091

1092

if (!defined(pos($re))) {

1093

# no matches

1094

$output .= "\"$re\"";

1095

# Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)

1096

$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;

1097

}

1098

elsif (pos($re) <= length($re)) {

1099

$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;

1100

$output .= fixup_re(substr($re, pos($re)));

1101

}

1102

1103

$output =~ s/^""/"/; # protect start and end quotes

1104

$output =~ s/(?<!\\)""\z/"/;

1105

$output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"

1106

$output =~ s/\*\*BACKSLASH\*\*/\\\\/gs;

1107

1108

if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }

1109

return $output;

1110

}

1111

1112

Older »