3
# Copyright 2001,2004 by Stefan Hornburg (Racke) <racke@linuxia.de>
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public
16
# License along with this program; if not, write to the Free
17
# Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
20
$ENV{'PATH'} = '/bin:/usr/bin';
21
# Avoid taint problems when any of these environment variables are defined.
22
# See perlsec(1) manpage
23
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
27
use CGI qw/:standard/;
32
use File::Temp qw(tempfile);
39
use constant DOCBASEDB => '/var/lib/dhelp/doc-base_dirs';
40
use constant DOCSINDEX => '/var/lib/dhelp/documents.index';
41
use constant TEMPLATES_DIR => '/usr/share/dhelp/templates';
42
use constant RESULTS_PER_PAGE => 20;
43
use constant RESULTS_OFFLINE_MODE => 100;
47
return qr/\b/ . join("|", @$words) . qr/\b/;
50
sub highlight_fragment {
51
my ($text, $words) = @_;
53
my $regex = word_regex($words);
54
$text = encode_entities($text);
55
$text =~ s/$regex/<span class="matched-fragment">$&<\/span>/gi;
59
sub get_highlighted_fragments {
60
my ($text, $words) = @_;
62
# Only look for results in the first 100K
63
$text = substr($text, 0, 100000)."..." if length $text > 100000;
65
my $regex = word_regex($words);
66
my $max_fragments = 5;
67
my @fragment_list = ();
68
while ($text =~ /(.{0,31})($regex)(.{0,31})/isg) {
69
my ($pre, $word, $post) = ($1, $2, $3);
70
if (length $pre > 30) {
71
$pre = "...".substr($pre, 1);
73
if (length $post > 30) {
74
$post = substr($post, 0, 29)."...";
76
push @fragment_list, highlight_fragment($pre.$word.$post, $words);
77
last if --$max_fragments < 1;
79
# Just in case: if we don't match anything, just show the first 60
81
if (scalar @fragment_list) {
82
return @fragment_list;
85
return length $text > 100 ? substr($text, 0, 99)."..." : $text;
90
my ($cmd, $file_type, $package) = @_;
98
return "[Cannot show $file_type file preview. Please install package $package]";
105
# Remove any weird character
106
$file =~ s/[^a-z0-9_.\/+-]//gio;
111
my $basename = basename($file);
112
$basename =~ s/\.(\w+)$//go;
116
if ($ext =~ /html?/) {
117
# Try to fetch a text version of the HTML
118
if (-x '/usr/bin/lynx') {
119
$text = `/usr/bin/lynx -dump -nolist "$file"`;
121
elsif (-x '/usr/bin/links') {
122
$text = `/usr/bin/links -dump -no-references "$file"`;
124
elsif (-x '/usr/bin/w3m') {
125
$text = `/usr/bin/w3m -dump "$file"`;
127
elsif (-x '/usr/bin/html2text') {
128
$text = `/usr/bin/html2text -nobs "$file"`;
131
# Fallback to retarded HTML stripper
133
$text = join("", <F>);
135
$text =~ s/<.*?>//go;
138
elsif ($ext =~ /gz/) {
139
my ($fh, $tmp_path) = tempfile("dsearch-XXXXXX",
141
SUFFIX => "$basename");
142
print $fh `gunzip -c "$file"`;
144
$text = get_text($tmp_path);
146
elsif ($ext =~ /pdf/) {
147
if (-x '/usr/bin/pdftotext') {
148
$text = `/usr/bin/pdftotext "$file" -`;
151
# pstotext is a dependency, so this should never fail, but we
152
# recommend the user to install xpdf-utils instead, to get
153
# pdftotext (better extraction quality and much faster)
154
$text = file_to_text("/usr/bin/pstotext '$file'", "PDF", "xpdf-utils");
157
elsif ($ext =~ /dvi/) {
158
$text = file_to_text("/usr/bin/catdvi '$file'", "DVI", "catdvi");
160
elsif ($ext =~ /ps/) {
161
$text = file_to_text("/usr/bin/pstotext '$file'", "Postscript", "pstotext");
165
$text = join("", <F>);
173
my ($file, $search) = @_;
175
my $text = get_text($file);
176
my @words = split(/\s/, $search);
177
return join(" / ", get_highlighted_fragments($text, \@words));
181
my ($msg, $vars) = @_;
183
$msg =~ s/\n/<br>\n/go;
184
$vars->{error} = $msg;
186
my $tt = Template->new({INCLUDE_PATH => TEMPLATES_DIR});
187
$tt->process('search_error.tmpl', $vars);
191
sub get_doc_base_info {
192
my ($info_hash, $file) = @_;
194
my $packed_dirname = pack("Z100", dirname($file));
195
my $value = $info_hash->{$packed_dirname};
196
defined $value || return;
197
unpack("Z50 Z1000", $value);
200
# Selects the best matching language between two lists (the available languages
201
# list, in normal locale format, and the acceptable languages list, in
202
# HTTP_ACCEPT_LANGUAGE format).
203
sub select_matching_language {
204
my ($available_list, $accept_list) = @_;
206
foreach my $accept (@$accept_list) {
207
return $accept if grep { $accept eq $_ } @$available_list;
208
my $stripped_accept = $accept;
209
$stripped_accept =~ s/-.*//go;
210
$stripped_accept =~ s/;.*//go;
211
foreach my $available (@$available_list) {
212
my $stripped_available = $available;
213
$stripped_available =~ s/_.*//go;
214
$stripped_available =~ s/\..*//go;
215
return $available if $stripped_accept eq $stripped_available;
222
my $online = !param('file');
223
print header(-charset=>'utf-8') if $online;
225
my $gettext = Locale::gettext->domain_raw("dhelp");
226
$gettext->codeset('UTF-8'); # Always UTF-8, specified in the HTML templates
227
my @available_languages = map { chomp; $_ } split(/\n/, `/usr/bin/locale -a`);
228
my @accept_languages = split(",", $ENV{HTTP_ACCEPT_LANGUAGE} || "");
229
setlocale(LC_MESSAGES, select_matching_language(\@available_languages,
230
\@accept_languages));
233
base_url => $online ? '' : 'file:/usr/share',
239
$gettext->get($_[0]);
242
unless (-f DOCSINDEX) {
243
error(_("No search database found.\nPlease run /etc/cron.weekly/dhelp as superuser to create it."), $vars);
246
# Fetch, decode, untaint and check search parameter
247
my $search = param('search');
248
# This script may be called from command-line too, so the string might be just
249
# a bunch of octets. Assume UTF-8 in that case.
250
unless (Encode::is_utf8($search)) {
251
$search = decode('utf-8', $search);
253
$search =~ m/(.+)/; # Is there anything we shouldn't allow?
255
$vars->{search_terms} = encode('utf8', $search);
257
if ($search !~ /\S/) {
258
error(_("Please specify a search term."), $vars);
262
# Pass parameters to Swish++ search program
263
(param('page') || "") =~ /(\d+)/;
264
my $current_page = defined($1) ? $1 : 1;
265
my $skipped_results = ($current_page - 1) * RESULTS_PER_PAGE;
266
# search++ seems to support only Latin-1. Everything else makes it die with
268
my $encoded_search = encode('latin1', $search);
270
or exec '/usr/bin/search++', '-i', DOCSINDEX,
271
'-r', $skipped_results,
272
'-m', RESULTS_PER_PAGE,
275
# Open the doc-base info database, and tie to %docs_info
277
tie %docs_info, 'DB_File', DOCBASEDB, O_RDONLY or die "Cannot tie ".DOCBASEDB.": $!";
280
# Read the search results back and store somewhere, so we can calculate the
283
my ($n_results, $ignored);
286
if ( /^\# ignored: / ) {
288
# Get the ignored words so we can report them to the user.
291
$ignored =~ s/\s+$//;
294
if ( /^\# results: (\d+)/) {
296
if ($n_results == 0) {
298
$vars->{msg} = sprintf(_("No results (ignored words: %s). Search terms must be at least 4 characters long, and must not be \"stop words\". The command \"<tt>index++ -S</tt>\" gives the stop word list."), $ignored);
300
$vars->{msg} = _("No results.");
306
# Future releases of SWISH++ may emit other comments: ignore ones we
311
my( $rank, $file, $size, $title ) = split(/ /, $_, 4);
312
my $extract = get_extract($file, $search);
314
$size = int( $size / 1024 );
320
my $file_url = $file;
322
my $file_url_encoded = uri_escape($file_url);
323
$file_url = "/cgi-bin/dhelp_fetcher?file=$file_url_encoded";
325
$file_url = "file:$file";
327
# Calculate the real title
328
my ($doc_id, $doc_base_title) = get_doc_base_info(\%docs_info, $file);
329
$title .= " ($doc_base_title)" if $doc_base_title;
330
push @{$vars->{results}}, {rank => $rank,
334
extract => $extract};
337
if (!$online && $n_results > RESULTS_OFFLINE_MODE) {
338
$vars->{msg} .= " ".sprintf(_("Showing only %s out of %s results in offline mode."), RESULTS_OFFLINE_MODE, $n_results);
343
# We are already fetching just the data page we want, but Data::Page is
344
# useful to calculate information about _other_ pages of data
345
my $pager = Data::Page->new($n_results,
349
# Include paging information
350
$vars->{current_page} = $pager->current_page;
351
$vars->{previous_page} = $pager->previous_page;
352
$vars->{next_page} = $pager->next_page;
353
$vars->{last_page} = $pager->last_page;
355
my $tt = Template->new({INCLUDE_PATH => TEMPLATES_DIR});
356
$tt->process('search_results.tmpl', $vars) or die $tt->error;