~noskcaj/ubuntu/saucy/dhelp/apache2.4

Viewing changes to src/dsearch

Committer: Charlie Smotherman
Date: 2012-11-05 23:48:50 UTC
mfrom: (22.1.1 dhelp)
Revision ID: cjsmo@cableone.net-20121105234850-4a447ob81bs9a5je

Tags: 0.6.21+nmu1ubuntu1

http://bugs.debian.org/650441

http://bugs.debian.org/669041

http://bugs.debian.org/669758

http://bugs.debian.org/678055

* Merge from Debian unstable. Remaining changes:
  - lib/dhelp.rb:
    + Exit and return zero code if bdb isn't available; this usually
      indicates that dhelp is not configured yet.
* Non-Maintainer Upload
* Dropped the declaration of dependence on ruby-commandline, which was
  already done on the code
* New maintainer: Georgios M. Zarkadas <gz@member.fsf.org> (Closes: #650441).
* Support other web servers in addition to apache2 (Closes: #669041).
* Support apache2 packaging transition for version 2.4 (Closes: #669758).
* Support new ruby packaging policy transition for Wheezy.
* Use OptionParser instead Commandline::Application (Closes: #678055).
  Thanks Gunnar Wolf.
* Support new layout of man2html cgi scripts for Wheezy.
* Keep supporting previous policies/layouts, either during build time or
  during runtime, to aid backporting.
* Man and info pages links are activated only if associated packages are
  installed on the system.
* Subsections now show in the sections list only if section is selected.
* New color, styles and icons themes.
* Package installation now does not fail if cache data cannot be generated
  during install.
* Fix some minor lintian warnings.
* Bump Standards-Version to 3.9.3.

files added:
config

config/apache-dhelp.conf

config/boa-dhelp.conf

config/dhelp.conf-sample

config/lighttpd-dhelp.conf

config/mathopd-dhelp.conf

config/nginx-dhelp.conf

config/swish++.conf

config/yaws-dhelp.conf

debian/dhelp.lintian-overrides

debian/scripts

debian/scripts/files.sh

debian/scripts/httpd.sh

debian/scripts/info.sh

debian/source

debian/source/format

devtools

devtools/dhelp

devtools/dhelp/exporter

devtools/dhelp/exporter/testhtml.rb

devtools/gettext.pl

devtools/index.sh

devtools/list-dirs.rb

devtools/make-html

doc/css

doc/css/book-open.png

doc/css/book.png

doc/css/debian.jpg

doc/css/debian.png

doc/css/dhelp.css

doc/css/folder.png

doc/css/swirl.jpg

doc/css/swirl.png

doc/css/text.png

lib/dhelp/exporter/cgimap.rb

scripts/index-deferred

src/dhelp

src/dhelp_fetcher.rb

src/dhelp_parse.rb

src/dsearch

templates

templates/fetcher_error.rhtml

templates/index.rhtml

templates/search_error.tmpl

templates/search_pagination.tmpl

templates/search_results.tmpl

templates/section.rhtml

test/share-doc/docbook-xsl-doc-html/doc/index.html

test/share-doc/docbook-xsl-doc-html/doc/some_file.html

test/share-doc/docbook-xsl-doc-html/doc/some_section

test/share-doc/docbook-xsl-doc-html/doc/some_section/index.html

test/share-doc/docbook-xsl-doc-html/doc/some_section/some_file.html

files removed:
apache-dhelp.conf

debian/dhelp.docs

debian/postrm

dhelp

dhelp.conf-sample

dhelp.css

dhelp_fetcher.rb

dhelp_parse.rb

doc/debian.jpg

doc/folder.png

doc/swirl.jpg

doc/text.png

dsearch

fetcher_error.rhtml

gettext.pl

index-deferred

index.rhtml

index.sh

list-dirs.rb

search_error.tmpl

search_pagination.tmpl

search_results.tmpl

section.rhtml

swish++.conf

swish++.index

files modified:
Makefile

debian/NEWS

debian/README.Debian

debian/changelog

debian/control

debian/copyright

debian/cron.weekly

debian/dirs

debian/postinst

debian/prerm

debian/rules

lib/dhelp.rb

lib/dhelp/exporter/base.rb

lib/dhelp/exporter/html.rb

man/dhelp_parse.8

po/de.po

po/dhelp.pot

po/el.po

po/es.po

po/eu.po

po/fr.po

po/id.po

po/ru.po

test/tc_dhelpdocumentpool.rb

Show diffs side-by-side

added added

removed removed

src/dsearch

#! /usr/bin/perl -T

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public

# License along with this program; if not, write to the Free

# Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,

# MA 02111-1307 USA.

$ENV{'PATH'} = '/bin:/usr/bin';

# Avoid taint problems when any of these environment variables are defined.

# See perlsec(1) manpage

delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};

use strict;

use warnings;

use CGI qw/:standard/;

use Template;

use DB_File;

use File::Basename;

use HTML::Entities;

use File::Temp qw(tempfile);

use Locale::gettext;

use POSIX;

use Data::Page;

use URI::Escape;

use Encode;

use constant DOCBASEDB => '/var/lib/dhelp/doc-base_dirs';

use constant DOCSINDEX => '/var/lib/dhelp/documents.index';

use constant TEMPLATES_DIR => '/usr/share/dhelp/templates';

use constant RESULTS_PER_PAGE => 20;

use constant RESULTS_OFFLINE_MODE => 100;

sub word_regex {

my ($words) = @_;

return qr/\b/ . join("|", @$words) . qr/\b/;

}

sub highlight_fragment {

my ($text, $words) = @_;

my $regex = word_regex($words);

$text = encode_entities($text);

$text =~ s/$regex/<span class="matched-fragment">$&<\/span>/gi;

return $text;

}

sub get_highlighted_fragments {

my ($text, $words) = @_;

# Only look for results in the first 100K

$text = substr($text, 0, 100000)."..." if length $text > 100000;

my $regex = word_regex($words);

my $max_fragments = 5;

my @fragment_list = ();

while ($text =~ /(.{0,31})($regex)(.{0,31})/isg) {

my ($pre, $word, $post) = ($1, $2, $3);

if (length $pre > 30) {

$pre = "...".substr($pre, 1);

}

if (length $post > 30) {

$post = substr($post, 0, 29)."...";

}

push @fragment_list, highlight_fragment($pre.$word.$post, $words);

last if --$max_fragments < 1;

}

# Just in case: if we don't match anything, just show the first 60

# characters

if (scalar @fragment_list) {

return @fragment_list;

}

else {

return length $text > 100 ? substr($text, 0, 99)."..." : $text;

}

sub file_to_text {

my ($cmd, $file_type, $package) = @_;

$cmd =~ /^(\S+)/;

my $binary = $1;

if (-x $binary) {

return `$cmd`;

}

else {

return "[Cannot show $file_type file preview. Please install package $package]";

}

100

}

101

102

sub get_text {

103

my ($file) = @_;

104

105

# Remove any weird character

106

$file =~ s/[^a-z0-9_.\/+-]//gio;

107

$file =~ s/\.\.//go;

108

$file =~ /(.*)/;

109

$file = $1;

110

111

my $basename = basename($file);

112

$basename =~ s/\.(\w+)$//go;

113

my $ext = $1;

114

115

my $text = undef;

116

if ($ext =~ /html?/) {

117

# Try to fetch a text version of the HTML

118

if (-x '/usr/bin/lynx') {

119

$text = `/usr/bin/lynx -dump -nolist "$file"`;

120

}

121

elsif (-x '/usr/bin/links') {

122

$text = `/usr/bin/links -dump -no-references "$file"`;

123

}

124

elsif (-x '/usr/bin/w3m') {

125

$text = `/usr/bin/w3m -dump "$file"`;

126

}

127

elsif (-x '/usr/bin/html2text') {

128

$text = `/usr/bin/html2text -nobs "$file"`;

129

}

130

else {

131

# Fallback to retarded HTML stripper

132

open F, $file;

133

$text = join("", <F>);

134

close F;

135

$text =~ s/<.*?>//go;

136

}

137

}

138

elsif ($ext =~ /gz/) {

139

my ($fh, $tmp_path) = tempfile("dsearch-XXXXXX",

140

DIR => '/tmp',

141

SUFFIX => "$basename");

142

print $fh `gunzip -c "$file"`;

143

close $fh;

144

$text = get_text($tmp_path);

145

}

146

elsif ($ext =~ /pdf/) {

147

if (-x '/usr/bin/pdftotext') {

148

$text = `/usr/bin/pdftotext "$file" -`;

149

}

150

else {

151

# pstotext is a dependency, so this should never fail, but we

152

# recommend the user to install xpdf-utils instead, to get

153

# pdftotext (better extraction quality and much faster)

154

$text = file_to_text("/usr/bin/pstotext '$file'", "PDF", "xpdf-utils");

155

}

156

}

157

elsif ($ext =~ /dvi/) {

158

$text = file_to_text("/usr/bin/catdvi '$file'", "DVI", "catdvi");

159

}

160

elsif ($ext =~ /ps/) {

161

$text = file_to_text("/usr/bin/pstotext '$file'", "Postscript", "pstotext");

162

}

163

else {

164

open F, $file;

165

$text = join("", <F>);

166

close F;

167

}

168

169

return $text;

170

}

171

172

sub get_extract {

173

my ($file, $search) = @_;

174

175

my $text = get_text($file);

176

my @words = split(/\s/, $search);

177

return join(" / ", get_highlighted_fragments($text, \@words));

178

}

179

180

sub error {

181

my ($msg, $vars) = @_;

182

183

$msg =~ s/\n/<br>\n/go;

184

$vars->{error} = $msg;

185

186

my $tt = Template->new({INCLUDE_PATH => TEMPLATES_DIR});

187

$tt->process('search_error.tmpl', $vars);

188

exit 0;

189

}

190

191

sub get_doc_base_info {

192

my ($info_hash, $file) = @_;

193

194

my $packed_dirname = pack("Z100", dirname($file));

195

my $value = $info_hash->{$packed_dirname};

196

defined $value || return;

197

unpack("Z50 Z1000", $value);

198

}

199

200

# Selects the best matching language between two lists (the available languages

201

# list, in normal locale format, and the acceptable languages list, in

202

# HTTP_ACCEPT_LANGUAGE format).

203

sub select_matching_language {

204

my ($available_list, $accept_list) = @_;

205

206

foreach my $accept (@$accept_list) {

207

return $accept if grep { $accept eq $_ } @$available_list;

208

my $stripped_accept = $accept;

209

$stripped_accept =~ s/-.*//go;

210

$stripped_accept =~ s/;.*//go;

211

foreach my $available (@$available_list) {

212

my $stripped_available = $available;

213

$stripped_available =~ s/_.*//go;

214

$stripped_available =~ s/\..*//go;

215

return $available if $stripped_accept eq $stripped_available;

216

}

217

}

218

}

219

220

221

222

my $online = !param('file');

223

print header(-charset=>'utf-8') if $online;

224

225

my $gettext = Locale::gettext->domain_raw("dhelp");

226

$gettext->codeset('UTF-8'); # Always UTF-8, specified in the HTML templates

227

my @available_languages = map { chomp; $_ } split(/\n/, `/usr/bin/locale -a`);

228

my @accept_languages = split(",", $ENV{HTTP_ACCEPT_LANGUAGE} || "");

229

setlocale(LC_MESSAGES, select_matching_language(\@available_languages,

230

\@accept_languages));

231

my $vars = {

232

online => $online,

233

base_url => $online ? '' : 'file:/usr/share',

234

results => [],

235

gettext => $gettext,

236

};

237

238

sub _ {

239

$gettext->get($_[0]);

240

}

241

242

unless (-f DOCSINDEX) {

243

error(_("No search database found.\nPlease run /etc/cron.weekly/dhelp as superuser to create it."), $vars);

244

}

245

246

# Fetch, decode, untaint and check search parameter

247

my $search = param('search');

248

# This script may be called from command-line too, so the string might be just

249

# a bunch of octets. Assume UTF-8 in that case.

250

unless (Encode::is_utf8($search)) {

251

$search = decode('utf-8', $search);

252

}

253

$search =~ m/(.+)/; # Is there anything we shouldn't allow?

254

$search = $1;

255

$vars->{search_terms} = encode('utf8', $search);

256

257

if ($search !~ /\S/) {

258

error(_("Please specify a search term."), $vars);

259

}

260

261

262

# Pass parameters to Swish++ search program

263

(param('page') || "") =~ /(\d+)/;

264

my $current_page = defined($1) ? $1 : 1;

265

my $skipped_results = ($current_page - 1) * RESULTS_PER_PAGE;

266

# search++ seems to support only Latin-1. Everything else makes it die with

267

# "malformed query"

268

my $encoded_search = encode('latin1', $search);

269

open (SEARCH, '-|')

270

or exec '/usr/bin/search++', '-i', DOCSINDEX,

271

'-r', $skipped_results,

272

'-m', RESULTS_PER_PAGE,

273

$encoded_search;

274

275

# Open the doc-base info database, and tie to %docs_info

276

my %docs_info;

277

tie %docs_info, 'DB_File', DOCBASEDB, O_RDONLY or die "Cannot tie ".DOCBASEDB.": $!";

278

279

280

# Read the search results back and store somewhere, so we can calculate the

281

# pages.

282

283

my ($n_results, $ignored);

284

my @results;

285

while ( <SEARCH> ) {

286

if ( /^\# ignored: / ) {

287

288

# Get the ignored words so we can report them to the user.

289

290

$ignored = $';

291

$ignored =~ s/\s+$//;

292

next;

293

}

294

if ( /^\# results: (\d+)/) {

295

$n_results = $1;

296

if ($n_results == 0) {

297

if ($ignored) {

298

$vars->{msg} = sprintf(_("No results (ignored words: %s). Search terms must be at least 4 characters long, and must not be \"stop words\". The command \"<tt>index++ -S</tt>\" gives the stop word list."), $ignored);

299

} else {

300

$vars->{msg} = _("No results.");

301

}

302

last;

303

}

304

}

305

306

# Future releases of SWISH++ may emit other comments: ignore ones we

307

# don't know about.

308

309

next if /^\#/;

310

311

my( $rank, $file, $size, $title ) = split(/ /, $_, 4);

312

my $extract = get_extract($file, $search);

313

314

$size = int( $size / 1024 );

315

if ( $size ) {

316

$size .= 'K';

317

} else {

318

$size = '<1K';

319

}

320

my $file_url = $file;

321

if ($online) {

322

my $file_url_encoded = uri_escape($file_url);

323

$file_url = "/cgi-bin/dhelp_fetcher?file=$file_url_encoded";

324

} else {

325

$file_url = "file:$file";

326

}

327

# Calculate the real title

328

my ($doc_id, $doc_base_title) = get_doc_base_info(\%docs_info, $file);

329

$title .= " ($doc_base_title)" if $doc_base_title;

330

push @{$vars->{results}}, {rank => $rank,

331

file => $file_url,

332

title => $title,

333

size => $size,

334

extract => $extract};

335

}

336

337

if (!$online && $n_results > RESULTS_OFFLINE_MODE) {

338

$vars->{msg} .= " ".sprintf(_("Showing only %s out of %s results in offline mode."), RESULTS_OFFLINE_MODE, $n_results);

339

}

340

341

close (SEARCH);

342

343

# We are already fetching just the data page we want, but Data::Page is

344

# useful to calculate information about _other_ pages of data

345

my $pager = Data::Page->new($n_results,

346

RESULTS_PER_PAGE,

347

$current_page);

348

349

# Include paging information

350

$vars->{current_page} = $pager->current_page;

351

$vars->{previous_page} = $pager->previous_page;

352

$vars->{next_page} = $pager->next_page;

353

$vars->{last_page} = $pager->last_page;

354

355

my $tt = Template->new({INCLUDE_PATH => TEMPLATES_DIR});

356

$tt->process('search_results.tmpl', $vars) or die $tt->error;

357

358

__END__

359

360

Older »