2
#-------------------------------------------------------
3
# Small script to auto-generate URL Alias files for 5.2+ AWStats
4
# Requires two Perl modules below.
5
# From original title-grabber.pl file
6
# (Feedback/suggestions to: simonjw@users.sourceforge.net)
7
# Modified by eldy@users.sourceforge.net
9
# Note: If you want to retrieve document titles over SSL you must have OpenSSL and
10
# the Net::SSL(eay) Perl Module available. This code will check that SSL is
11
# supported before attempting to retrieve via it.
12
#-------------------------------------------------------
15
use strict;no strict "refs";
19
# Pull the bare revision number ("1.7") out of the revision keyword string:
# the pattern captures everything between the first and the last whitespace.
my $REVISION='$Revision: 1.7 $'; $REVISION =~ /\s(.*)\s/; $REVISION=$1;
20
# Version string shown in the usage banner and embedded in the user agent name.
my $VERSION="1.0 (build $REVISION)";
22
############### EDIT HERE ###############
24
# you can set this manually if you will only grep one site
# NOTE(review): the setting this comment introduces is not visible here.
# $SITECONFIG is read further down without a visible declaration -- confirm.
27
# Where the default input is located.
28
my $awStatsDataDir = "/var/lib/awstats";
30
# Throttle HTTP requests - help avoid DoS-like results if on a quick network.
31
# Number is the number of seconds to pause between requests. Set to zero for
# NOTE(review): the sentence above is cut off. The value is handed to sleep()
# before each page request, so 0 means no pause.
33
my $throttleRequestsTime = 0;
36
# UA string passed to server. You should add this to SkipUserAgents in the
37
# awstats.conf file if you want to ignore hits from this code.
38
my $userAgent = "urlaliasbuilder/$VERSION";
39
# Put a sensible e-mail address here
# (passed to the user agent's from() method further down)
40
my $spiderOwner = "spider\@mydomain.com";
42
# Timeout (in seconds) for each HTTP request (increase on slow connections)
# NOTE(review): no assignment follows this comment here; $getTimeOut is used
# later for the user agent timeout -- confirm where it is declared.
44
# Proxy server to use when doing http/s - leave blank if you don't have one
# NOTE(review): the only visible definition of $proxyServer is commented out,
# yet the variable is passed to the user agent's proxy() call later -- confirm.
45
#my $proxyServer = "http://my.proxy.server:port/";
47
# Hosts not to use a proxy for
48
my @hostsNoProxy = ("host1","host1.my.domain.name");
49
# Make sure we don't download multi-megabyte files! We need only head section
50
my $maxDocSizeBytes = 4096; # number is bytes
52
############### DON'T EDIT BELOW HERE ###############
55
# Marker strings compared against the first field of each input line to find
# the start and end of the URL section of an AWStats history file.
my $FILEMARKER1 = "BEGIN_SIDER";
56
my $FILEMARKER2 = "END_SIDER";
58
# Current local time; only $mon and $year are used in the lines shown below.
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
60
# localtime() months are 0-based and years are offset from 1900, hence +1 and
# +1900. Both values are zero-padded for use in the default input file name.
my $fullMonth = sprintf("%02d",$mon+1);
61
my $fullYear = sprintf("%04d",$year+1900);
66
# Change default value if options are used
74
# Default input file: <data dir>/awstats<MM><YYYY>[.<SITECONFIG>].txt
my $fileToOpen = $awStatsDataDir . "/awstats" . $fullMonth . $fullYear . ($SITECONFIG?".$SITECONFIG":"") . ".txt";
75
# URL Alias file to open
# Default output file: urlalias[.<SITECONFIG>].txt (relative path)
76
my $urlAliasFile = "urlalias" . ($SITECONFIG?".$SITECONFIG":"") . ".txt";
79
# Command-line option tests. Each one reads $ARGV[$_] and ends with `next`.
# NOTE(review): the loop that sets $_ to an argument index is not visible in
# this extract -- presumably a loop over 0..$#ARGV; confirm.
# The two file options take the value up to the first whitespace or '&'.
if ($ARGV[$_] =~ /^-*urllistfile=([^\s&]+)/i) { $fileToOpen="$1"; next; }
80
if ($ARGV[$_] =~ /^-*urlaliasfile=([^\s&]+)/i) { $urlAliasFile="$1"; next; }
81
# -site takes the whole remainder of the argument as the host name.
if ($ARGV[$_] =~ /^-*site=(.*)/i) { $hostname="$1"; next; }
82
# Matches any argument whose first non-dash character is 'h' (any case),
# not only -h or -help.
if ($ARGV[$_] =~ /^-*h/i) { $helpfound=1; next; }
83
if ($ARGV[$_] =~ /^-*overwrite/i) { $overwritedata=1; next; }
84
if ($ARGV[$_] =~ /^-*secure/i) { $useHTTPS=1; next; }
87
# if no host information provided, we bomb out to usage
88
if(! $hostname && ! $SITECONFIG) { $nohosts=1; }
90
# if no hostname set (i.e. -site=) then we use the config value
91
if(! $hostname && $SITECONFIG) { $hostname=$SITECONFIG; }
94
# Split the script path in $0 into directory, program name and extension for
# use in the usage text.
my $DIR; my $PROG; my $Extension;
95
# 1st substitution: strip the last path component from a copy of $0 (the
#   remainder is $DIR); the stripped file name is captured in $1.
# 2nd substitution: strip ".ext" from that file name, capturing the extension.
# NOTE(review): if the file name contains no '.', the second substitution does
# not match and $1 presumably still holds the file name -- confirm.
($DIR=$0) =~ s/([^\/\\]*)$//; ($PROG=$1) =~ s/\.([^\.]*)$//; $Extension=$1;
96
# Print the usage text when no host is known, help was requested, or the
# script was started without any argument.
# NOTE(review): the end of this block (and whether it exits afterwards) is not
# visible in this extract.
if ($nohosts || $helpfound || ! @ARGV) {
97
print "\n----- $PROG $VERSION -----\n";
98
print ucfirst($PROG)." generates an 'urlalias' file from an input file.\n";
99
print "The input file must contain a list of URLs (It can be an AWStats history file).\n";
100
print "For each of thoose URLs, the script get the corresponding HTML page and catch the\n";
101
print "header information (title), then it writes an output file that contains one line\n";
102
print "for each URLs and several fields:\n";
103
print "- The first field is the URL,\n";
104
print "- The second is title caught from web page.\n";
105
print "This resulting file can be used by AWStats urlalias plugin.\n";
107
print "Usage: $PROG.$Extension -site=www.myserver.com [options]\n";
109
print "The site parameter contains the web server to get the page from.\n";
110
print "Where options are:\n";
111
print " -urllistfile=Input urllist file\n";
112
print " If this file is an AWStats history file then urlaliasbuilder will use the\n";
113
print " SIDER section of this file as its input URL's list.\n";
114
print " -urlaliasfile=Output urlalias file to build\n";
115
print " -overwrite Overwrite output file if exists\n";
116
print " -secure Use https protocol\n";
118
print "Example: $PROG.$Extension -site=www.someotherhost.com\n";
120
# Show the file names that are in effect at this point.
print "This is default configuration used when no option are provided on command line:\n";
121
print "Input urllist file: $fileToOpen (overwritten by -urllistfile option)\n";
122
print "Output urlalias file: $urlAliasFile (overwritten by -urlaliasfile option)\n";
124
print "This script was written from Simon Waight original works title-grabber.pl.\n";
133
# only read the alias file if we want to do a comparison
134
# and append new items only (i.e. not overwrite)
135
if($overwritedata == 0) {
136
# The result of open() is not checked here, so a missing or unreadable alias
# file is not reported at this point.
open(FILE,$urlAliasFile);
140
# Alias lines are tab-separated; the first field is kept as an already-known
# key in @archivedKeys.
# NOTE(review): the loop that reads FILE into $_ and advances $counter is not
# visible in this extract.
@bits=split(/\t/,$_);
141
# @x[...] is a one-element array slice; here it assigns a single element.
@archivedKeys[$counter]=@bits[0];
143
#print "key: " . @bits[0] . "\n";
149
# open input file (might be an AWStats history data file)
150
print "Reading input file: $fileToOpen\n";
151
open(FILE,$fileToOpen) || die "Error: Can't open input urllist file $fileToOpen";
155
# URLs that still need a title lookup, and how many have been collected.
my @addToAliasFile=();
156
my $addToAliasFileCount=0;
# Set to 1 once a line starting with "AWSTATS DATA FILE" has been seen.
157
my $isawstatshistoryfile=0;
161
# NOTE(review): the loop that reads FILE line by line into $_ is not visible
# in this extract; the tests below operate on the current line in $_.
if ($_ =~ /^AWSTATS DATA FILE/) {
162
print "This file looks like an AWStats history file. Searching URLs list...\n";
163
$isawstatshistoryfile=1;
166
# Split line out into fields
167
@field=split(/\s+/,$_);
# Skip lines whose first field is empty (or otherwise false, e.g. "0").
168
if (! $field[0]) { next; }
170
# If we're at the start of the URL section of file
# (for a plain URL list, i.e. not a history file, this is true for every line)
171
if (! $isawstatshistoryfile || $field[0] eq $FILEMARKER1) {
176
my @field=split(/\s+/,$_);
179
# Process lines until the first field equals the end-of-section marker.
# NOTE(review): the code that reads the next line inside this loop is not
# visible here; only the re-split at the end of the block is.
while ($field[0] ne $FILEMARKER2) {
181
# compare awstats data entry against urlalias entry
182
# only if we don't just want to write current items
183
# to the file (i.e. overwrite)
184
if($overwritedata == 0) {
185
foreach my $key (@archivedKeys) {
186
# NOTE(review): what happens on a match (URL already has an alias) is not
# visible in this extract.
if($field[0] eq $key) {
191
# it's a new URL, so add to list of items to retrieve
193
@addToAliasFile[$addToAliasFileCount] = $field[0];
194
$addToAliasFileCount++;
195
#print "new: " . $field[0] . "\n"
199
# no comparison, so everything is 'new'
200
@addToAliasFile[$addToAliasFileCount] = $field[0];
201
$addToAliasFileCount++;
206
# Re-split the current line so the loop condition sees its first field.
@field=split(/\s+/,$_);
213
# Report how many URLs were collected for lookup.
# NOTE(review): the rest of the "nothing to do" branch (closing brace, any
# exit) is not visible in this extract.
if($addToAliasFileCount == 0) {
214
print "Found no new documents.\n\n" ;
218
print "Found " . $addToAliasFileCount . " new documents with no alias.\n";
222
print "Looking thoose pages on web site '$hostname' to get alias...\n";
224
# Create a user agent (browser) object
# NOTE(review): no `use LWP::UserAgent` line is visible in this extract.
225
my $ua = new LWP::UserAgent;
226
# set user agent name
227
$ua->agent($userAgent);
228
# set user agents owners e-mail address
229
$ua->from($spiderOwner);
230
# set timeout for requests
# NOTE(review): $getTimeOut has no visible declaration in this extract.
231
$ua->timeout($getTimeOut);
233
# set proxy for access to external sites
# NOTE(review): the only visible definition of $proxyServer is commented out.
234
$ua->proxy(["http","https"],$proxyServer);
235
# avoid proxy for these hosts
236
$ua->no_proxy(@hostsNoProxy);
238
# set maximum size of document to retrieve (in bytes)
239
$ua->max_size($maxDocSizeBytes);
# Warn when https was requested but the user agent reports no https support.
# NOTE(review): what follows the message (e.g. an exit) is not visible here.
240
if(!($ua->is_protocol_supported('https')) && $useHTTPS) {
241
print "SSL is not supported on this machine.\n\n";
247
# Now lets build the contents to write (or append) to urlalias file
# One lookup per collected URL, pausing $throttleRequestsTime seconds before
# each; the resulting lines are accumulated in $fileOutput.
# NOTE(review): $fileOutput has no visible declaration in this extract.
248
foreach my $newAlias (@addToAliasFile) {
249
sleep $throttleRequestsTime;
250
my $newAliasEntry = &Generate_Alias_List_Entry($newAlias);
251
$fileOutput .= $newAliasEntry . "\n";
254
# write the data back to urlalias file
255
if (! $overwritedata) {
257
# Append mode: add the accumulated lines to the existing alias file.
# NOTE(review): the die messages below interpolate $_ rather than $! (the
# system error text) -- confirm that is intended.
open(FILE,">>$urlAliasFile") || die "Error: Failed to open file for writing: $_\n\n";
258
print FILE $fileOutput;
262
# Overwrite mode: truncate the alias file and write one line per URL.
# NOTE(review): as far as this extract shows, the loop above has already
# fetched every page, and this loop calls Generate_Alias_List_Entry again for
# each URL instead of writing $fileOutput -- confirm against the full file.
open(FILE,">$urlAliasFile") || die "Error: Failed to open file for writing: $_\n\n";
263
foreach my $newAlias (@addToAliasFile) {
264
my $newAliasEntry = &Generate_Alias_List_Entry($newAlias);
265
print FILE "$newAliasEntry\n";
269
print "File $urlAliasFile created/updated.\n";
273
#--------------------------- End of Main -----------------------------
277
# Generate new lines for urlalias file by doing a http get using data
#
# Generate_Alias_List_Entry($urltoget)
#   $urltoget - URL path (first positional argument); it is appended to the
#               scheme and $hostname to form the address to fetch.
#   Returns   - $AliasLine followed by the page title, or by "Unknown Title"
#               when the request fails or the title is empty.
#   Reads the file-level variables $hostname and $ua, and prints progress and
#   failure messages to standard output.
280
sub Generate_Alias_List_Entry {
282
# take in the path & document
283
my $urltoget = shift;
285
my $urlPrefix = "http://";
288
# NOTE(review): as shown, this assignment is unconditional; the test that
# limits it to -secure runs (presumably on $useHTTPS) is not visible here.
$urlPrefix = "https://";
293
# NOTE(review): no separator between the URL and the title is visible here,
# although alias files are read back by splitting on a tab -- confirm. The
# declarations of $AliasLine and $pageTitle are not visible either.
$AliasLine = $urltoget;
296
# build a full HTTP request to pass to user agent
297
my $fullurltoget = $urlPrefix . $hostname . $urltoget;
299
# Create a HTTP request
300
print "Getting page $fullurltoget\n";
302
my $req = new HTTP::Request GET => $fullurltoget;
304
# Pass request to the user agent and get a response back
305
my $res = $ua->request($req);
307
# Parse returned document for page title
308
if ($res->is_success()) {
309
$pageTitle = $res->title;
311
# NOTE(review): the two lines below belong to the failure case; the `else`
# that separates them from the success case is not visible in this extract.
print "Failed to get page: ".$res->status_line."\n";
312
$pageTitle = "Unknown Title";
314
# A successful response with an empty title also gets the placeholder.
if ($pageTitle eq "") {
315
$pageTitle = "Unknown Title";
317
return $AliasLine . $pageTitle;
2
#-------------------------------------------------------
3
# Small script to auto-generate URL Alias files for 5.2+ AWStats
4
# Requires two Perl modules below.
5
# From original title-grabber.pl file
6
# (Feedback/suggestions to: simonjw@users.sourceforge.net)
7
# Modified by eldy@users.sourceforge.net
9
# Note: If you want to retrieve document titles over SSL you must have OpenSSL and
10
# the Net::SSL(eay) Perl Module available. This code will check that SSL is
11
# supported before attempting to retrieve via it.
12
#-------------------------------------------------------
15
use strict;no strict "refs";
19
# Pull the bare revision number ("1.7") out of the revision keyword string:
# the pattern captures everything between the first and the last whitespace.
my $REVISION='$Revision: 1.7 $'; $REVISION =~ /\s(.*)\s/; $REVISION=$1;
20
# Version string shown in the usage banner and embedded in the user agent name.
my $VERSION="1.0 (build $REVISION)";
22
############### EDIT HERE ###############
24
# you can set this manually if you will only grep one site
# NOTE(review): the setting this comment introduces is not visible here.
# $SITECONFIG is read further down without a visible declaration -- confirm.
27
# Where the default input is located.
28
my $awStatsDataDir = "/var/lib/awstats";
30
# Throttle HTTP requests - help avoid DoS-like results if on a quick network.
31
# Number is the number of seconds to pause between requests. Set to zero for
# NOTE(review): the sentence above is cut off. The value is handed to sleep()
# before each page request, so 0 means no pause.
33
my $throttleRequestsTime = 0;
36
# UA string passed to server. You should add this to SkipUserAgents in the
37
# awstats.conf file if you want to ignore hits from this code.
38
my $userAgent = "urlaliasbuilder/$VERSION";
39
# Put a sensible e-mail address here
# (passed to the user agent's from() method further down)
40
my $spiderOwner = "spider\@mydomain.com";
42
# Timeout (in seconds) for each HTTP request (increase on slow connections)
# NOTE(review): no assignment follows this comment here; $getTimeOut is used
# later for the user agent timeout -- confirm where it is declared.
44
# Proxy server to use when doing http/s - leave blank if you don't have one
# NOTE(review): the only visible definition of $proxyServer is commented out,
# yet the variable is passed to the user agent's proxy() call later -- confirm.
45
#my $proxyServer = "http://my.proxy.server:port/";
47
# Hosts not to use a proxy for
48
my @hostsNoProxy = ("host1","host1.my.domain.name");
49
# Make sure we don't download multi-megabyte files! We need only head section
50
my $maxDocSizeBytes = 4096; # number is bytes
52
############### DON'T EDIT BELOW HERE ###############
55
# Marker strings compared against the first field of each input line to find
# the start and end of the URL section of an AWStats history file.
my $FILEMARKER1 = "BEGIN_SIDER";
56
my $FILEMARKER2 = "END_SIDER";
58
# Current local time; only $mon and $year are used in the lines shown below.
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
60
# localtime() months are 0-based and years are offset from 1900, hence +1 and
# +1900. Both values are zero-padded for use in the default input file name.
my $fullMonth = sprintf("%02d",$mon+1);
61
my $fullYear = sprintf("%04d",$year+1900);
66
# Change default value if options are used
74
# Default input file: <data dir>/awstats<MM><YYYY>[.<SITECONFIG>].txt
my $fileToOpen = $awStatsDataDir . "/awstats" . $fullMonth . $fullYear . ($SITECONFIG?".$SITECONFIG":"") . ".txt";
75
# URL Alias file to open
# Default output file: urlalias[.<SITECONFIG>].txt (relative path)
76
my $urlAliasFile = "urlalias" . ($SITECONFIG?".$SITECONFIG":"") . ".txt";
79
# Command-line option tests. Each one reads $ARGV[$_] and ends with `next`.
# NOTE(review): the loop that sets $_ to an argument index is not visible in
# this extract -- presumably a loop over 0..$#ARGV; confirm.
# The two file options take the value up to the first whitespace or '&'.
if ($ARGV[$_] =~ /^-*urllistfile=([^\s&]+)/i) { $fileToOpen="$1"; next; }
80
if ($ARGV[$_] =~ /^-*urlaliasfile=([^\s&]+)/i) { $urlAliasFile="$1"; next; }
81
# -site takes the whole remainder of the argument as the host name.
if ($ARGV[$_] =~ /^-*site=(.*)/i) { $hostname="$1"; next; }
82
# Matches any argument whose first non-dash character is 'h' (any case),
# not only -h or -help.
if ($ARGV[$_] =~ /^-*h/i) { $helpfound=1; next; }
83
if ($ARGV[$_] =~ /^-*overwrite/i) { $overwritedata=1; next; }
84
if ($ARGV[$_] =~ /^-*secure/i) { $useHTTPS=1; next; }
87
# if no host information provided, we bomb out to usage
88
if(! $hostname && ! $SITECONFIG) { $nohosts=1; }
90
# if no hostname set (i.e. -site=) then we use the config value
91
if(! $hostname && $SITECONFIG) { $hostname=$SITECONFIG; }
94
# Split the script path in $0 into directory, program name and extension for
# use in the usage text.
my $DIR; my $PROG; my $Extension;
95
# 1st substitution: strip the last path component from a copy of $0 (the
#   remainder is $DIR); the stripped file name is captured in $1.
# 2nd substitution: strip ".ext" from that file name, capturing the extension.
# NOTE(review): if the file name contains no '.', the second substitution does
# not match and $1 presumably still holds the file name -- confirm.
($DIR=$0) =~ s/([^\/\\]*)$//; ($PROG=$1) =~ s/\.([^\.]*)$//; $Extension=$1;
96
# Print the usage text when no host is known, help was requested, or the
# script was started without any argument.
# NOTE(review): the end of this block (and whether it exits afterwards) is not
# visible in this extract.
if ($nohosts || $helpfound || ! @ARGV) {
97
print "\n----- $PROG $VERSION -----\n";
98
print ucfirst($PROG)." generates an 'urlalias' file from an input file.\n";
99
print "The input file must contain a list of URLs (It can be an AWStats history file).\n";
100
print "For each of thoose URLs, the script get the corresponding HTML page and catch the\n";
101
print "header information (title), then it writes an output file that contains one line\n";
102
print "for each URLs and several fields:\n";
103
print "- The first field is the URL,\n";
104
print "- The second is title caught from web page.\n";
105
print "This resulting file can be used by AWStats urlalias plugin.\n";
107
print "Usage: $PROG.$Extension -site=www.myserver.com [options]\n";
109
print "The site parameter contains the web server to get the page from.\n";
110
print "Where options are:\n";
111
print " -urllistfile=Input urllist file\n";
112
print " If this file is an AWStats history file then urlaliasbuilder will use the\n";
113
print " SIDER section of this file as its input URL's list.\n";
114
print " -urlaliasfile=Output urlalias file to build\n";
115
print " -overwrite Overwrite output file if exists\n";
116
print " -secure Use https protocol\n";
118
print "Example: $PROG.$Extension -site=www.someotherhost.com\n";
120
# Show the file names that are in effect at this point.
print "This is default configuration used when no option are provided on command line:\n";
121
print "Input urllist file: $fileToOpen (overwritten by -urllistfile option)\n";
122
print "Output urlalias file: $urlAliasFile (overwritten by -urlaliasfile option)\n";
124
print "This script was written from Simon Waight original works title-grabber.pl.\n";
133
# only read the alias file if we want to do a comparison
134
# and append new items only (i.e. not overwrite)
135
if($overwritedata == 0) {
136
# The result of open() is not checked here, so a missing or unreadable alias
# file is not reported at this point.
open(FILE,$urlAliasFile);
140
# Alias lines are tab-separated; the first field is kept as an already-known
# key in @archivedKeys.
# NOTE(review): the loop that reads FILE into $_ and advances $counter is not
# visible in this extract.
@bits=split(/\t/,$_);
141
# @x[...] is a one-element array slice; here it assigns a single element.
@archivedKeys[$counter]=@bits[0];
143
#print "key: " . @bits[0] . "\n";
149
# open input file (might be an AWStats history data file)
150
print "Reading input file: $fileToOpen\n";
151
open(FILE,$fileToOpen) || die "Error: Can't open input urllist file $fileToOpen";
155
# URLs that still need a title lookup, and how many have been collected.
my @addToAliasFile=();
156
my $addToAliasFileCount=0;
# Set to 1 once a line starting with "AWSTATS DATA FILE" has been seen.
157
my $isawstatshistoryfile=0;
161
# NOTE(review): the loop that reads FILE line by line into $_ is not visible
# in this extract; the tests below operate on the current line in $_.
if ($_ =~ /^AWSTATS DATA FILE/) {
162
print "This file looks like an AWStats history file. Searching URLs list...\n";
163
$isawstatshistoryfile=1;
166
# Split line out into fields
167
@field=split(/\s+/,$_);
# Skip lines whose first field is empty (or otherwise false, e.g. "0").
168
if (! $field[0]) { next; }
170
# If we're at the start of the URL section of file
# (for a plain URL list, i.e. not a history file, this is true for every line)
171
if (! $isawstatshistoryfile || $field[0] eq $FILEMARKER1) {
176
my @field=split(/\s+/,$_);
179
# Process lines until the first field equals the end-of-section marker.
# NOTE(review): the code that reads the next line inside this loop is not
# visible here; only the re-split at the end of the block is.
while ($field[0] ne $FILEMARKER2) {
181
# compare awstats data entry against urlalias entry
182
# only if we don't just want to write current items
183
# to the file (i.e. overwrite)
184
if($overwritedata == 0) {
185
foreach my $key (@archivedKeys) {
186
# NOTE(review): what happens on a match (URL already has an alias) is not
# visible in this extract.
if($field[0] eq $key) {
191
# it's a new URL, so add to list of items to retrieve
193
@addToAliasFile[$addToAliasFileCount] = $field[0];
194
$addToAliasFileCount++;
195
#print "new: " . $field[0] . "\n"
199
# no comparison, so everything is 'new'
200
@addToAliasFile[$addToAliasFileCount] = $field[0];
201
$addToAliasFileCount++;
206
# Re-split the current line so the loop condition sees its first field.
@field=split(/\s+/,$_);
213
# Report how many URLs were collected for lookup.
# NOTE(review): the rest of the "nothing to do" branch (closing brace, any
# exit) is not visible in this extract.
if($addToAliasFileCount == 0) {
214
print "Found no new documents.\n\n" ;
218
print "Found " . $addToAliasFileCount . " new documents with no alias.\n";
222
print "Looking thoose pages on web site '$hostname' to get alias...\n";
224
# Create a user agent (browser) object
# NOTE(review): no `use LWP::UserAgent` line is visible in this extract.
225
my $ua = new LWP::UserAgent;
226
# set user agent name
227
$ua->agent($userAgent);
228
# set user agents owners e-mail address
229
$ua->from($spiderOwner);
230
# set timeout for requests
# NOTE(review): $getTimeOut has no visible declaration in this extract.
231
$ua->timeout($getTimeOut);
233
# set proxy for access to external sites
# NOTE(review): the only visible definition of $proxyServer is commented out.
234
$ua->proxy(["http","https"],$proxyServer);
235
# avoid proxy for these hosts
236
$ua->no_proxy(@hostsNoProxy);
238
# set maximum size of document to retrieve (in bytes)
239
$ua->max_size($maxDocSizeBytes);
# Warn when https was requested but the user agent reports no https support.
# NOTE(review): what follows the message (e.g. an exit) is not visible here.
240
if(!($ua->is_protocol_supported('https')) && $useHTTPS) {
241
print "SSL is not supported on this machine.\n\n";
247
# Now lets build the contents to write (or append) to urlalias file
# One lookup per collected URL, pausing $throttleRequestsTime seconds before
# each; the resulting lines are accumulated in $fileOutput.
# NOTE(review): $fileOutput has no visible declaration in this extract.
248
foreach my $newAlias (@addToAliasFile) {
249
sleep $throttleRequestsTime;
250
my $newAliasEntry = &Generate_Alias_List_Entry($newAlias);
251
$fileOutput .= $newAliasEntry . "\n";
254
# write the data back to urlalias file
255
if (! $overwritedata) {
257
# Append mode: add the accumulated lines to the existing alias file.
# NOTE(review): the die messages below interpolate $_ rather than $! (the
# system error text) -- confirm that is intended.
open(FILE,">>$urlAliasFile") || die "Error: Failed to open file for writing: $_\n\n";
258
print FILE $fileOutput;
262
# Overwrite mode: truncate the alias file and write one line per URL.
# NOTE(review): as far as this extract shows, the loop above has already
# fetched every page, and this loop calls Generate_Alias_List_Entry again for
# each URL instead of writing $fileOutput -- confirm against the full file.
open(FILE,">$urlAliasFile") || die "Error: Failed to open file for writing: $_\n\n";
263
foreach my $newAlias (@addToAliasFile) {
264
my $newAliasEntry = &Generate_Alias_List_Entry($newAlias);
265
print FILE "$newAliasEntry\n";
269
print "File $urlAliasFile created/updated.\n";
273
#--------------------------- End of Main -----------------------------
277
# Generate new lines for urlalias file by doing a http get using data
#
# Generate_Alias_List_Entry($urltoget)
#   $urltoget - URL path (first positional argument); it is appended to the
#               scheme and $hostname to form the address to fetch.
#   Returns   - $AliasLine followed by the page title, or by "Unknown Title"
#               when the request fails or the title is empty.
#   Reads the file-level variables $hostname and $ua, and prints progress and
#   failure messages to standard output.
280
sub Generate_Alias_List_Entry {
282
# take in the path & document
283
my $urltoget = shift;
285
my $urlPrefix = "http://";
288
# NOTE(review): as shown, this assignment is unconditional; the test that
# limits it to -secure runs (presumably on $useHTTPS) is not visible here.
$urlPrefix = "https://";
293
# NOTE(review): no separator between the URL and the title is visible here,
# although alias files are read back by splitting on a tab -- confirm. The
# declarations of $AliasLine and $pageTitle are not visible either.
$AliasLine = $urltoget;
296
# build a full HTTP request to pass to user agent
297
my $fullurltoget = $urlPrefix . $hostname . $urltoget;
299
# Create a HTTP request
300
print "Getting page $fullurltoget\n";
302
my $req = new HTTP::Request GET => $fullurltoget;
304
# Pass request to the user agent and get a response back
305
my $res = $ua->request($req);
307
# Parse returned document for page title
308
if ($res->is_success()) {
309
$pageTitle = $res->title;
311
# NOTE(review): the two lines below belong to the failure case; the `else`
# that separates them from the success case is not visible in this extract.
print "Failed to get page: ".$res->status_line."\n";
312
$pageTitle = "Unknown Title";
314
# A successful response with an empty title also gets the placeholder.
if ($pageTitle eq "") {
315
$pageTitle = "Unknown Title";
317
return $AliasLine . $pageTitle;