1
# AWSTATS SEARCH ENGINES DATABASE
2
#------------------------------------------------------------------------------
3
# If you want to add a Search Engine to extend AWStats database detection capabilities,
4
# you must add an entry in SearchEnginesSearchIDOrder, SearchEnginesHashID and in
5
# SearchEnginesHashLib.
6
# An entry if known in SearchEnginesKnownUrl is also welcome.
7
#------------------------------------------------------------------------------
8
# $Revision: 1.28 $ - $Author: eldy $ - $Date: 2003/12/06 00:33:54 $
14
# SearchEnginesSearchIDOrder
15
# It contains all matching criteria to search for in log fields. This list is
16
# used to know in which order to search Search Engines IDs.
17
# The most frequent ones are in list1, used when LevelForSearchEnginesDetection is 1 or more
18
# Minor search engines are in list2, used when LevelForSearchEnginesDetection is 2 or more
19
# Note: Regex IDs are in lower case and ' ' and '+' are changed into '_'
20
#------------------------------------------------------------------------------
21
@SearchEnginesSearchIDOrder_list1=(
22
# Major international search engines
24
'google\.','216\.239\.(35\.101|37\.101|39\.100|39\.101|51\.100|51\.101|35\.100)',
38
'search\.sli\.sympatico\.ca',
42
@SearchEnginesSearchIDOrder_list2=(
43
# Minor international search engines
57
'overture\.com', # Replace 'goto\.com','Goto.com',
66
'search\.earthlink\.net',
68
# Minor brazilian search engines
69
'engine\.exe', 'miner\.bol\.com\.br',
70
# Minor chinese search engines
71
'baidu\.com','search\.sina\.com','search\.sohu\.com',
72
# Minor czech search engines
73
'atlas\.cz','seznam\.cz','quick\.cz','centrum\.cz','najdi\.to','redbox\.cz',
74
# Minor danish search-engines
75
'opasia\.dk', 'danielsen\.com', 'sol\.dk', 'jubii\.dk', 'find\.dk', 'edderkoppen\.dk', 'netstjernen\.dk', 'orbis\.dk', 'tyfon\.dk', '1klik\.dk', 'ofir\.dk',
76
# Minor dutch search engines
78
# Minor english search engines
79
'(^|\.)ask\.co\.uk','bbc\.co\.uk/cgi-bin/search','ifind\.freeserve','looksmart\.co\.uk','mirago\.','splut\.','spotjockey\.','ukdirectory\.','ukindex\.co\.uk','ukplus\.','searchy\.co\.uk',
80
# Minor finnish search engines
82
# Minor french search engines
83
'recherche\.aol\.fr','ctrouve\.','francite\.','\.lbb\.org','rechercher\.libertysurf\.fr', 'search[\w\-]+\.free\.fr', 'recherche\.club-internet\.fr',
84
# Minor german search engines
86
'fireball\.de','infoseek\.de','suche\d?\.web\.de','[a-z]serv\.rrzn\.uni-hannover\.de',
87
'suchen\.abacho\.de','brisbane\.t-online\.de','allesklar\.de','meinestadt\.de',
89
'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)',
90
# Minor hungarian search engines
91
'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu',
92
# Minor italian search engines
94
# Minor Norwegian search engines
96
# Minor polish search engines
98
# Minor russian search engines
99
'ya(ndex)?\.ru', 'aport\.ru', 'rambler\.ru', 'turtle\.ru', 'metabot\.ru',
100
# Minor swedish search engines
101
'evreka\.passagen\.se',
102
# Minor swiss search engines
103
'search\.ch', 'search\.bluewin\.ch'
105
@SearchEnginesSearchIDOrder_listgen=(
106
# Generic search engines
111
# NotSearchEnginesKeys
112
# If a search engine key is found, we check its exclude list to know if it's
113
# really a search engine
114
#------------------------------------------------------------------------------
115
%NotSearchEnginesKeys=(
116
'msn\.'=>'hotmail\.msn\.',
117
'yahoo\.'=>'mail\.yahoo\.'
121
# SearchEnginesHashID
122
# Each Search Engine Search ID is associated to an AWStats id string
123
#------------------------------------------------------------------------------
124
%SearchEnginesHashID = (
125
# Major international search engines
126
'images\.google\.','google_image',
127
'google\.','google','216\.239\.(35\.101|37\.101|39\.100|39\.101|51\.100|51\.101|35\.100)','google',
132
'alexa\.com','alexa',
133
'alltheweb\.com','alltheweb',
134
'altavista\.','altavista',
136
'netscape\.','netscape',
137
'search\.terra\.','terra',
138
'www\.search\.com','search.com',
139
'tiscali\.','tiscali',
140
'search\.aol\.co','aol',
141
'search\.sli\.sympatico\.ca','sympatico',
143
# Minor international search engines
144
'northernlight\.','northernlight',
147
'webcrawler\.','webcrawler',
148
'metacrawler\.','metacrawler',
149
'go2net\.com','go2net',
150
'(^|\.)go\.com','go',
151
'euroseek\.','euroseek',
152
'looksmart\.','looksmart',
154
'nbci\.com/search','nbci',
155
'(^|\.)ask\.com','ask',
157
'overture\.com','overture', # Replace 'goto\.com','Goto.com',
159
'findarticles\.com','findarticles',
160
'infospace\.com','infospace',
162
'dejanews\.','dejanews',
163
'dogpile\.com','dogpile',
164
'wisenut\.com','wisenut',
165
'ixquick\.com','ixquick',
166
'search\.earthlink\.net','earthlink',
168
# Minor brazilian search engines
169
'engine\.exe','engine',
170
'miner\.bol\.com\.br','miner',
171
# Minor chinese search engines
172
'baidu\.com','baidu',
173
'search\.sina\.com','sina',
174
'search\.sohu\.com','sohu',
175
# Minor czech search engines
177
'seznam\.cz','seznam',
179
'centrum\.cz','centrum',
181
'redbox\.cz','redbox',
182
# Minor danish search-engines
183
'opasia\.dk','opasia',
184
'danielsen\.com','danielsen',
188
'edderkoppen\.dk','edderkoppen',
189
'netstjernen\.dk','netstjernen',
194
# Minor dutch search engines
197
# Minor english search engines
198
'(^|\.)ask\.co\.uk','askuk',
199
'bbc\.co\.uk/cgi-bin/search','bbc',
200
'ifind\.freeserve','freeserve',
201
'looksmart\.co\.uk','looksmartuk',
204
'spotjockey\.','spotjockey',
205
'ukdirectory\.','ukdirectory',
206
'ukindex\.co\.uk','ukindex',
208
'searchy\.co\.uk','searchy',
209
# Minor finnish search engines
210
'haku\.www\.fi','haku',
211
# Minor french search engines
212
'recherche\.aol\.fr','aolfr',
213
'ctrouve\.','ctrouve',
214
'francite\.','francite',
216
'rechercher\.libertysurf\.fr','libertysurf',
217
'search[\w\-]+\.free\.fr','free',
218
'recherche\.club-internet\.fr','clubinternet',
219
# Minor german search engines
220
'sucheaol\.aol\.de','aolde',
221
'fireball\.de','fireball',
222
'infoseek\.de','infoseek',
223
'suche\d?\.web\.de','webde',
224
'[a-z]serv\.rrzn\.uni-hannover\.de','meta',
225
'suchen\.abacho\.de','abacho',
226
'brisbane\.t-online\.de','t-online',
227
'allesklar\.de','allesklar',
228
'meinestadt\.de','meinestadt',
229
'212\.227\.33\.241','metaspinner',
230
'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)','metacrawler_de',
231
# Minor hungarian search engines
232
'heureka\.hu','heureka',
233
'vizsla\.origo\.hu','origo',
234
'lapkereso\.hu','lapkereso',
235
'goliat\.hu','goliat',
236
'index\.hu','indexhu',
238
'webmania\.hu','webmania',
239
'search\.internetto\.hu','internetto',
240
# Minor italian search engines
241
'virgilio\.it','virgilio',
242
# Minor Norwegian search engines
243
'sok\.start\.no','start',
244
# Minor polish search engines
245
'szukaj\.wp\.pl','wp',
246
# Minor russian search engines
247
'ya(ndex)?\.ru','yandex',
249
'rambler\.ru','rambler',
250
'turtle\.ru','turtle',
251
'metabot\.ru','metabot',
252
# Minor swedish search engines
253
'evreka\.passagen\.se','passagen',
254
# Minor swiss search engines
255
'search\.ch','searchch',
256
'search\.bluewin\.ch','bluewin',
257
# Generic search engines
258
'search\..*\.\w+','search'
262
# SearchEnginesKnownUrl
263
# Known rules to extract keywords from a referrer search engine URL
264
#------------------------------------------------------------------------------
265
%SearchEnginesKnownUrl=(
266
# Most common search engines
268
'alltheweb','q(|uery)=',
272
'google_image','(p|q)=',
275
'netscape','search=',
281
'sympatico', 'query=',
283
# Minor international search engines
288
'findarticles','key=',
295
'metacrawler','general=',
297
'northernlight','qr=',
298
'overture','keywords=',
299
'dogpile', 'q(|kw)=',
303
'webcrawler','searchText=',
307
'iune','(keywords|q)=',
308
# Minor brazilian search engines
309
'engine','p1=', 'miner','q=',
310
# Minor chinese search engines
311
'baidu','word=', 'sina', 'word=', 'sohu','word=',
312
# Minor czech search engines
313
'atlas','searchtext=', 'seznam','w=', 'quick','query=', 'centrum','q=', 'najdi','dotaz=', 'redbox','srch=',
314
# Minor danish search engines
315
'opasia','q=', 'danielsen','q=', 'sol','q=', 'jubii','soegeord=', 'finddk','words=', 'edderkoppen','query=', 'orbis','search_field=', '1klik','query=', 'ofir','querytext=',
316
# Minor dutch search engines
317
'ilse','search_for=', 'vindex','in=',
318
# Minor english search engines
319
'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmart','key=',
320
'mirago','txtsearch=', 'splut','pattern=', 'spotjockey','Search_Keyword=', 'ukindex', 'stext=', 'ukdirectory','k=', 'ukplus','search=', 'searchy', 'search_term=',
321
# Minor finnish search engines
323
# Minor french search engines
324
'francite','name=', 'clubinternet', 'q=',
325
# Minor german search engines
327
'fireball','q=', 'infoseek','qt=', 'webde','su=',
328
'abacho','q=', 't-online','q=',
329
'metaspinner','qry=',
330
'metacrawler_de','qry=',
331
# Minor hungarian search engines
332
'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=',
333
# Minor Norwegian search engines
335
# Minor polish search engines
337
# Minor russian search engines
338
'yandex', 'text=', 'rambler','words=', 'aport', 'r=', 'metabot', 'st=',
339
# Minor swedish search engines
341
# Minor swiss search engines
342
'searchch', 'q=', 'bluewin', 'qry='
345
# SearchEnginesKnownUrlNotFound
346
# Known rules to extract not found keywords from a referrer search engine URL
347
#------------------------------------------------------------------------------
348
%SearchEnginesKnownUrlNotFound=(
349
# Most common search engines
353
# If no rules are known, WordsToExtractSearchUrl will be used to search keyword parameter
354
# If no rules are known and search in WordsToExtractSearchUrl failed, this will be used to clean URL of not keyword parameters.
355
#------------------------------------------------------------------------------
356
# Parameter names used to look for the keyword parameter of a search URL
# when no engine-specific rule is known.
@WordsToExtractSearchUrl = qw(
    ask= claus= general= key= kw= keyword= keywords= MT= p= q=
    qr= qt= query= s= search= searchText= string= su= txtsearch= w=
);
357
# Parameter names used to clean a search URL of its non-keyword parameters
# when no rule is known and the lookup through WordsToExtractSearchUrl failed.
# The '\\.x=' and '\\.y=' entries are kept single-quoted on purpose: they
# evaluate to the strings \.x= and \.y= (one literal backslash before the dot).
@WordsToCleanSearchUrl = (
    qw(
        act= annuaire= btng= cat= categoria= cfg= cof= cou= count= cp=
        dd= domain= dt= dw= enc= exec= geo= hc= height= hits=
        hl= hq= hs= id= kl= lang= loc= lr= matchmode= medor=
        message= meta= mode= order= page= par= pays= pg= pos= prg=
        qc= refer= sa= safe= sc= sort= src= start= style= stype=
        sum= tag= temp= theme= type= url= user= width= what=
    ),
    '\\.x=', '\\.y=',
    qw( y= look= ),
);
359
# SearchEnginesKnownUTFCoding
360
# Known parameter that proves a search engine has encoded its parameters in UTF-8
361
#------------------------------------------------------------------------------
362
%SearchEnginesKnownUTFCoding=(
363
# Most common search engines
365
'alltheweb','cs=utf-8'
369
# SearchEnginesHashLib
370
# List of search engines names
371
# 'search_engine_id', 'search_engine_name',
372
#------------------------------------------------------------------------------
373
%SearchEnginesHashLib=(
374
# Major international search engines
376
'alltheweb','AllTheWeb',
377
'altavista','AltaVista',
380
'google_image','Google (Images)',
383
'netscape','Netscape',
388
'search.com','Search.com',
390
'sympatico', 'Sympatico',
392
# Minor international search engines
396
'dejanews','DejaNews',
397
'euroseek','Euroseek',
398
'findarticles','Find Articles',
399
'go2net','Go2Net (Metamoteur)',
401
'infospace','InfoSpace',
403
'looksmart','Looksmart',
405
'metacrawler','MetaCrawler (Metamoteur)',
407
'northernlight','NorthernLight',
408
'overture','Overture', # Replace 'goto\.com','Goto.com',
411
'teoma','Teoma', # Replace 'directhit\.com','DirectHit',
412
'webcrawler','WebCrawler',
414
'ixquick', 'ix quick',
415
'earthlink', 'Earth Link',
417
# Minor brazilian search engines
418
'engine','Cade', 'miner','Meta Miner',
419
# Minor chinese search engines
420
'baidu','Baidu', 'sina','Sina', 'sohu','Sohu',
421
# Minor czech search engines
422
'atlas','Atlas.cz', 'seznam','Seznam', 'quick','Quick.cz', 'centrum','Centrum.cz','najdi','Najdi.to','redbox','RedBox.cz',
423
# Minor danish search-engines
424
'opasia','Opasia', 'danielsen','Thor (danielsen.com)', 'sol','SOL', 'jubii','Jubii', 'finddk','Find', 'edderkoppen','Edderkoppen', 'netstjernen','Netstjernen', 'orbis','Orbis', 'tyfon','Tyfon', '1klik','1Klik', 'ofir','Ofir',
425
# Minor dutch search engines
426
'ilse','Ilse','vindex','Vindex\.nl',
427
# Minor english search engines
428
'askuk','Ask Jeeves UK', 'bbc','BBC', 'freeserve','Freeserve', 'looksmartuk','Looksmart UK',
429
'mirago','Mirago', 'splut','Splut', 'spotjockey','Spotjockey', 'ukdirectory','UK Directory', 'ukindex','UKIndex', 'ukplus','UK Plus', 'searchy','searchy.co.uk',
430
# Minor finnish search engines
432
# Minor french search engines
433
'aolfr','AOL (fr)', 'ctrouve','C\'est trouv�', 'francite','Francit�', 'lbb', 'LBB', 'libertysurf', 'Libertysurf', 'free', 'Free.fr', 'clubinternet', 'Club-internet',
434
# Minor german search engines
436
'fireball','Fireball', 'infoseek','Infoseek', 'webde','Web.de',
437
'abacho','Abacho', 't-online','T-Online',
438
'allesklar','allesklar.de', 'meinestadt','meinestadt.de',
439
'metaspinner','metaspinner',
440
'metacrawler_de','metacrawler.de',
441
# Minor hungarian search engines
442
'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkeres�', 'goliat','G�li�t', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Keres�',
443
# Minor italian search engines
444
'virgilio','Virgilio',
445
# Minor Norwegian search engines
447
# Minor polish search engines
449
# Minor russian search engines
450
'yandex', 'Yandex', 'aport', 'Aport', 'rambler', 'Rambler', 'turtle', 'Turtle', 'metabot', 'MetaBot',
451
# Minor swedish search engines
453
# Minor Swiss search engines
454
'searchch', 'search.ch', 'bluewin', 'search.bluewin.ch',
455
# Generic search engines
456
'search','Unknown search engines'
461
# Enable this code and run perl search_engines.pm to check file entries are ok
462
#-----------------------------------------------------------------------------
463
#foreach my $key (@SearchEnginesSearchIDOrder_list1) {
464
# if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list1 with no value in SearchEnginesHashID");
465
# foreach my $key2 (@SearchEnginesSearchIDOrder_list2) { if ($key2 eq $key) { error("$key is in 1 and 2\n"); } }
466
# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 1 and gen\n"); } }
468
#foreach my $key (@SearchEnginesSearchIDOrder_list2) {
469
# if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list2 with no value in SearchEnginesHashID");
470
# foreach my $key2 (@SearchEnginesSearchIDOrder_list1) { if ($key2 eq $key) { error("$key is in 2 and 1\n"); } }
471
# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 2 and gen\n"); } }
473
#foreach my $key (@SearchEnginesSearchIDOrder_listgen) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_listgen with no value in SearchEnginesHashID"); } }
474
#foreach my $key (keys %NotSearchEnginesKeys) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in NotSearchEnginesKeys with no value in SearchEnginesHashID"); } }
475
#foreach my $key (keys %SearchEnginesKnownUrl) {
477
# foreach my $key2 (values %SearchEnginesHashID) {
478
# if ($key eq $key2) { $found=1; last; }
480
# if (! $found) { die "Entry '$key' has been found in SearchEnginesKnownUrl with no value in SearchEnginesHashID"; }
482
#foreach my $key (keys %SearchEnginesHashLib) {
484
# foreach my $key2 (values %SearchEnginesHashID) {
485
# if ($key eq $key2) { $found=1; last; }
487
# if (! $found) { die "Entry '$key' has been found in SearchEnginesHashLib with no value in SearchEnginesHashID"; }
489
#print @SearchEnginesSearchIDOrder_list1." ".@SearchEnginesSearchIDOrder_list2." ".@SearchEnginesSearchIDOrder_listgen;