~tsep-dev/tsep/0.9-beta


Viewing changes to branches/symfony/app/vendors/robots_txt.php

  • Committer: geoffreyfishing
  • Date: 2011-01-11 23:46:12 UTC
  • Revision ID: svn-v4:ae0de26e-ed09-4cbe-9a20-e40b4c60ac6c::125
Created a symfony branch for future migration to symfony

 
<?php

    /* Robots.txt interpreter. Version 1.2

        Change Log

        1.2 * Bug fix for dumb author(!)
        1.1 * Bug fix for domain names that have non-word and/or non-digit characters
            * Added doxygen documentation
        1.0 Original Release

        Copyright (c) Andy Pieters <Pieters.Andy@gmail.com>

        This software is released under the terms of the GPL v3, as found on http://www.gnu.org/licenses/gpl-3.0.txt

        Abstract

        The robots exclusion standard is considered proper netiquette, so any script that exhibits
        crawling-like behaviour is expected to abide by it.

        The intended use of this class is to feed it a url before you visit it. The class
        automatically attempts to read the robots.txt file and returns a boolean value that indicates
        whether you are allowed to visit the url.

        Crawl-delays and request-rates are capped at a maximum of 60 seconds.

        The class will block until the detected crawl-delay (or request-rate) allows visiting the url.

        For instance, if Crawl-delay is set to 3, the Robots_txt::urlAllowed() method will block for 3
        seconds when called a second time. An internal clock keeps the last visit time, so if
        the delay has already expired, the method will not block.

        Example usage

        foreach($arrUrlsToVisit as $strUrlToVisit) {

            if(Robots_txt::urlAllowed($strUrlToVisit,$strUserAgent)) {

                #visit url, do processing. . .
            }
        }

        The simple example above will ensure you abide by the wishes of the site owners.

        Note: an unofficial, non-standard extension exists that limits the times of day at which
              crawlers are allowed to visit a site. I chose to ignore this extension because
              I feel it is unreasonable.

        Note: you are only *required* to specify your userAgent the first time you call the
              urlAllowed method, and only the first value is ever used.

        For the real geeks out there, note that the way I set it up, use of a public method is
        required, yet an instance can only be created inside the class, and no instance is ever
        returned to the outside. So, is it still a public method? */
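
    /* Illustrative sketch (hypothetical host and agent name): timing two consecutive calls
       against a host whose robots.txt declares "Crawl-delay: 3".

        $intStart=time();
        Robots_txt::urlAllowed('http://www.example.com/a.html','MyBot/1.0');
        Robots_txt::urlAllowed('http://www.example.com/b.html');
        #the second call blocks until the delay expires, so (time()-$intStart) is expected to be >=3
    */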
 
    /** @brief Robots_txt class
        @author Andy Pieters @a Pieters.Andy@gmail.com

        The intended use of this class is to feed it a url before you visit it. The class
        automatically attempts to read the robots.txt file and returns a boolean value that
        indicates whether you are allowed to visit the url. */
    class Robots_txt {

        /** @brief The useragent name to use when evaluating robots.txt files */
        protected $strUserAgent;

        /** @brief The scheme (http,https) of the host to crawl */
        protected $strScheme=null;

        /** @brief Internal array to cache some rules. The '-' index is for disallows, the '+' index is for allows */
        protected $arrRules=array('-'=>array(), '+'=>array());

        /** @brief Cached crawl delay */
        protected $intDelay=0;

        /** @brief Internal variable to log last visit date/time */
        protected $intLastVisit=null;

        /** @brief Cached hostname to crawl */
        protected $strHost=null;

        /** @brief Internal variable to cache instances of this class */
        protected static $arrInstances=array();

        /** @brief Internal variable to store useragent name */
        protected static $strReportedUserAgent=null;

        /** @brief Class constructor, can only be called from inside the class (or its children)
            @param $strScheme the protocol the host is on (http,https)
            @param $strHost the host whose robots.txt will be checked
            @param $strUserAgent the useragent name */
        protected function __construct($strScheme,$strHost,$strUserAgent) {

            #the factory getInstance() passes scheme and host separately, so the constructor
            #takes them directly instead of parsing a url
            $this->init($strScheme,$strHost,$strUserAgent);
        }

        /** @brief Initializes this instance, retrieves the robots.txt file and parses it
            @param $strScheme the protocol the host is on (http,https)
            @param $strHost the host whose robots.txt will be checked
            @param $strUserAgent the useragent name
            @throws Exception in case empty parameters are passed */
        protected function init($strScheme,$strHost,$strUserAgent) {

            if( (strlen(($this->strUserAgent=$strUserAgent))) && (strlen(($this->strScheme=$strScheme))) && (strlen(($this->strHost=$strHost)))) {

                #a missing or unreadable robots.txt yields an empty string, which parseFile
                #treats as "everything allowed"
                $this->parseFile($strUserAgent,(string) @file_get_contents("$strScheme://$strHost/robots.txt"));

            } else {

                throw new Exception('Syntax Error');
            }
        }
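
        #Example (illustrative host and agent name): init('http','www.example.com','MyBot/1.0')
        #fetches http://www.example.com/robots.txt and feeds its contents to parseFile()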
 
        /** @brief Parses a robots.txt file
            @param $strUserAgent The useragent name to use when looking for matches
            @param $strRobotsFile The contents of a robots.txt file */
        protected function parseFile($strUserAgent,$strRobotsFile) {

            if(strlen($strRobotsFile)) {

                #convert end of line markers. Expected: CR, CR/LF, or LF
                #all CR/LF pairs are converted to LF, then all remaining CR to LF, so the
                #output always has LF line endings

                $strRobotsFile=str_replace(array("\r\n","\r"),"\n",$strRobotsFile);

                if((($intCount=count((($arrRules=explode("\n",$strRobotsFile))))))) {

                    $blUserAgentMatched=$blReadAgent=false;

                    for($intCounter=0; $intCounter<$intCount; $intCounter++) {

                        $strLine=trim($arrRules[$intCounter]);

                        #comment lines are discarded; per the exclusion standard they do not
                        #terminate a record, so they must not reset the matched agent below

                        if(preg_match('/^#/',$strLine)) {

                            continue;
                        }

                        if(strlen($strLine)) {

                            #the strpos function may return 0, but if the : is the first character the line is unusable anyway

                            if(strpos($strLine,':')) {

                                #limit the split to 2 parts so values that themselves contain ':' survive intact

                                $arrNameValuePair=explode(':',$strLine,2);

                                $strCommand=trim(strtolower($arrNameValuePair[0]));

                                $strArgument=trim($arrNameValuePair[1]);
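
                                #Example (illustrative): for the line "Sitemap: http://www.example.com/sitemap.xml"
                                #the limit-2 split keeps the full url in $strArgument instead of truncating it at 'http'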
 
                                switch($strCommand) {

                                    case 'user-agent': {

                                        /** ``The value of this field is the name of the robot the record is describing access policy for.
                                              If more than one User-agent field is present the record describes an identical access policy
                                              for more than one robot. At least one field needs to be present per record.

                                              The robot should be liberal in interpreting this field. A case insensitive substring match of
                                              the name without version information is recommended.

                                              If the value is '*', the record describes the default access policy for any robot that has not
                                              matched any of the other records. It is not allowed to have multiple such records in the "/robots.txt" file.
                                        */

                                        #strip the non-standard trailing * from User-agent values such as Mediapartners-Google*;
                                        #a substring match is performed anyway, so the wildcard is redundant. The captured
                                        #word character is kept so the name itself is not corrupted

                                        $strArgument=preg_replace('#(\w)\*#','$1',$strArgument);

                                        #case insensitive substring match it is then; the strlen guard avoids an
                                        #empty needle, whose behaviour differs across PHP versions

                                        if( (!$blReadAgent) && (!$blUserAgentMatched) && (($strArgument=='*') || ((strlen($strArgument)) && (stripos($strUserAgent,$strArgument)!==false)))) {

                                            $blUserAgentMatched=true;
                                        }

                                        break;
                                    }
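
                                    #Example (illustrative agent name): with $strUserAgent='MyBot/1.0', the record
                                    #"User-agent: mybot" matches via the case-insensitive substring test, while
                                    #"User-agent: *" only matches agents that no earlier record has claimed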
 
                                    case 'disallow': {

                                        if($blUserAgentMatched && ($strArgument)) {

                                            /** @NOTE Although using * to mean "all pages" is not a stable standard
                                                      extension, support for it is added here */

                                            $this->arrRules['-'][]=($strArgument=='*'?'/':$strArgument);
                                        }

                                        break;
                                    }

                                    /* non-standard extension */

                                    case 'allow': {

                                        if($blUserAgentMatched && ($strArgument)) {

                                            /** @NOTE The same * convention is honoured here, meaning "allow all pages" */

                                            $this->arrRules['+'][]=($strArgument=='*'?'/':$strArgument);
                                        }

                                        break;
                                    }
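
                                    #Example (illustrative): "Disallow: *" and "Disallow: /" both add '/' to the '-'
                                    #rule set, blocking every path for the matched agent unless an Allow rule
                                    #overrides a specific url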
 
                                    /* non-standard extension */
                                    case 'request-rate': {

                                        #a request-rate of "x/y" means x requests per y seconds, so the
                                        #per-request delay is y/x seconds, rounded up to a whole second

                                        if(preg_match('#^(\d+)\s*/\s*(\d+)$#',$strArgument,$arrMatches)) {

                                            #guard the divisor (the request count), not the dividend, against zero

                                            if((int) $arrMatches[1]) {

                                                $fltDelay=abs((int) $arrMatches[2]/(int) $arrMatches[1]);

                                                if((int) $fltDelay!=$fltDelay) {

                                                    $fltDelay=((int) $fltDelay)+1;
                                                }

                                                #hand the computed delay to the crawl-delay logic below; without
                                                #this the fall-through would re-read the raw "x/y" value

                                                $strArgument=(string) (int) $fltDelay;
                                            }
                                        }
                                    }

                                    #fall through to crawl-delay

                                    case 'crawl-delay': {

                                        if($blUserAgentMatched && ((int) $strArgument)) {

                                            #a delay of more than a minute is in my humble opinion unreasonable,
                                            #so anything above 60 seconds is truncated

                                            $intDelay=abs((int) $strArgument);

                                            $intDelay=($intDelay>59?60:$intDelay);

                                            $this->intDelay=$intDelay;
                                        }
                                    }

                                    break;
                                }
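
                                #Worked example (illustrative): "Request-rate: 1/5" is 1 request per 5 seconds,
                                #giving a 5 second delay; "Request-rate: 3/10" gives 10/3 = 3.33, rounded up to 4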
 
                            }

                        } else {

                            #a blank line ends the current record, so a matched agent section is closed here

                            if($blUserAgentMatched) {

                                $blReadAgent=true;
                            }

                            $blUserAgentMatched=false;
                        }
                    }
                }
            }
        }
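
        #Example (illustrative): given a robots.txt containing
        #
        #   User-agent: *
        #   Disallow: /cgi-bin/
        #   Crawl-delay: 10
        #
        #parseFile() leaves arrRules['-']=array('/cgi-bin/') and intDelay=10 for any
        #agent that no earlier record matched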
 
        /** @brief Internal backend for the static member urlAllowed
            @param $strUrl The url to check
            @returns boolean */
        public function __urlAllowed($strUrl) {

            $blOut=false;

            #check if we are allowed to crawl this url. CASE MATTERS

            $blMatched=false;

            # Order: Deny,Allow (Deny has priority over Allow)
            if(count($this->arrRules['-'])) {

                foreach($this->arrRules['-'] as $strRule) {

                    #to exclude, check whether the url starts with the rule

                    if($strRule=='/' || (strpos($strUrl,$strRule)===0)) {

                        $blMatched=true;

                        break;
                    }
                }
            }

            if(($blMatched) && (count($this->arrRules['+']))) {

                foreach($this->arrRules['+'] as $strRule) {

                    #to override an exclude, an exact match is required in the include

                    if($strRule=='/' || ($strUrl==$strRule)) {

                        $blMatched=$blOut=true;

                        break;
                    }
                }
            }

            $blOut=(!$blMatched?true:$blOut);

            $intExpire=$this->intLastVisit+$this->intDelay;

            if($blOut) {

                #blocking is only necessary if we ARE allowed to visit

                while($intExpire>time()) {

                    usleep(100000);
                }

                $this->intLastVisit=time();
            }

            return $blOut;
        }
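
        #Example (illustrative): with Disallow:/private and Allow:/private/public.html cached,
        #'/private/index.html' is denied via the prefix match on the Disallow rule, while
        #'/private/public.html' is allowed because the exact Allow match overrides it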
 
        /** @brief Check if the scheme (protocol) is supported
            @param $strUrl The url to check
            @param $arrResult The url split in its components (scheme,host,url); passed by reference
            @returns boolean */
        public static function isSupportedScheme($strUrl,array &$arrResult=null) {

            $blOut=false;

            if(strlen(($strUrl=trim($strUrl)))) {

                # Version 1.1: added -_ as supported characters in domain name, and added 'u' (unicode) pattern modifier
                # Only the scheme and host are lowercased; the path keeps its case because rule matching is case-sensitive
                if(preg_match('#^(https?)://([\w\d\-_]+(\.[\w\d\-_]+)+)(/*.*)$#iu',$strUrl,$arrMatches)) {

                    $strUrl=trim($arrMatches[4]);

                    $strUrl=($strUrl==''?'/':$strUrl);

                    $arrResult=array('scheme'=>strtolower($arrMatches[1]),
                                     'host'=>strtolower($arrMatches[2]),
                                     'url'=>$strUrl);

                    $blOut=true;
                }
            }

            return $blOut;
        }
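
        #Example (illustrative): isSupportedScheme('HTTP://Example.COM/Private/Page.html',$arrParts)
        #returns true with $arrParts=array('scheme'=>'http','host'=>'example.com','url'=>'/Private/Page.html')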
 
        /** @brief Class factory; returns a cached instance per scheme+host, creating one if needed
            @param $strScheme the protocol the host is on (http,https)
            @param $strHost the host whose robots.txt will be checked
            @param $strUserAgent the useragent name
            @returns instance of Robots_txt class */
        protected static function getInstance($strScheme,$strHost,$strUserAgent) {

            $objOut=null;

            if((($intCount=count(self::$arrInstances)))) {

                for($intCounter=0; $intCounter<$intCount; $intCounter++) {

                    if(self::$arrInstances[$intCounter]->strHost==$strHost && self::$arrInstances[$intCounter]->strScheme==$strScheme) {

                        $objOut=self::$arrInstances[$intCounter];

                        break;
                    }
                }
            }

            if(!$objOut) {

                $objOut=new Robots_txt($strScheme,$strHost,$strUserAgent);

                self::$arrInstances[]=$objOut;
            }

            return $objOut;
        }
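
        #Example (illustrative): urls on http://www.example.com share one cached instance, so
        #its robots.txt is fetched only once; https://www.example.com gets its own instance
        #because the scheme differs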
 
        /** @brief Checks if the url may be crawled
            @param $strUrl the url to check
            @param $strUserAgent the useragent name
            @returns boolean
            @throws Exception if $strUserAgent is missing on the first call, if an instance cannot be created, or if an invalid url is passed */
        public static final function urlAllowed($strUrl,$strUserAgent=null) {

            $blOut=null;

            //check userAgent; cast before trimming so the null default is handled cleanly

            $strUserAgent=trim((string) $strUserAgent);

            if(is_null(self::$strReportedUserAgent) && (strlen($strUserAgent))) {

                self::$strReportedUserAgent=$strUserAgent;
            }

            if(is_null(self::$strReportedUserAgent)) {

                throw new Exception('strUserAgent is required on first call to Robots_txt::urlAllowed()');
            }

            if(self::isSupportedScheme($strUrl,$arrResult)) {

                if((($objEngine=self::getInstance($arrResult['scheme'],$arrResult['host'],self::$strReportedUserAgent)))) {

                    $blOut=$objEngine->__urlAllowed($arrResult['url']);

                } else {

                    throw new Exception('Cannot get Robots_txt instance.');
                }
            } else {

                throw new Exception('Invalid URL');
            }

            return $blOut;
        }
    }
 
 