/* Robots.txt interpreter. Version 1.2

1.2 * Bug fix for dumb author(!)
1.1 * Bug fix for domain names that have non-word and/or non-digit characters
    * Added doxygen documentation

Copyright (c) Andy Pieters <Pieters.Andy@gmail.com>

This software is released under the terms of the GPL v3, as found on http://www.gnu.org/licenses/gpl-3.0.txt

The robots exclusion standard is considered proper netiquette, so any kind of script that exhibits
crawling-like behavior is expected to abide by it.

The intended use of this class is to feed it a url before you intend to visit it. The class will
automatically attempt to read the robots.txt file and will return a boolean value to indicate if
you are allowed to visit this url.

Crawl-delays and request-rates are capped at a maximum of 60 seconds.

The class will block until the detected crawl-delay (or request-rate) allows visiting the url.
For instance, if Crawl-delay is set to 3, the Robots_txt::urlAllowed() method will block for 3
seconds when called a second time. An internal clock keeps track of the last visit time, so if
the delay has already expired, the method will not block.

foreach($arrUrlsToVisit as $strUrlToVisit) {
    if(Robots_txt::urlAllowed($strUrlToVisit,$strUserAgent)) {
        #visit url, do processing. . .

The simple example above will ensure you abide by the wishes of the site owners.

Note: an unofficial, non-standard extension exists that limits the times at which crawlers
are allowed to visit a site. I choose to ignore this extension because I feel it

Note: you are only *required* to specify your userAgent the first time you call the urlAllowed method, and
only the first value is ever used.
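
A minimal sketch of that rule (the example.org urls and the 'MyBot/1.0' name are placeholder assumptions, not part of the class):

    Robots_txt::urlAllowed('http://www.example.org/index.html','MyBot/1.0'); #first call: the userAgent must be supplied
    Robots_txt::urlAllowed('http://www.example.org/private/page.html');      #later calls: the stored userAgent is reused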

For the real geeks out there: note that, the way I set it up, you must go through a public method, yet only
inside the class can an instance be created, and no instance is ever returned to the outside. So, is it still

/** @brief Robots_txt class
@author Andy Pieters @a Pieters.Andy@gmail.com

The intended use of this class is to feed it a url before you intend to visit it. The class will
automatically attempt to read the robots.txt file and will return a boolean value to indicate if
you are allowed to visit this url. */

/** @brief The useragent name to use when evaluating robots.txt files */
protected $strUserAgent;

/** @brief Internal array to cache some rules. The '-' index is for disallows, the '+' index is for allows */
protected $arrRules=array('-'=>array(), '+'=>array());

/** @brief Cached crawl delay */
protected $intDelay=0;

/** @brief Internal variable to log last visit date/time */
protected $intLastVisit=null;

/** @brief Cached scheme (http or https) of the host to crawl; assigned in init() */
protected $strScheme=null;

/** @brief Cached hostname to crawl */
protected $strHost=null;

/** @brief Internal variable to cache instances of this class */
protected static $arrInstances=array();

/** @brief Internal variable to store useragent name */
protected static $strReportedUserAgent=null;

/** @brief Class constructor, can only be called from inside the class (or its children)
@param $strUrl the url that robots.txt is located on
@param $strUserAgent the useragent name */
protected function __construct($strUrl,$strUserAgent) {

$parsed = parse_url($strUrl);

$this->init($parsed['scheme'],$parsed['host'],$strUserAgent);

/** @brief Initializes this instance, retrieves the robots.txt file and parses it
@param $strScheme the protocol the host is on (http,https)
@param $strHost the host whose robots.txt should be fetched and checked
@param $strUserAgent the useragent name
@throws Exception in case empty parameters are passed */
protected function init($strScheme,$strHost,$strUserAgent) {

if( (strlen(($this->strUserAgent=$strUserAgent))) && (strlen(($this->strScheme=$strScheme))) && (strlen(($this->strHost=$strHost)))) {
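
#note: if the robots.txt file cannot be fetched, file_get_contents() returns false (and emits a warning);
#parseFile() then treats it as an empty file, so no rules are recorded and every url on the host is allowed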
$this->parseFile($strUserAgent,file_get_contents("$strScheme://$strHost/robots.txt"));

throw new Exception('Syntax Error');

/** @brief Parses a robots.txt file
@param $strUserAgent The useragent name to use when looking for matches
@param $strRobotsFile The contents of a robots.txt file */
protected function parseFile($strUserAgent,$strRobotsFile) {
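#For orientation, a small illustrative sample of the kind of file this method parses
#(the values below are made-up examples, not taken from any real site):
#   User-agent: *
#   Disallow: /private
#   Allow: /private/index.html
#   Crawl-delay: 10
#   Request-rate: 1/5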

if(strlen($strRobotsFile)) {

#convert end of line markers. Expected: CR, CR/LF, or LF
#What it does: it converts all CR/LF to LF, and then converts all remaining CR to LF, so the output always has LF as line endings
$strRobotsFile=str_replace(array("\r\n","\r"),"\n",$strRobotsFile);

if((($intCount=count((($arrRules=explode("\n",$strRobotsFile))))))) {

$blUserAgentMatched=$blReadAgent=false;

for($intCounter=0; $intCounter<$intCount; $intCounter++) {

if( (strlen(($strLine=trim($arrRules[$intCounter])))) && (!(preg_match('/^\s*#.*$/',$strLine)))) {

#I know, the strpos function may return 0, but if the : is the first character, I can't use the input anyway
if(strpos($strLine,':')) {

#limit the explode to 2 parts so directive values that themselves contain a ':' are kept intact
$arrNameValuePair=explode(':',$strLine,2);

$strCommand=trim(strtolower($arrNameValuePair[0]));

$strArgument=trim($arrNameValuePair[1]);

switch($strCommand) {

/** ``The value of this field is the name of the robot the record is describing access policy for.
If more than one User-agent field is present the record describes an identical access policy
for more than one robot. At least one field needs to be present per record.

The robot should be liberal in interpreting this field. A case insensitive substring match of
the name without version information is recommended.

If the value is '*', the record describes the default access policy for any robot that has not
matched any of the other records. It is not allowed to have multiple such records in the "/robots.txt" file.

#Strip the non-standard extension of using * inside the User-agent string, like Mediapartners-Google*
#since we are already doing a substring match, it is not needed anyway
$strArgument=preg_replace('#(\w)\*#','$1',$strArgument);

#case insensitive substring match it is then
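#for example (an illustrative pairing): the line 'User-agent: googlebot' matches a crawler that reported
#itself as 'Googlebot/2.1', because 'googlebot' is a case insensitive substring of that reported name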
if( (!$blReadAgent) && (!$blUserAgentMatched) && (($strArgument=='*') || (stripos($strUserAgent,$strArgument)!==false))) {

$blUserAgentMatched=true;

if($blUserAgentMatched && ($strArgument)) {

/** @NOTE Although using * to mean 'all pages' is not a stable standard extension,
we will add support for it here */
$this->arrRules['-'][]=($strArgument=='*'?'/':$strArgument);

/* non-standard extension */
if($blUserAgentMatched && ($strArgument)) {

/** @NOTE Although using * to mean 'all pages' is not a stable standard extension,
we will add support for it here */
$this->arrRules['+'][]=($strArgument=='*'?'/':$strArgument);

/* non-standard extension */
case 'request-rate': {

if(preg_match('#^(\d+)\s*/\s*(\d+)$#',$strArgument,$arrMatches)) {

#guard against a zero numerator or denominator before dividing
if(((int) $arrMatches[1]) && ((int) $arrMatches[2])) {

$fltDelay=abs((int) $arrMatches[2]/(int) $arrMatches[1]);

if((int) $fltDelay!=$fltDelay) {

$fltDelay=((int) $fltDelay)+1;
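#worked example (illustrative): 'Request-rate: 2/9' means 2 documents per 9 seconds, so the delay is
#9/2 = 4.5 seconds, which is not a whole number and is therefore rounded up to a 5 second delay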

#fall through to crawl-delay

case 'crawl-delay': {

if($blUserAgentMatched && ((int) $strArgument)) {

#a delay of more than a minute is in my humble opinion unreasonable
#so anything above 60 seconds is truncated
$intDelay=abs((int) $strArgument);

$intDelay=($intDelay>59?60:$intDelay);

$this->intDelay=$intDelay;
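#worked example (illustrative): 'Crawl-delay: 10' stores a 10 second delay, while 'Crawl-delay: 120' is
#capped and stored as a 60 second delay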

#anything that is not a recognized directive means the end of the matched userAgent record
if($blUserAgentMatched) {

$blUserAgentMatched=false;

/** @brief Internal backend for static member urlAllowed
@param $strUrl The url to check */
public function __urlAllowed($strUrl) {

#check if we are allowed to crawl this url. CASE MATTERS

# Order: Deny,Allow (deny rules are evaluated first; an exact allow match can then override a deny)
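#worked example (the paths are illustrative only): with 'Disallow: /private' and 'Allow: /private/index.html',
#the url /private/index.html first matches the deny prefix /private and is then re-allowed by the exact allow match,
#while /private/other.html stays denied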
if(count($this->arrRules['-'])) {

foreach($this->arrRules['-'] as $strRule) {

#To Exclude, we check if the url starts with the rule
if($strRule=='/' || (strpos($strUrl,$strRule)===0)) {

if(($blMatched) && (count($this->arrRules['+']))) {

foreach($this->arrRules['+'] as $strRule) {

#To override an exclude, an exact match is required in the include
if($strRule=='/' || ($strUrl==$strRule)) {

$blMatched=$blOut=true;

$blOut=(!$blMatched?true:$blOut);
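#if no deny rule matched at all, the url is allowed by default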

$intExpire=$this->intLastVisit+$this->intDelay;

#blocking is only necessary if we ARE allowed to visit
while($intExpire>time()) {

$this->intLastVisit=time();

/** @brief Check if the scheme (protocol) is supported
@param $strUrl The url to check
@param $arrResult The url split in its components (scheme,host,url); passed by reference
public static function isSupportedScheme($strUrl,array &$arrResult=null) {

#do not lowercase the whole url here: the path part is case sensitive and is matched verbatim against the rules
if(strlen(($strUrl=trim($strUrl)))) {

# Version 1.1: added -_ as supported characters in domain name, and added 'u' (unicode) pattern modifier
if(preg_match('#^(https?)://([\w\d\-_]+(\.[\w\d\-_]+)+)(/*.*)$#ui',$strUrl,$arrMatches)) {
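#for illustration (example.org urls are placeholders): 'HTTP://www.Example.org/Some/Page.html' is accepted and
#split into scheme 'http', host 'www.example.org' and url '/Some/Page.html', while 'ftp://example.org/' is rejected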

$strUrl=trim($arrMatches[4]);

$strUrl=($strUrl==''?'/':$strUrl);

#scheme and host are case insensitive, so normalize them here; the path keeps its original case
$arrResult=array('scheme'=>strtolower($arrMatches[1]),
'host'=>strtolower($arrMatches[2]),

/** @brief Class factory
@param $strScheme the protocol the host is on (http,https)
@param $strHost the host whose robots.txt should be fetched and checked
@param $strUserAgent the useragent name
@returns instance of Robots_txt class */
protected static function &getInstance($strScheme,$strHost,$strUserAgent) {
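#one instance is cached per scheme and host combination; for illustration (example.org is a placeholder),
#urls on 'http://www.example.org' and 'https://www.example.org' get two distinct cached instances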

if((($intCount=count(self::$arrInstances)))) {

for($intCounter=0; $intCounter<$intCount; $intCounter++) {

if(self::$arrInstances[$intCounter]->strHost==$strHost && self::$arrInstances[$intCounter]->strScheme==$strScheme) {

$objOut=self::$arrInstances[$intCounter];

#the constructor expects a full url (it parses out the scheme and host itself) plus the useragent name
$objOut=new Robots_txt("$strScheme://$strHost/",$strUserAgent);

self::$arrInstances[]=$objOut;

/** @brief Checks if the url may be crawled
@param $strUrl the url to check
@param $strUserAgent the useragent name
@throws Exception if $strUserAgent is missing on first call, if an instance cannot be created, or if an invalid url is passed */
public static final function urlAllowed($strUrl,$strUserAgent=null) {

$strUserAgent=trim($strUserAgent);

if(is_null(self::$strReportedUserAgent) && (strlen($strUserAgent))) {

self::$strReportedUserAgent=$strUserAgent;

if(is_null(self::$strReportedUserAgent)) {

throw new Exception('strUserAgent is required on first call to Robots_txt::urlAllowed()');

if(self::isSupportedScheme($strUrl,$arrResult)) {

if((($objEngine=self::getInstance($arrResult['scheme'],$arrResult['host'],self::$strReportedUserAgent)))) {

$blOut=$objEngine->__urlAllowed($arrResult['url']);

throw new Exception('Cannot get Robots_txt instance.');

throw new Exception('Invalid URL');