<?php
/**
 * HTTP Crawler for The Search Engine Project
 *
 * @author geoffreyfishing
 *
 * The following will be filled in automatically by Subversion.
 * Do not change by hand!
 * $LastChangedRevision: $
 *
 * Usage:
 * $crawler = new TSEPCrawler('starturl', 'some_regex', 'some_user_agent');
 * while ($result = $crawler->crawl()) {
 *     $content_of_page = $result->content;
 *     $url_of_page = $result->url;
 * }
 */

// url_to_absolute() is expected to be provided elsewhere in the project;
// it resolves a (possibly relative) URL against a base URL. Adjust the
// path below to wherever that helper actually lives.
require_once 'url_to_absolute.php';
class TSEPCrawler {

	/**
	 * The regular expression that URLs must match to be crawled
	 */
	private $regex;

	/**
	 * URLs that are queued to be crawled
	 */
	private $urls = array();

	/**
	 * URLs that have already been crawled
	 */
	private $done = array();

	/**
	 * The URL currently being crawled
	 */
	private $url;

	/**
	 * The user agent to act as
	 */
	private $agent;
* Initializes the class
68
* @param string $start The start url
69
* @param string $regex The Regular Expression that URLs must match to be crawled
70
* @param string $elements The elements and their properties that contain the links
72
function __construct($start, $regex, $agent) {
74
//Queue the start url to be crawled
76
array_push($this->urls, $start);
79
$this->regex = $regex;
82
$this->agent = $agent;
86
//Grab the robots.txt file
87
$this->parseRobots($start);
	/**
	 * Advances the crawler to the next URL and returns the page contents
	 *
	 * @return Page|bool A Page object, or false when there is nothing left to crawl
	 */
	public function crawl() {
		// We will return false if there is nothing left to crawl
		if (empty($this->urls))
			return false;

		$this->url = array_pop($this->urls);

		// Create the stream context
		$context = stream_context_create(array(
			'http' => array(
				'user_agent' => $this->agent,
				'timeout'    => 5 // Timeout in seconds
			)
		));

		// Fetch the URL's contents
		$contents = @file_get_contents($this->url, false, $context);

		if (!empty($contents)) $this->parse($contents);

		/*
		 * If retrieving the contents failed, we don't need to do anything,
		 * because the URL will simply be removed from the list. However,
		 * we need to add the URL to $this->done whatever the case is,
		 * because we don't want to call the same page twice (or more).
		 */
		array_push($this->done, $this->url);

		// Now remove duplicate URLs, URLs that don't match the RegEx, and already crawled URLs
		$this->cleanURLs();

		$type = $this->getType($contents);

		// And that is pretty much it
		return new Page($contents, $this->url, $type);
	}
	/**
	 * Fetches and parses the host's robots.txt, marking disallowed paths as
	 * already crawled so they are never fetched
	 *
	 * @param string $url A URL on the host whose robots.txt should be honored
	 */
	private function parseRobots($url) {
		// Parse the URL to retrieve host and path
		$parsed = parse_url($url);

		$useragent = $this->agent;

		$agents = array(preg_quote('*'));
		if ($useragent) $agents[] = preg_quote($useragent);
		$agents = implode('|', $agents);

		// Location of the robots.txt file
		$robotstxt = @file("http://{$parsed['host']}/robots.txt");
		if (!$robotstxt) return true;

		$rules = array();
		$ruleapplies = false;
		foreach ($robotstxt as $line) {
			if (!$line = trim($line)) continue;

			// The following rules only apply if User-agent matches $useragent or '*'
			if (preg_match('/User-agent: (.*)/i', $line, $match)) {
				$ruleapplies = preg_match("/($agents)/i", $match[1]);
			}
			if ($ruleapplies && preg_match('/Disallow:(.*)/i', $line, $regs)) {
				// An empty rule implies full access - no further tests required
				if (!$regs[1]) return true;
				// Collect the rules that apply
				$rules[] = trim($regs[1]);
			}
		}

		foreach ($rules as $rule) {
			// Push each disallowed URL into the 'done' array so it is skipped
			array_push($this->done, url_to_absolute("http://{$parsed['host']}/robots.txt", $rule));
		}
	}
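	/*
	 * For illustration (hypothetical robots.txt content, not taken from any
	 * real site): given a file containing
	 *
	 *   User-agent: *
	 *   Disallow: /private/
	 *
	 * the loop above resolves /private/ against the host and pushes
	 * http://host/private/ into $this->done, so cleanURLs() will drop it from
	 * the queue. An empty "Disallow:" line means full access, and the method
	 * returns early without recording any rules.
	 */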
	/**
	 * Makes a rough guess at the MIME type of a document from its contents
	 *
	 * @param string $contents The contents to inspect
	 * @return string The guessed MIME type
	 */
	private function getType($contents) {
		// Anything containing tags is assumed to be HTML
		if (preg_match('/<[^<>]+>/', $contents)) {
			return 'text/html';
		}
		return 'text/javascript';
	}
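	/*
	 * For example (illustrative inputs, not from the project): '<p>Hi</p>'
	 * matches the tag pattern and is reported as text/html, while
	 * 'var x = 1;' contains no tags and falls through to text/javascript.
	 */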
	/**
	 * Parses a page and adds all the URLs it can find to $this->urls
	 *
	 * @param string $contents The contents to parse
	 */
	private function parse($contents) {
		try {
			$type = $this->getType($contents);

			switch ($type) {
				case 'text/html':
					$this->parseHTML($contents);
					break;
				case 'text/javascript':
					$this->parseJS($contents);
					break;
				case 'text/css':
					$this->parseCSS($contents);
					break;
				default: // Attempt to parse all three
					$this->parseHTML($contents);
					$this->parseCSS($contents);
					$this->parseJS($contents);
			}
		}
		catch (Exception $ex) {
			// Ignore parse errors; a page we cannot parse simply yields no new URLs
		}
	}
	/**
	 * Parses links from HTML
	 *
	 * @param string $contents The HTML to parse
	 */
	private function parseHTML($contents) {
		$dom = new DOMDocument();
		$dom->recover = true;
		@$dom->loadHTML($contents); // suppress warnings from malformed markup

		$simple = simplexml_import_dom($dom);

		unset($dom); // DOMDocument is heavy and bloated

		// Collect every anchor in the body and queue its resolved URL
		$links = $simple->xpath('/html/body//a');

		foreach ($links as $link) {
			array_push($this->urls, url_to_absolute($this->url, (string) $link['href']));
		}
	}
	/**
	 * Parses links from JavaScript
	 *
	 * @param string $contents The JavaScript to parse
	 */
	private function parseJS($contents) {
		//TODO: Implement JavaScript parsing
	}
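	/*
	 * A possible starting point (an untested sketch, not part of the original
	 * code): pull quoted absolute URLs out of the script text and queue them.
	 *
	 * if (preg_match_all('/[\'"](https?:\/\/[^\'"]+)[\'"]/', $contents, $matches))
	 *     foreach ($matches[1] as $found)
	 *         array_push($this->urls, $found);
	 */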
	/**
	 * Parses links from CSS
	 *
	 * @param string $contents The CSS to parse
	 */
	private function parseCSS($contents) {
		//TODO: Implement CSS parsing
	}
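	/*
	 * A possible starting point (an untested sketch, not part of the original
	 * code): resolve url(...) references against the page being crawled.
	 *
	 * if (preg_match_all('/url\(\s*[\'"]?([^\'")\s]+)[\'"]?\s*\)/i', $contents, $matches))
	 *     foreach ($matches[1] as $found)
	 *         array_push($this->urls, url_to_absolute($this->url, $found));
	 */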
	/**
	 * Removes all URLs that are duplicates, have already been crawled, or do not match the RegEx
	 */
	private function cleanURLs() {
		// Drop duplicates from both lists
		$this->urls = array_unique($this->urls);
		$this->done = array_unique($this->done);

		// Drop URLs that have already been crawled
		foreach ($this->urls as $key => $value)
			if (in_array($value, $this->done))
				unset($this->urls[$key]);

		//TODO: Design a user-friendly system of creating a regex
		//foreach ($this->urls as $key => $value)
		//	if (!preg_match($this->regex, $value))
		//		unset($this->urls[$key]);

		foreach ($this->urls as $key => $value) {
			// Check that the URL is on the same domain; until the regex system
			// above is designed, $this->regex is treated as a literal host name
			$parsed = parse_url($value);
			if ($this->regex != @$parsed['host'])
				unset($this->urls[$key]);

			// Check that the URL is not a mailto link
			if (preg_match('/mailto\:([^">]+)/', $value))
				unset($this->urls[$key]);
		}

		// Reindex both arrays
		$this->done = array_values($this->done);
		$this->urls = array_values($this->urls);
	}
}
/**
 * A page crawled by TSEPCrawler
 */
class Page {
	public $content;
	public $url;
	public $type;

	function __construct($content, $url, $type = 'text/html') {
		$this->content = $content;
		$this->url = $url;
		$this->type = $type;
	}
}
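/*
 * Example (a sketch; the host name and agent string below are made up): as
 * implemented above, cleanURLs() compares the second constructor argument
 * against each URL's host, so a plain host name is what the crawler expects.
 *
 * $crawler = new TSEPCrawler('http://example.com/', 'example.com', 'TSEPCrawler/1.0');
 * while ($result = $crawler->crawl())
 *     echo $result->url . ' => ' . strlen($result->content) . " bytes\n";
 */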