5
hKit Library for PHP5 - a generic library for parsing Microformats
6
Copyright (C) 2006 Drew McLellan
8
This library is free software; you can redistribute it and/or
9
modify it under the terms of the GNU Lesser General Public
10
License as published by the Free Software Foundation; either
11
version 2.1 of the License, or (at your option) any later version.
13
This library is distributed in the hope that it will be useful,
14
but WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
Lesser General Public License for more details.
18
You should have received a copy of the GNU Lesser General Public
19
License along with this library; if not, write to the Free Software
20
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
Drew McLellan - http://allinthehead.com/
26
Scott Reynen - http://www.randomchaos.com/
28
Version 0.4, 23-Jun-2006
29
prevented nested includes from causing infinite loops
30
returns false if URL can't be fetched
31
added pre-flight check for base support level
32
added deduping of once-only classnames
33
prevented accumulation of multiple 'value' values
34
tuned whitespace handling and treatment of DEL elements
35
Version 0.3, 21-Jun-2006
36
added post-processor callback method into profiles
37
fixed minor problems raised by hcard testsuite
38
added support for include-pattern
39
added support for td@headers pattern
40
added implied-n optimization into default hcard profile
41
Version 0.2, 20-Jun-2006
42
added class callback mechanism
43
added resolvePath & resolveEmail
44
added basic BASE support
45
Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
46
added external Tidy option
47
Version 0.1, 20-Jun-2006
58
// Selects how markup is cleaned before parsing; tidyThis() switches on this
// value and loadURL() checks it for 'proxy'.
public $tidy_mode = 'php'; // 'proxy', 'exec', 'php' or 'none'
59
// Prefix that loadURL() prepends to the requested URL in proxy mode.
public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
60
// Directory in which tidyThis() writes its temporary input file.
public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec
62
// The private settings below start as empty strings but are read as arrays or
// class names by the methods of this class.
// NOTE(review): presumably filled in by the profile file that loadProfile() includes - confirm.

// Class name loadDoc() uses to select the root nodes of the document.
private $root_class = '';
63
// Class-name list handed to processNodes().
private $classes = '';
64
// Class names that dedupeSingles() reduces to their first value.
private $singles = '';
65
// postProcess() looks up element 0 of this as a key in the parsed result.
private $required = '';
66
// Per-class entries of the form "TAG|attribute", consulted by getNodeValue().
private $att_map = '';
67
// Per-class callbacks that getNodeValue() applies to the extracted value.
private $callbacks = '';
68
// Not referenced by any of the visible methods.
private $processor = '';
75
/**
 * Pre-flight check: verifies that the PHP functions this library depends on
 * are available.
 *
 * Declared as __construct() because a method merely named after its class is
 * no longer treated as a constructor by PHP 8, which would silently skip
 * this check.
 *
 * Terminates the script with an error message listing every missing function.
 */
public function __construct()
{
    $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
    $missing = array();

    foreach ($required as $f) {
        if (!function_exists($f)) {
            $missing[] = $f . ' ()';
        }
    }

    // Only abort when at least one dependency is actually absent.
    if (count($missing) > 0) {
        die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
    }
}

/**
 * Legacy PHP 4-style constructor name, kept so existing explicit calls to
 * hKit() keep working. Simply re-runs the pre-flight check.
 */
public function hKit()
{
    $this->__construct();
}
95
/**
 * Fetches a URL, tidies the markup and parses it with the given profile.
 *
 * @param string $profile Profile name, passed to loadProfile() and postProcess().
 * @param string $url     Address to fetch. If it contains "#", the text after
 *                        the last "#" is passed to loadDoc() as the fragment id.
 * @return mixed Result of postProcess(), or false when an argument is empty
 *               or the URL could not be fetched.
 */
public function getByURL($profile='', $url='')
{
    if ($profile == '' || $url == '') return false;

    $this->loadProfile($profile);

    $source = $this->loadURL($url);

    // loadURL() suppresses fetch errors and yields a falsy value on failure.
    if (!$source) return false;

    $tidy_xhtml = $this->tidyThis($source);

    // Default to "no fragment" so loadDoc() never sees an undefined variable.
    $fragment = false;

    if (strrchr($url, '#')) {
        // array_pop() takes its argument by reference, so it needs a real variable.
        $url_parts = explode('#', $url);
        $fragment = array_pop($url_parts);
    }

    $doc = $this->loadDoc($tidy_xhtml, $fragment);
    $s = $this->processNodes($doc, $this->classes);
    $s = $this->postProcess($profile, $s);

    return $s;
}
122
/**
 * Parses markup supplied directly as a string, using the given profile.
 *
 * Unlike getByURL(), the input is handed to loadDoc() as-is: nothing is
 * fetched and tidyThis() is not applied.
 *
 * @param string $profile   Profile name, passed to loadProfile() and postProcess().
 * @param string $input_xml Markup to parse.
 * @return mixed Result of postProcess(), or false when either argument is empty.
 */
public function getByString($profile='', $input_xml='')
{
    if ($profile == '' || $input_xml == '') {
        return false;
    }

    $this->loadProfile($profile);

    $root_nodes = $this->loadDoc($input_xml);
    $parsed = $this->processNodes($root_nodes, $this->classes);

    return $this->postProcess($profile, $parsed);
}
136
// Extracts values for the given class names from every node in $items.
//   $items          - nodes to scan; each one is queried with ->xpath().
//   $classes        - list of class names; an array entry directly after a
//                     name holds the child class names for that name.
//   $allow_includes - when false, the include-pattern section is skipped.
// Returns $out if it holds more than one entry, otherwise $data when it is
// non-empty, otherwise ''.
// NOTE(review): $out and $doc are not assigned in any visible line; they are
// presumably set on lines missing from this view - confirm.
private function processNodes($items, $classes, $allow_includes=true){
140
foreach($items as $item){
143
for ($i=0; $i<sizeof($classes); $i++){
145
// array entries are child lists; they are consumed via $classes[$i+1] below
if (!is_array ($classes[$i])){
147
// match descendants whose space-separated class attribute contains this name as a whole token
$xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
148
$results = $item->xpath($xpath);
151
foreach ($results as $result){
152
// this class name has a child list: recurse into it
if (isset($classes[$i+1]) && is_array ($classes[$i+1])){
153
// note: the whole $results set is passed to the recursion, not just $result
$nodes = $this->processNodes($results, $classes[$i+1]);
154
// prefer the nested data; fall back to the node's own value when the recursion found nothing
$data[$classes[$i]] = (sizeof($nodes) > 0 ? $nodes : $this->getNodeValue($result, $classes[$i]));
157
// a value for this class was already collected from an earlier match
if (isset($data[$classes[$i]])){
158
if (is_array ($data[$classes[$i]])){
159
// is already an array - append
160
$data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
164
if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
165
// repeated 'value' pieces are concatenated into one string
$data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
167
// second occurrence of a scalar: turn it into a two-element array
$old_val = $data[$classes[$i]];
168
$data[$classes[$i]] = array ($old_val, $this->getNodeValue($result, $classes[$i]));
173
// set as normal value
174
$data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
179
// td@headers pattern
180
// a TD with a headers attribute pulls in data from the elements it references
if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
181
// headers holds a space-separated list of ids
$include_ids = explode(' ', $result['headers']);
183
foreach ($include_ids as $id){
184
// selects the parent of the element carrying this id
$xpath = "//*[@id='$id']/..";
185
$includes = $doc->xpath($xpath);
186
foreach ($includes as $include){
187
// reprocess the referenced content against the full class list and merge it in
$tmp = $this->processNodes($include, $this->classes);
188
if (is_array ($tmp)) $data = array_merge($data, $tmp);
199
// include pattern: elements with class "include" reference content elsewhere by id
if ($allow_includes){
200
$xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
201
$results = $item->xpath($xpath);
204
foreach ($results as $result){
205
// only OBJECT elements with a data attribute are honoured
if (strtoupper(dom_import_simplexml($result)->tagName)== "OBJECT" &&
206
preg_match('/\binclude\b/', $result['class']) && $result['data']){
207
// the id is the data attribute with every '#' removed
$id = str_replace('#', '', $result['data']);
209
$xpath = "//*[@id='$id']";
210
$includes = $doc->xpath($xpath);
211
foreach ($includes as $include){
212
// re-parse the referenced element wrapped in two dummy root elements
$include = simplexml_load_string('<root1><root2>'.$include->asXML ().'</root2></root1>'); // don't ask.
213
// third argument false: included content may not trigger further includes
$tmp = $this->processNodes($include, $this->classes, false);
214
if (is_array ($tmp)) $data = array_merge($data, $tmp);
222
return (sizeof($out) > 1 ? $out : ( ! empty ( $data) ? $data : '' ));
226
/**
 * Extracts the value of a node for the class name it was matched on.
 *
 * Lookup order: the attribute named by an att_map entry ("TAG|attribute")
 * for this class and tag, then OBJECT@data, IMG@alt, AREA@alt, any @title,
 * and finally the node's non-blank child nodes joined with spaces. A callback
 * registered for the class is then applied, and whitespace is normalised for
 * every element except PRE.
 *
 * @param SimpleXMLElement $node      Element to read.
 * @param string           $className Class name the element was matched on.
 * @return string|false Extracted value; false for DEL elements.
 */
private function getNodeValue($node, $className)
{
    $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
    $s = false;

    // ignore DEL tags
    if ($tag_name == 'DEL') return $s;

    // look up att map values (att_map starts life as '', so check the type first)
    if (is_array($this->att_map) && array_key_exists($className, $this->att_map)) {
        foreach ($this->att_map[$className] as $map) {
            if (preg_match("/$tag_name\|/", $map)) {
                // the attribute name is the last "|"-separated piece; array_pop()
                // needs a real variable because it takes its argument by reference
                $map_parts = explode('|', $map);
                $attribute = array_pop($map_parts);
                $s = '' . $node[$attribute];
            }
        }
    }

    // if nothing and OBJ, try data.
    if (!$s && $tag_name == 'OBJECT' && $node['data']) $s = '' . $node['data'];

    // if nothing and IMG, try alt.
    if (!$s && $tag_name == 'IMG' && $node['alt']) $s = '' . $node['alt'];

    // if nothing and AREA, try alt.
    if (!$s && $tag_name == 'AREA' && $node['alt']) $s = '' . $node['alt'];

    // if nothing, try title.
    if (!$s && $node['title']) $s = '' . $node['title'];

    // if nothing found, go with node text
    if (!$s) {
        $children = $node->xpath('child::node ()');
        if (!is_array($children)) $children = array();

        // separator first: the (array, separator) argument order is rejected by PHP 8
        $s = implode(' ', array_filter($children, array($this, 'filterBlankValues')));
    }

    // callbacks
    if (is_array($this->callbacks) && array_key_exists($className, $this->callbacks)) {
        // '/.*/' with a limit of 1 hands the whole value to the callback once
        $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
    }

    // trim and remove line breaks
    if ($tag_name != 'PRE') {
        $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
        $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
    }

    return $s;
}
275
/**
 * Filter callback: reports whether a value contains at least one word character.
 *
 * @param mixed $s Value to test; used as the preg_match() subject.
 * @return int|false preg_match() result: 1 on a match, 0 on none, false on error.
 */
private function filterBlankValues($s)
{
    $word_found = preg_match("/\w+/", $s);

    return $word_found;
}
280
/**
 * Cleans up fetched markup according to $this->tidy_mode.
 *
 * - 'exec': writes the source to a temp file in $this->tmp_dir and runs the
 *   command-line tidy binary on it.
 * - 'php':  uses the tidy extension.
 * - anything else: returns the source unchanged.
 *
 * @param string $source Raw markup.
 * @return string Cleaned markup, or the untouched source when no tidying applies.
 */
private function tidyThis($source)
{
    switch ($this->tidy_mode) {
        case 'exec':
            $tmp_file = $this->tmp_dir . md5($source) . '.txt';
            file_put_contents($tmp_file, $source);

            $tidy = array();
            // quote the path so a tmp_dir containing spaces or shell
            // metacharacters cannot break the command line
            exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);

            // do not leave one temp file behind per parsed document
            if (is_file($tmp_file)) unlink($tmp_file);

            return implode("\n", $tidy);

        case 'php':
            // same options as the command line used in 'exec' mode
            $config = array(
                'output-xhtml'     => true,
                'numeric-entities' => true,
                'bare'             => true,
                'indent'           => true,
            );
            $tidy = tidy_parse_string($source, $config, 'utf8');
            if (!$tidy) return $source;

            // tidy_clean_repair() only reports success as a boolean; the
            // repaired markup has to be read back separately
            tidy_clean_repair($tidy);
            return tidy_get_output($tidy);

        default:
            return $source;
    }
}
305
/**
 * Includes the profile definition file for the given profile name.
 *
 * The file name is the profile name followed by ".profile.php", and it is
 * included at most once per request.
 *
 * @param string $profile Profile name.
 */
private function loadProfile($profile)
{
    $profile_file = $profile . '.profile.php';

    require_once($profile_file);
}
311
// Parses the markup and returns the elements carrying the profile's root class.
//   $input_xml - markup string; must be well-formed enough for simplexml_load_string().
//   $fragment  - optional element id used to narrow the document to one element.
// Also stores a base URL in $this->base when the document declares one.
private function loadDoc($input_xml, $fragment=false)
313
$xml = simplexml_load_string($input_xml);
318
// narrow the document to the element whose id equals $fragment
// NOTE(review): presumably guarded by a check on $fragment that is not visible here - confirm.
$doc = $xml->xpath("//*[@id='$fragment']");
319
// re-parse the first match as a standalone document; $doc[0] is read without an emptiness check
$xml = simplexml_load_string($doc[0]->asXML ());
324
// <base href> in the document head
if ($xml->head->base['href']) $this->base = $xml->head->base['href'];
326
// xml:base attribute - PITA with SimpleXML
327
// found by regex on the serialised document; (.*) is greedy, so the capture
// runs to the last double quote on the same line
preg_match('/xml:base="(.*)"/', $xml->asXML (), $matches);
328
// an xml:base match overrides any <base href> stored above
if (is_array ($matches) && sizeof($matches)>1) $this->base = $matches[1];
330
// elements whose space-separated class attribute contains $this->root_class as a whole token
return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
335
// Fetches the content at $url and returns it as a string.
// Errors are suppressed with @, so a failed fetch yields file_get_contents()'s
// false instead of a warning.
private function loadURL($url)
339
// in proxy mode the request goes to the tidy proxy, with the target URL appended as-is
// NOTE(review): $url is not URL-encoded before being appended to the proxy query string - confirm this is intended.
if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
340
$url = $this->tidy_proxy . $url;
343
return @file_get_contents($url);
348
// Post-processes the parsed structure $s for the given profile: checks for
// the first required key, de-duplicates once-only classes, and passes the
// result to an optional profile hook function.
// NOTE(review): the branches around the required-key check and the final
// return are not visible in this view.
private function postProcess($profile, $s)
351
$required = $this->required;
356
// only the first entry of the required list is looked up, as a key of $s
if (array_key_exists($required[0], $s)){
360
$s = $this->dedupeSingles($s);
362
// optional per-profile hook: a global function named hKit_<profile>_post
if (function_exists('hKit_'.$profile.'_post')){
363
// the hook receives the parsed data and its return value replaces it
$s = call_user_func('hKit_'.$profile.'_post', $s);
370
// Resolves a possibly relative path against a URL and returns it as a string.
// $filepath is indexed with [0] immediately, so it is expected to be an array
// whose first element is the path text.
// NOTE(review): $base, $url and $new are not assigned in any visible line;
// they are presumably set on lines missing from this view - confirm.
private function resolvePath($filepath)
371
{ // ugly code ahoy: needs a serious tidy up
373
$filepath = $filepath[0];
378
// a base that contains a scheme separator is treated specially
// (the statement this condition controls is not visible)
if ($base != '' && strpos ($base, '://') !== false)
381
$r = parse_url($url);
382
// scheme and host only; a port or credentials in $url are not carried over
$domain = $r['scheme'] . '://' . $r['host'];
384
// a URL without a path is treated as the site root
if (!isset($r['path'])) $r['path'] = '/';
385
$path = explode('/', $r['path']);
386
$file = explode('/', $filepath);
389
// paths containing "://" or "data:" anywhere are detected here
// (the body of this branch is not visible)
if (strpos ($filepath, '://') !== false || strpos ($filepath, 'data:') !== false){
395
// result is the domain followed directly by the re-joined path segments
return ''.$domain . implode('/', $file);
398
// drop an empty last segment of the URL path
if ($path[sizeof($path)-1] == '') array_pop($path);
399
// drop a last segment that contains a dot
if (strpos ($path[sizeof($path)-1], '.') !== false) array_pop($path);
401
foreach ($file as $segment){
402
// ".." segments get special handling (the handling itself is not visible)
if ($segment == '..'){
408
return ''.$domain . implode('/', $path) . implode('/', $new);
412
/**
 * Extracts the address part of an e-mail link such as "mailto:user@host?subject=x".
 *
 * @param array $v Array whose first element is the link text.
 * @return string The path component reported by parse_url(), or '' when the
 *                value cannot be parsed or has no path component.
 */
private function resolveEmail($v)
{
    $parts = parse_url($v[0]);

    // parse_url() returns false for seriously malformed input and leaves
    // 'path' unset when there is none; neither case should raise a notice.
    if (!is_array($parts) || !isset($parts['path'])) {
        return '';
    }

    return $parts['path'];
}
419
// Collapses once-only class names to a single value: for every entry of $s,
// any class listed in $this->singles that holds an array keeps only that
// array's first element.
// NOTE(review): the end of this method (closing braces and return) is not
// visible in this view.
private function dedupeSingles($s)
421
$singles = $this->singles;
423
// entries are taken by reference so they are modified in place
foreach ($s as &$item){
424
foreach ($singles as $classname){
425
// only array values count as duplicated; scalars are left untouched
if (array_key_exists($classname, $item) && is_array ($item[$classname])){
426
// keep the first collected value and discard the rest
$item[$classname] = $item[$classname][0];
b'\\ No newline at end of file'