5
* A simple command-line utility to extract all of the URLs contained
6
* within <A HREF> tags from a document.
8
* NOTE: Only works with tidy for PHP 5; for tidy for PHP 4.3.x, please see urlgrab.php.
10
* By: John Coggeshall <john@php.net>
12
* Usage: php urlgrab5.php <file>
15
/**
 * Recursively collect the HREF values of every <A> tag at or below a node.
 *
 * NOTE(review): newer releases of the tidy extension appear to name this
 * class tidyNode; confirm the type hint against the targeted PHP version.
 *
 * @param tidy_node  $node The node at which to start the walk.
 * @param array|null $urls Accumulator, passed by reference. Any non-array
 *                         value (including the default NULL) is replaced
 *                         by an empty array before collection starts.
 *
 * @return array The accumulated list of HREF values (same contents as $urls).
 */
function dump_nodes(tidy_node $node, &$urls = NULL) {

    if (!is_array($urls)) {
        $urls = array();
    }

    // Only <A> tags that actually carry an HREF contribute a URL. Anchors
    // such as <a name="top"> have none and would otherwise add a NULL entry
    // (and raise an undefined-index notice).
    if (isset($node->id)
        && $node->id === TIDY_TAG_A
        && isset($node->attribute['href'])) {
        $urls[] = $node->attribute['href'];
    }

    if ($node->hasChildren()) {
        foreach ($node->child as $c) {
            // $urls is shared by reference, so the recursive call's return
            // value does not need to be merged back in.
            dump_nodes($c, $urls);
        }
    }

    return $urls;
}
36
// The document to scan is taken from the first command-line argument.
if (!isset($_SERVER['argv'][1])) {
    fwrite(STDERR, "Usage: php urlgrab5.php <file>\n");
    exit(1);
}

$a = tidy_parse_file($_SERVER['argv'][1]);

// tidy_parse_file() yields false when the document cannot be loaded;
// calling ->html() on that would be a fatal error.
if ($a === false) {
    fwrite(STDERR, "Unable to parse file: " . $_SERVER['argv'][1] . "\n");
    exit(1);
}

// Walk the tree from the <html> node and print every collected URL.
print_r(dump_nodes($a->html()));