6
* A simple command-line utility to extract all of the URLs contained
7
* within <A HREF> tags from a document.
9
* NOTE: Only works with tidy for PHP 4.3.x; for tidy on PHP 5, see urlgrab5.php instead.
11
* By: John Coggeshall <john@php.net>
13
* Usage: php urlgrab.php <file>
17
/* Parse the document */
18
/* Bail out with the usage message when no input file was named on the
 * command line, instead of handing an undefined argv entry to tidy */
if (!isset($_SERVER['argv'][1])) {
    echo "Usage: php urlgrab.php <file>\n";
    exit(1);
}

/* Load and parse the file named by the first command-line argument */
tidy_parse_file($_SERVER['argv'][1]);
20
/* Fix up the document */
23
/* Fetch the node object for the <HTML> element; it is the root that the
 * traversal below starts from */
24
$html = tidy_get_html();
26
/* Walk the tree from that root and print whatever get_links() returns */
27
print_r(get_links($html));
29
function get_links($node) {
32
/* Check to see if we are on an <A> tag or not */
33
if($node->id == TIDY_TAG_A) {
34
/* If we are, find the HREF attribute */
35
$attrib = $node->get_attr(TIDY_ATTR_HREF);
37
/* Add the value of the HREF attrib to $urls */
38
$urls[] = $attrib->value;
43
/* Are there any children? */
44
if($node->has_children()) {
46
/* Traverse down each child recursively */
47
foreach($node->children() as $child) {
49
/* Append the results from recursion to $urls */
50
foreach(get_links($child) as $url) {
b'\\ No newline at end of file'