/* Copyright 2002-2004 Elliotte Rusty Harold

This library is free software; you can redistribute it and/or modify
it under the terms of version 2.1 of the GNU Lesser General Public
License as published by the Free Software Foundation.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the
Free Software Foundation, Inc., 59 Temple Place, Suite 330,
Boston, MA 02111-1307 USA

You can contact Elliotte Rusty Harold by sending e-mail to
elharo@metalab.unc.edu. Please include the word "XOM" in the
subject line. The XOM home page is located at http://www.xom.nu/
*/

package nu.xom.samples;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import nu.xom.Attribute;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.Node;
import nu.xom.ProcessingInstruction;

42
* Demonstrates the reading of attributes in namespaces,
43
* searching for particular processing instructions in the
44
* document prolog, and maintaining a stack of hierarchy-based
45
* state during document traversal.
48
* @author Elliotte Rusty Harold
52
public class PoliteSpider {
54
private Set spidered = new HashSet();
55
private Builder parser = new Builder();
56
private List queue = new LinkedList();
58
public static final String XLINK_NS
59
= "http://www.w3.org/1999/xlink";
60
public static final String XML_NS
61
= "http://www.w3.org/XML/1998/namespace";
63
public void search(URL url) {
66
String systemID = url.toExternalForm();
67
Document doc = parser.build(systemID);
69
boolean follow = true;
71
for (int i = 0; i < doc.getChildCount(); i++) {
72
Node child = doc.getChild(i);
73
if (child instanceof Element) break;
74
if (child instanceof ProcessingInstruction){
75
ProcessingInstruction instruction
76
= (ProcessingInstruction) child;
77
if (instruction.getTarget().equals("robots")) {
79
= PseudoAttributes.getAttributes(instruction);
80
Attribute indexAtt = data.getAttribute("index");
81
if (indexAtt != null) {
82
String value = indexAtt.getValue().trim();
83
if (value.equals("no")) index = false;
85
Attribute followAtt = data.getAttribute("follow");
86
if (followAtt != null) {
87
String value = followAtt.getValue().trim();
88
if (value.equals("no")) follow = false;
94
if (index) System.out.println(url);
95
if (follow) search(doc.getRootElement(), url);
97
catch (Exception ex) {
98
// just skip this document
101
if (queue.isEmpty()) return;
103
URL discovered = (URL) queue.remove(0);
104
spidered.add(discovered);
109
private void search(Element element, URL base) {
111
Attribute href = element.getAttribute("href", XLINK_NS);
112
Attribute xmlbase = element.getAttribute("base", XML_NS);
114
if (xmlbase != null) base = new URL(base, xmlbase.getValue());
116
catch (MalformedURLException ex) {
117
//Java can't handle the kind of URLs used inside this element
121
String uri = href.getValue();
124
URL discovered = new URL(base, uri);
125
// strip fragment identifier if any
126
discovered = new URL(
127
discovered.getProtocol(),
128
discovered.getHost(),
132
if (!spidered.contains(discovered)
133
&& !queue.contains(discovered)) {
134
queue.add(discovered);
137
catch (MalformedURLException ex) {
141
Elements children = element.getChildElements();
142
for (int i = 0; i < children.size(); i++) {
143
search(children.get(i), base);
148
public static void main(String[] args) {
150
XLinkSpider spider = new XLinkSpider();
151
for (int i = 0; i < args.length; i++) {
153
spider.search(new URL(args[i]));
155
catch (MalformedURLException ex) {
156
System.err.println(ex);
b'\\ No newline at end of file'