/* Copyright 2002-2004 Elliotte Rusty Harold

   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public
   License as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA 02111-1307 USA

   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/

package nu.xom.samples;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import nu.xom.Attribute;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Elements;

/**
 * Demonstrates the reading of attributes in namespaces,
 * as well as maintaining a stack of hierarchy-based state
 * during document traversal.
 *
 * @author Elliotte Rusty Harold
 */
public class XLinkSpider {
52
private Set spidered = new HashSet();
53
private Builder parser = new Builder();
54
private List queue = new LinkedList();
56
public static final String XLINK_NS
57
= "http://www.w3.org/1999/xlink";
58
public static final String XML_NS
59
= "http://www.w3.org/XML/1998/namespace";
61
public void search(URL url) {
64
String systemID = url.toExternalForm();
65
Document doc = parser.build(systemID);
66
System.out.println(url);
67
search(doc.getRootElement(), url);
69
catch (Exception ex) {
70
// just skip this document
73
if (queue.isEmpty()) return;
75
URL discovered = (URL) queue.remove(0);
76
spidered.add(discovered);
81
private void search(Element element, URL base) {
83
Attribute href = element.getAttribute("href", XLINK_NS);
84
Attribute xmlbase = element.getAttribute("base", XML_NS);
86
if (xmlbase != null) {
87
base = new URL(base, xmlbase.getValue());
90
catch (MalformedURLException ex) {
91
// Probably just no protocol handler for the
92
// kind of URLs used inside this element
96
String uri = href.getValue();
99
URL discovered = new URL(base, uri);
100
// remove fragment identifier if any
101
discovered = new URL(
102
discovered.getProtocol(),
103
discovered.getHost(),
107
if (!spidered.contains(discovered)
108
&& !queue.contains(discovered)) {
109
queue.add(discovered);
112
catch (MalformedURLException ex) {
116
Elements children = element.getChildElements();
117
for (int i = 0; i < children.size(); i++) {
118
search(children.get(i), base);
123
public static void main(String[] args) {
125
XLinkSpider spider = new XLinkSpider();
126
for (int i = 0; i < args.length; i++) {
128
spider.search(new URL(args[i]));
130
catch (MalformedURLException ex) {
131
System.err.println(ex);
b'\\ No newline at end of file'