1
package org.apache.solr.analysis;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.BufferedReader;
22
import java.io.FileReader;
23
import java.io.IOException;
24
import java.io.Reader;
25
import java.io.StringReader;
26
import java.util.HashSet;
29
import org.apache.lucene.analysis.CharReader;
30
import org.apache.lucene.util.LuceneTestCase;
32
import org.apache.solr.SolrTestCaseJ4;
34
public class HTMLStripCharFilterTest extends LuceneTestCase {
37
public void setUp() throws Exception {
42
public void tearDown() throws Exception {
45
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
47
/**
 * Strips a small HTML snippet one character at a time, comparing each
 * character read with the character at the same position in {@code gold},
 * and finally compares the accumulated output with {@code gold} as a whole.
 *
 * NOTE(review): the declarations of {@code ch} and {@code position}, and any
 * increment of {@code position}, are not visible in this extract. The
 * {@code gold} literal also looks shorter than the input would produce
 * (nothing for "another link. "). Confirm both against the full file.
 *
 * @throws IOException declared because the filter is read via {@code reader.read()}
 */
public void test() throws IOException {
48
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
49
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
50
"This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
51
String gold = " this is some text here is a link and " +
53
"This is an entity: & plus a <. Here is an &. ";
54
// Wrap the markup in the char filter under test.
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
55
StringBuilder builder = new StringBuilder();
57
char [] goldArray = gold.toCharArray();
59
// Read until the filter reports end of stream (-1).
while ((ch = reader.read()) != -1){
60
char theChar = (char) ch;
61
builder.append(theChar);
62
// Compare the character just read with the expected one; the failure
// message includes everything read so far.
assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
63
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
66
// Whole-string comparison of the collected output.
assertEquals(gold, builder.toString());
69
//Some sanity checks, but not a full-fledged check
70
/**
 * Sanity-checks the filter against the file {@code htmlStripReaderTest.html},
 * located through {@code SolrTestCaseJ4.getFile}. The stripped output must
 * contain no '<' character, must contain neither "forrest" nor "Forrest",
 * and after trimming must start with "Welcome to Solr" and end with
 * "Foundation.".
 *
 * NOTE(review): the message of the last assertion says "should start with"
 * although the check is {@code endsWith}; the message probably means
 * "end with".
 * NOTE(review): the FileReader is created without an explicit charset, and
 * neither a declaration of {@code ch} nor a close of the reader is visible
 * in this extract. Confirm against the full file.
 *
 * @throws Exception propagated from locating or reading the file
 */
public void testHTML() throws Exception {
72
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(SolrTestCaseJ4.getFile("htmlStripReaderTest.html"))));
73
StringBuilder builder = new StringBuilder();
75
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
76
builder.append((char)ch);
78
String str = builder.toString();
79
assertTrue("Entity not properly escaped", str.indexOf("<") == -1);//there is one > in the text
80
assertTrue("Forrest should have been stripped out", str.indexOf("forrest") == -1 && str.indexOf("Forrest") == -1);
81
assertTrue("File should start with 'Welcome to Solr' after trimming", str.trim().startsWith("Welcome to Solr"));
83
// This checks the END of the trimmed text, despite the wording of the message.
assertTrue("File should start with 'Foundation.' after trimming", str.trim().endsWith("Foundation."));
87
/**
 * Checks that the one-character input in {@code test} comes out of the
 * filter equal to {@code gold} (U+0393). The filter is constructed with an
 * empty set as its second argument.
 *
 * NOTE(review): {@code test} holds the literal character here; the input
 * was presumably an entity reference that the tool producing this extract
 * decoded. The declaration of {@code ch} and an import for {@code Set} are
 * likewise not visible. Verify against the full file.
 *
 * @throws Exception propagated from reading the filter
 */
public void testGamma() throws Exception {
88
String test = "Γ";
89
String gold = "\u0393";
90
Set<String> set = new HashSet<String>();
92
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
93
StringBuilder builder = new StringBuilder();
95
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
96
builder.append((char)ch);
98
String result = builder.toString();
99
// System.out.println("Resu: " + result + "<EOL>");
100
// System.out.println("Gold: " + gold + "<EOL>");
101
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
104
/**
 * Checks that text mixing a tag-like token with non-ASCII characters comes
 * out of the filter equal to {@code gold}, which spells the same characters
 * as U+00DC and U+0393 escapes. The filter is constructed with an empty set
 * as its second argument.
 *
 * NOTE(review): as shown, {@code test} already contains the literal
 * characters and {@code gold} still contains the "<foo>" token; the input
 * was presumably written with entity references that the tool producing
 * this extract decoded. The declaration of {@code ch} is not visible either.
 * Verify against the full file.
 *
 * @throws Exception propagated from reading the filter
 */
public void testEntities() throws Exception {
105
String test = " <foo> Übermensch = Γ bar Γ";
106
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
107
Set<String> set = new HashSet<String>();
109
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
110
StringBuilder builder = new StringBuilder();
112
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
113
builder.append((char)ch);
115
String result = builder.toString();
116
// System.out.println("Resu: " + result + "<EOL>");
117
// System.out.println("Gold: " + gold + "<EOL>");
118
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
121
/**
 * Checks that the input in {@code test} comes out of the filter equal to
 * {@code gold}. The filter is constructed with an empty set as its second
 * argument.
 *
 * NOTE(review): {@code test} and {@code gold} are identical as shown; the
 * input was presumably written with entity references that the tool
 * producing this extract decoded. The declaration of {@code ch} is not
 * visible either. Verify against the full file.
 *
 * @throws Exception propagated from reading the filter
 */
public void testMoreEntities() throws Exception {
122
String test = " <junk/> ! @ and ’";
123
String gold = " <junk/> ! @ and ’";
124
Set<String> set = new HashSet<String>();
126
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
127
StringBuilder builder = new StringBuilder();
129
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
130
builder.append((char)ch);
132
String result = builder.toString();
133
// System.out.println("Resu: " + result + "<EOL>");
134
// System.out.println("Gold: " + gold + "<EOL>");
135
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
138
/**
 * Checks tag preservation. In the filtered output the text "reserved" must
 * be found at index 9, at index 38 when searching from 15, and at index 54
 * when searching from 41, while "other" must not appear at all.
 *
 * NOTE(review): the set passed to the filter is empty in the visible lines,
 * yet the assertions expect "reserved" to survive. Presumably a line adding
 * "reserved" to the set is missing from this extract, as is the declaration
 * of {@code ch}. Confirm against the full file.
 *
 * @throws Exception propagated from reading the filter
 */
public void testReserved() throws Exception {
139
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
140
Set<String> set = new HashSet<String>();
142
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
143
StringBuilder builder = new StringBuilder();
145
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
146
builder.append((char)ch);
148
String result = builder.toString();
149
// System.out.println("Result: " + result);
150
// The expected indices are positions in the filtered output; each failure
// message reports the index actually found.
assertTrue("Escaped tag not preserved: " + result.indexOf("reserved"), result.indexOf("reserved") == 9);
151
assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 15), result.indexOf("reserved", 15) == 38);
152
assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 41), result.indexOf("reserved", 41) == 54);
153
assertTrue("Other tag should be removed", result.indexOf("other") == -1);
156
/**
 * Feeds markup with stray and unbalanced angle brackets through the filter
 * (single-argument constructor) and asserts the output equals {@code gold}.
 *
 * NOTE(review): the declaration of {@code ch} is not visible in this
 * extract, and whitespace inside the literals may have been collapsed by
 * the tool that produced it. Confirm against the full file.
 *
 * @throws Exception propagated from reading the filter
 */
public void testMalformedHTML() throws Exception {
157
String test = "a <a hr<ef=aa<a>> </close</a>";
158
String gold = "a <a hr<ef=aa > </close ";
159
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
160
StringBuilder builder = new StringBuilder();
162
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
163
builder.append((char)ch);
165
String result = builder.toString();
166
// System.out.println("Resu: " + result + "<EOL>");
167
// System.out.println("Gold: " + gold + "<EOL>");
168
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
171
/**
 * Builds four inputs in which a markup-like opener is followed by filler
 * sized from {@code HTMLStripCharFilter.DEFAULT_READ_AHEAD} plus 500 (three
 * times the read-ahead plus 500 for the comment case), and hands each to
 * {@code processBuffer}, which asserts that the filter output equals the
 * input. The same StringBuilder is reused, reset with {@code setLength(0)}
 * between cases.
 *
 * Cases, in order: a pseudo processing instruction ("ah<?> ??????"), a
 * comment ("<!--" ... "-->foo"), a processing instruction ("<?" ... "?>"),
 * and a tag ("<b " ... "/>").
 *
 * @throws Exception propagated from {@code processBuffer}
 */
public void testBufferOverflow() throws Exception {
172
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
173
testBuilder.append("ah<?> ??????");
174
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
175
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
177
// Reset the shared builder for the next case.
testBuilder.setLength(0);
178
testBuilder.append("<!--");//comments
179
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
181
testBuilder.append("-->foo");
182
processBuffer(testBuilder.toString(), "Failed w/ comment");
184
testBuilder.setLength(0);
185
testBuilder.append("<?");
186
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
187
testBuilder.append("?>");
188
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
190
testBuilder.setLength(0);
191
testBuilder.append("<b ");
192
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
193
testBuilder.append("/>");
194
processBuffer(testBuilder.toString(), "Failed on tag");
198
/**
 * Appends {@code numChars / 2} repetitions of the two characters 'a' and
 * ' ' to {@code testBuilder}. With integer division this adds
 * {@code numChars} characters when {@code numChars} is even and one fewer
 * when it is odd; nothing is appended when {@code numChars} is below 2.
 *
 * @param testBuilder builder that receives the filler text
 * @param numChars approximate number of characters to append
 */
private void appendChars(StringBuilder testBuilder, int numChars) {
199
// Each iteration appends two characters, hence half as many iterations.
int i1 = numChars / 2;
200
for (int i = 0; i < i1; i++){
201
testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripCharFilter think it is a processing instruction
206
/**
 * Reads {@code test} through the filter, with the StringReader wrapped in a
 * BufferedReader, and asserts that the collected output equals {@code test}
 * itself, i.e. that the input passed through unchanged.
 *
 * NOTE(review): the declaration of {@code ch} is not visible in this
 * extract. Confirm against the full file.
 *
 * @param test input text, also used as the expected output
 * @param assertMsg prefix for the assertion failure message
 * @throws IOException declared because the filter is read via {@code reader.read()}
 */
private void processBuffer(String test, String assertMsg) throws IOException {
207
// System.out.println("-------------------processBuffer----------");
208
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
210
StringBuilder builder = new StringBuilder();
212
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
213
builder.append((char)ch);
216
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
218
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
221
/**
 * Reads a comment that opens and closes with three dashes, followed by a
 * single space, through the filter (StringReader wrapped in a
 * BufferedReader) and asserts the output equals {@code gold}.
 *
 * NOTE(review): neither {@code gold} nor {@code ch} is declared in the
 * visible lines. Confirm their definitions against the full file.
 *
 * @throws Exception propagated from reading the filter
 */
public void testComment() throws Exception {
223
String test = "<!--- three dashes, still a valid comment ---> ";
225
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
227
StringBuilder builder = new StringBuilder();
229
// Drain the filter into the builder.
while ((ch = reader.read()) != -1){
230
builder.append((char)ch);
233
// System.out.println("String: " + builder.toString());
235
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
239
/**
 * Reads {@code in} through the filter and checks offset correction: the
 * value returned by {@code reader.correctOffset(off)} is compared with the
 * index of the next 'X' in the original string.
 *
 * NOTE(review): the declaration of {@code ch}, any condition selecting
 * which characters trigger the check, and any update of {@code off} are not
 * visible in this extract. Confirm against the full file.
 *
 * @param in input text; the positions of its 'X' characters are what is verified
 * @throws Exception propagated from reading the filter
 */
public void doTestOffsets(String in) throws Exception {
240
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
242
int off = 0; // offset in the reader
243
int strOff = -1; // offset in the original string
244
while ((ch = reader.read()) != -1) {
245
// Map the offset in the filtered stream back to the original input.
int correctedOff = reader.correctOffset(off);
248
// Advance to the next 'X' in the original string, starting after the previous match.
strOff = in.indexOf('X',strOff+1);
249
assertEquals(strOff, correctedOff);
256
/**
 * Runs {@code doTestOffsets} on several inputs: plain text, text with
 * {@code <p>} tags, text with ampersand and angle-bracket characters, and
 * text with entity-like fragments such as "&zz", "&#", "&l" and "&g".
 *
 * NOTE(review): the third input presumably used entity references that the
 * tool producing this extract decoded, and the end of this method is not
 * visible. Verify against the full file.
 *
 * @throws Exception propagated from {@code doTestOffsets}
 */
public void testOffsets() throws Exception {
257
doTestOffsets("hello X how X are you");
258
doTestOffsets("hello <p> X<p> how <p>X are you");
259
doTestOffsets("X & X ( X < > X");
262
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");