1
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
2
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
5
import java.util.Locale;
6
import java.util.Properties;
8
public class HTMLParser implements HTMLParserConstants {
9
/** Upper bound, in characters, applied to the summary; also used to size the title and summary buffers. */
public static int SUMMARY_LENGTH = 200;
11
/** Title text; getTitle() returns its trimmed contents. */
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
12
/** Summary text; addToSummary() appends to it while it is shorter than SUMMARY_LENGTH. */
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
13
/** META name/content pairs, stored with setProperty(currentMetaTag, currentMetaContent). */
Properties metaTags=new Properties();
14
/** Lower-cased value of the most recent "name" / "HTTP-EQUIV" attribute seen by Tag(), or null. */
String currentMetaTag=null;
15
/** Lower-cased value of the most recent "content" attribute seen by Tag(), or null. */
String currentMetaContent=null;
17
/** Set to true by addText() once the title is non-empty; polled by getTitle() and getMetaTags(). */
boolean titleComplete = false;
18
/** Set by Tag(): true when the tag name equals "<title" (case-insensitive). */
boolean inTitle = false;
19
/** Set by Tag(): true when the tag name equals "<META" (case-insensitive). */
boolean inMetaTag = false;
20
/** Set by Tag(): true when the tag name equals "<STYLE" (case-insensitive). */
boolean inStyle = false;
21
/** Chooses the separator written by addSpace(): eol when true, a single blank otherwise. */
boolean afterTag = false;
22
// NOTE(review): no read or write of afterSpace is visible in this listing.
boolean afterSpace = false;
23
/** Platform line separator, written by addSpace() when afterTag is set. */
String eol = System.getProperty("line.separator");
26
/** Read side of the pipe created in getReader(); its full() is polled by the getters. */
private MyPipedInputStream pipeInStream = null;
27
/** Write side of the pipe created in getReader(), connected to pipeInStream. */
private PipedOutputStream pipeOutStream = null;
29
/**
 * PipedInputStream subclass that adds a full() query.
 * NOTE(review): constructor bodies and closing braces are not visible in this listing.
 */
private class MyPipedInputStream extends PipedInputStream{
31
/** No-argument constructor (body not visible here). */
public MyPipedInputStream(){
35
/** Constructor taking the PipedOutputStream source (body not visible here). */
public MyPipedInputStream(PipedOutputStream src) throws IOException{
39
/** Reports whether the number of bytes available to read has reached PipedInputStream.PIPE_SIZE. */
public boolean full() throws IOException{
40
return this.available() >= PipedInputStream.PIPE_SIZE;
44
/**
 * Returns the collected title text with leading and trailing whitespace trimmed.
 * Calls getReader() first, which starts the parser thread.
 * NOTE(review): the statements around the titleComplete / pipe-full check are
 * not visible in this listing.
 *
 * @return the trimmed title
 * @throws IOException declared; pipeInStream.full() can throw it
 * @throws InterruptedException declared; the throwing statement is not visible here
 */
public String getTitle() throws IOException, InterruptedException {
46
getReader(); // spawn parsing thread
49
// Condition tested: the title has been marked complete or the pipe has filled up.
if (titleComplete || pipeInStream.full())
54
return title.toString().trim();
57
/**
 * Returns a Properties object after starting the parser via getReader().
 * NOTE(review): the return statement is not visible in this listing; presumably
 * it returns the metaTags field — confirm.
 *
 * @throws IOException declared; pipeInStream.full() can throw it
 * @throws InterruptedException declared; the throwing statement is not visible here
 */
public Properties getMetaTags() throws IOException,
58
InterruptedException {
60
getReader(); // spawn parsing thread
63
// Same condition as getTitle(): title marked complete or the pipe has filled up.
if (titleComplete || pipeInStream.full())
72
/**
 * Builds the document summary after starting the parser via getReader().
 * The summary buffer is cut back to SUMMARY_LENGTH characters when it is longer,
 * then trimmed; the title is also fetched through getTitle().
 * NOTE(review): the statements that combine sum and tit and the return statement
 * are not visible in this listing.
 *
 * @throws IOException declared; pipeInStream.full() and getTitle() can throw it
 * @throws InterruptedException declared; getTitle() declares it
 */
public String getSummary() throws IOException, InterruptedException {
74
getReader(); // spawn parsing thread
77
// Condition tested: enough summary text has been collected or the pipe has filled up.
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
82
// Enforce the length cap before converting to a String.
if (summary.length() > SUMMARY_LENGTH)
83
summary.setLength(SUMMARY_LENGTH);
85
String sum = summary.toString().trim();
86
String tit = getTitle();
93
/**
 * Sets up the pipe and starts the parser thread.
 * Creates a MyPipedInputStream / PipedOutputStream pair, wraps them in a
 * UTF-16BE InputStreamReader (pipeIn) and OutputStreamWriter (pipeOut), then
 * creates and starts a ParserThread for this parser.
 * NOTE(review): any guard around this setup and the return statement are not
 * visible in this listing; pipeIn and pipeOut are declared outside the visible lines.
 *
 * @throws IOException if connecting the pipe or creating the reader/writer fails
 */
public Reader getReader() throws IOException {
95
pipeInStream = new MyPipedInputStream();
96
pipeOutStream = new PipedOutputStream(pipeInStream);
97
// Both ends use the same charset, so text written to pipeOut can be decoded from pipeIn.
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
98
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
100
Thread thread = new ParserThread(this);
101
thread.start(); // start parsing
107
/**
 * Appends text to the summary buffer while the buffer is shorter than SUMMARY_LENGTH.
 * NOTE(review): the statements executed once the buffer reaches SUMMARY_LENGTH
 * are not visible in this listing.
 *
 * @param text text to append
 */
void addToSummary(String text) {
108
if (summary.length() < SUMMARY_LENGTH) {
109
summary.append(text);
110
// Re-check after the append: this text may have pushed the buffer to the limit.
if (summary.length() >= SUMMARY_LENGTH) {
118
/**
 * Handles a piece of document text.
 * Visible behaviour: marks the title as complete the first time it is found
 * non-empty, and adds the text length to the running length counter.
 * NOTE(review): several statements of this method, including how the text is
 * routed and written, are not visible in this listing.
 *
 * @param text the text to add
 * @throws IOException declared; the throwing statement is not visible here
 */
void addText(String text) throws IOException {
125
if (!titleComplete && !(title.length() == 0)) { // finished title
127
titleComplete = true; // tell waiting threads
133
// length is declared outside the visible lines.
length += text.length();
140
// NOTE(review): the header of the method enclosing these statements is not
// visible in this listing.
// Store the pending META name/content pair, then clear both so the next
// META tag starts from a clean state.
metaTags.setProperty(currentMetaTag, currentMetaContent);
141
currentMetaTag = null;
142
currentMetaContent = null;
146
/**
 * Writes a separator to pipeOut: the platform line separator when afterTag is
 * set, otherwise a single blank. The separator's length is added to length.
 * NOTE(review): any guard preceding these statements is not visible in this listing.
 *
 * @throws IOException if writing to pipeOut fails
 */
void addSpace() throws IOException {
153
String space = afterTag ? eol : " ";
154
length += space.length();
155
pipeOut.write(space);
160
/**
 * Generated production for a whole document. Dispatches on the kind of the next
 * token; Word, Entity and Punct tokens are passed to addText(), Space tokens to
 * addSpace(), and each of these clears afterTag.
 * NOTE(review): the case labels and the remaining alternatives are not visible
 * in this listing.
 *
 * @throws ParseException when no alternative matches the next token
 * @throws IOException propagated from addText() / addSpace()
 */
final public void HTMLDocument() throws ParseException, IOException {
164
// jj_ntk caches the next token kind; -1 means it has to be computed with jj_ntk().
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
180
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
199
t = jj_consume_token(Word);
200
addText(t.image); afterTag = false;
203
t = jj_consume_token(Entity);
204
// Entities are decoded before being added as text.
addText(Entities.decode(t.image)); afterTag = false;
207
t = jj_consume_token(Punct);
208
addText(t.image); afterTag = false;
211
jj_consume_token(Space);
212
addSpace(); afterTag = false;
216
// No alternative matched: consuming kind -1 cannot succeed, so an error is raised.
jj_consume_token(-1);
217
throw new ParseException();
223
/**
 * Generated production for a tag. Consumes a TagName token, lower-cases it,
 * records whether it is a title / META / STYLE / img tag, then processes
 * attribute name/value pairs until TagEnd.
 * Visible attribute handling: the "alt" value of an img tag is added as text in
 * square brackets; "name" / "HTTP-EQUIV" and "content" values are lower-cased
 * into currentMetaTag and currentMetaContent.
 * NOTE(review): several lines, including parts of the conditions and the bodies
 * of the inner if statements, are not visible in this listing.
 *
 * @throws ParseException on unexpected tokens
 * @throws IOException propagated from addText()
 */
final public void Tag() throws ParseException, IOException {
225
boolean inImg = false;
226
t1 = jj_consume_token(TagName);
227
// Locale.ENGLISH keeps the lower-casing independent of the default locale.
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
228
if(Tags.WS_ELEMS.contains(tagName) ) {
231
// The TagName image is compared including its leading '<'.
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
232
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
233
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
234
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
238
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
246
// From here on t1 holds the attribute name token.
t1 = jj_consume_token(ArgName);
247
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
249
jj_consume_token(ArgEquals);
250
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
255
// Image alt text is emitted as "[alt]".
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
256
addText("[" + t2.image + "]");
259
( t1.image.equalsIgnoreCase("name") ||
260
t1.image.equalsIgnoreCase("HTTP-EQUIV")
264
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
265
// Both halves of a META pair are known at this point (body not visible here).
if(currentMetaTag != null && currentMetaContent != null) {
269
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
272
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
273
if(currentMetaTag != null && currentMetaContent != null) {
288
jj_consume_token(TagEnd);
291
/**
 * Generated production for an attribute value. Visible alternatives:
 * a bare ArgValue token; an ArgQuote1 immediately followed by CloseQuote1;
 * ArgQuote1 Quote1Text CloseQuote1; and the same three shapes for
 * ArgQuote2 / Quote2Text / CloseQuote2.
 * NOTE(review): case labels, the declaration of t and the conditions selecting
 * the empty-quote alternatives are not visible in this listing.
 *
 * @return the token held in t for the matched alternative
 * @throws ParseException when no alternative matches
 */
final public Token ArgValue() throws ParseException {
293
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
295
t = jj_consume_token(ArgValue);
296
// "if (true)" is a generated-code wrapper around the return.
{if (true) return t;}
301
// Opening quote directly followed by the closing quote: t is not reassigned here.
jj_consume_token(ArgQuote1);
302
jj_consume_token(CloseQuote1);
303
{if (true) return t;}
305
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
307
// Quoted value: the text token between the quotes is returned.
jj_consume_token(ArgQuote1);
308
t = jj_consume_token(Quote1Text);
309
jj_consume_token(CloseQuote1);
310
{if (true) return t;}
315
// Same two shapes for the second quote style.
jj_consume_token(ArgQuote2);
316
jj_consume_token(CloseQuote2);
317
{if (true) return t;}
319
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
321
jj_consume_token(ArgQuote2);
322
t = jj_consume_token(Quote2Text);
323
jj_consume_token(CloseQuote2);
324
{if (true) return t;}
328
// No alternative matched.
jj_consume_token(-1);
329
throw new ParseException();
335
// Generated trailer for the case where no return statement was executed.
throw new Error("Missing return statement in function");
338
/**
 * Generated production for a declaration: consumes a DeclName token, then a
 * sequence of items (ArgName and ArgEquals are the visible ones) up to TagEnd.
 * NOTE(review): case labels, loop structure and other alternatives are not
 * visible in this listing.
 *
 * @return the DeclName token
 * @throws ParseException when an unexpected token is found
 */
final public Token Decl() throws ParseException {
340
t = jj_consume_token(DeclName);
343
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
355
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
357
jj_consume_token(ArgName);
365
jj_consume_token(ArgEquals);
369
// No alternative matched.
jj_consume_token(-1);
370
throw new ParseException();
373
jj_consume_token(TagEnd);
374
{if (true) return t;}
375
// Generated trailer for the case where no return statement was executed.
throw new Error("Missing return statement in function");
378
/**
 * Generated production for a comment. Two forms are visible:
 * Comment1 ... CommentText1 ... CommentEnd1 and Comment2 ... CommentText2 ... CommentEnd2.
 * The comment text tokens are consumed and discarded.
 * NOTE(review): case labels and loop structure are not visible in this listing.
 *
 * @throws ParseException when neither form matches
 */
final public void CommentTag() throws ParseException {
379
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
381
jj_consume_token(Comment1);
384
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
392
jj_consume_token(CommentText1);
394
jj_consume_token(CommentEnd1);
397
jj_consume_token(Comment2);
400
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
408
jj_consume_token(CommentText2);
410
jj_consume_token(CommentEnd2);
414
// No alternative matched.
jj_consume_token(-1);
415
throw new ParseException();
419
/**
 * Generated production for a script element: ScriptStart, ScriptText tokens,
 * then ScriptEnd. The consumed tokens are discarded.
 * NOTE(review): the loop structure around ScriptText is not visible in this listing.
 *
 * @throws ParseException when an expected token is missing
 */
final public void ScriptTag() throws ParseException {
420
jj_consume_token(ScriptStart);
423
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
431
jj_consume_token(ScriptText);
433
jj_consume_token(ScriptEnd);
436
/**
 * Lookahead check number 1: runs the scan routine jj_3_1() starting at the
 * current token with a lookahead limit of xla.
 *
 * @param xla lookahead limit stored into jj_la
 * @return true when jj_3_1() returns false or LookaheadSuccess is thrown
 */
private boolean jj_2_1(int xla) {
437
// Start scanning from the current token.
jj_la = xla; jj_lastpos = jj_scanpos = token;
438
// jj_3_1() returns true on a mismatch, hence the negation.
try { return !jj_3_1(); }
439
catch(LookaheadSuccess ls) { return true; }
440
// Always record the attempt in slot 0 of jj_2_rtns.
finally { jj_save(0, xla); }
443
/**
 * Lookahead check number 2: runs the scan routine jj_3_2() starting at the
 * current token with a lookahead limit of xla.
 *
 * @param xla lookahead limit stored into jj_la
 * @return true when jj_3_2() returns false or LookaheadSuccess is thrown
 */
private boolean jj_2_2(int xla) {
444
// Start scanning from the current token.
jj_la = xla; jj_lastpos = jj_scanpos = token;
445
// jj_3_2() returns true on a mismatch, hence the negation.
try { return !jj_3_2(); }
446
catch(LookaheadSuccess ls) { return true; }
447
// Always record the attempt in slot 1 of jj_2_rtns.
finally { jj_save(1, xla); }
450
/**
 * Scan routine: checks for ArgQuote2 followed by CloseQuote2.
 * Returns true as soon as one of the two tokens does not match.
 * NOTE(review): the final return statement is not visible in this listing.
 */
private boolean jj_3_2() {
451
if (jj_scan_token(ArgQuote2)) return true;
452
if (jj_scan_token(CloseQuote2)) return true;
456
/**
 * Scan routine: checks for ArgQuote1 followed by CloseQuote1.
 * Returns true as soon as one of the two tokens does not match.
 * NOTE(review): the final return statement is not visible in this listing.
 */
private boolean jj_3_1() {
457
if (jj_scan_token(ArgQuote1)) return true;
458
if (jj_scan_token(CloseQuote1)) return true;
462
/** Generated Token Manager. */
463
public HTMLParserTokenManager token_source;
464
/** Character stream handed to the token manager. */
SimpleCharStream jj_input_stream;
465
/** Current token. */
470
// NOTE(review): the declaration that the "Current token." comment above belongs
// to is not visible in this listing. The two fields below are the scan cursor
// and the scan limit used by jj_scan_token().
private Token jj_scanpos, jj_lastpos;
473
/** One entry per choice point; compared against jj_gen in generateParseException(). */
final private int[] jj_la1 = new int[14];
474
/** Bit masks of token kinds per choice point, filled in by jj_la1_init_0(). */
static private int[] jj_la1_0;
478
/**
 * Fills jj_la1_0 with 14 generated bit masks. generateParseException() tests
 * bit j of entry i to decide whether token kind j was expected at choice point i.
 */
private static void jj_la1_init_0() {
479
jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
481
/** One JJCalls chain per lookahead routine (jj_2_1, jj_2_2); written by jj_save(). */
final private JJCalls[] jj_2_rtns = new JJCalls[2];
481
// NOTE(review): no use of jj_rescan or jj_gc is visible in this listing.
private boolean jj_rescan = false;
482
private int jj_gc = 0;
485
/**
 * Constructor with InputStream.
 * NOTE(review): the body is not visible in this listing; presumably it delegates
 * to the (stream, encoding) constructor — confirm.
 */
486
public HTMLParser(java.io.InputStream stream) {
489
/**
 * Constructor with InputStream and supplied encoding.
 * Wraps the stream in a SimpleCharStream starting at line 1, column 1, creates
 * the token manager on top of it and resets the lookahead bookkeeping.
 * An UnsupportedEncodingException is rethrown as a RuntimeException.
 */
490
public HTMLParser(java.io.InputStream stream, String encoding) {
491
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
492
token_source = new HTMLParserTokenManager(jj_input_stream);
496
// Reset every choice-point entry and give each lookahead routine a fresh JJCalls.
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
497
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
501
/** Reinitialises the parser on a new InputStream; delegates to ReInit(stream, null). */
public void ReInit(java.io.InputStream stream) {
502
ReInit(stream, null);
505
/**
 * Reinitialises the parser on a new InputStream with the given encoding.
 * Reuses the existing SimpleCharStream and token manager and resets the
 * lookahead bookkeeping. An UnsupportedEncodingException is rethrown as a
 * RuntimeException.
 */
public void ReInit(java.io.InputStream stream, String encoding) {
506
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
507
token_source.ReInit(jj_input_stream);
511
// Reset every choice-point entry and give each lookahead routine a fresh JJCalls.
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
512
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
516
/**
 * Constructor with Reader. Wraps the reader in a SimpleCharStream starting at
 * line 1, column 1, creates the token manager and resets the lookahead bookkeeping.
 */
public HTMLParser(java.io.Reader stream) {
517
jj_input_stream = new SimpleCharStream(stream, 1, 1);
518
token_source = new HTMLParserTokenManager(jj_input_stream);
522
// Reset every choice-point entry and give each lookahead routine a fresh JJCalls.
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
523
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
527
/**
 * Reinitialises the parser on a new Reader, reusing the existing
 * SimpleCharStream and token manager, and resets the lookahead bookkeeping.
 */
public void ReInit(java.io.Reader stream) {
528
jj_input_stream.ReInit(stream, 1, 1);
529
token_source.ReInit(jj_input_stream);
533
// Reset every choice-point entry and give each lookahead routine a fresh JJCalls.
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
534
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
537
/**
 * Constructor with generated Token Manager.
 * NOTE(review): the statements that store tm are not visible in this listing;
 * only the reset of the lookahead bookkeeping is.
 */
538
public HTMLParser(HTMLParserTokenManager tm) {
543
// Reset every choice-point entry and give each lookahead routine a fresh JJCalls.
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
544
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
548
/**
 * Reinitialises the parser with a token manager.
 * NOTE(review): the statements that store tm are not visible in this listing;
 * only the reset of the lookahead bookkeeping is.
 */
public void ReInit(HTMLParserTokenManager tm) {
553
// Reset every choice-point entry and give each lookahead routine a fresh JJCalls.
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
554
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
557
/**
 * Advances to the next token and checks that it has the expected kind.
 * The next token is taken from the already-linked list when available,
 * otherwise fetched from the token manager and linked in.
 * NOTE(review): the success-path return, the restoration of the previous token
 * on failure and the declaration of oldToken are not visible in this listing.
 *
 * @param kind expected token kind
 * @throws ParseException built by generateParseException() when the kind does not match
 */
private Token jj_consume_token(int kind) throws ParseException {
559
// Remember the previous token in oldToken while stepping forward.
if ((oldToken = token).next != null) token = token.next;
560
else token = token.next = token_source.getNextToken();
562
if (token.kind == kind) {
566
// Drop the saved start token of lookahead records older than the current generation.
for (int i = 0; i < jj_2_rtns.length; i++) {
567
JJCalls c = jj_2_rtns[i];
569
if (c.gen < jj_gen) c.first = null;
578
throw generateParseException();
581
/** Marker Error thrown by jj_scan_token() and caught by the jj_2_* routines and jj_rescan_token(). */
static private final class LookaheadSuccess extends java.lang.Error { }
582
/** Single pre-allocated instance that jj_scan_token() throws. */
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
583
/**
 * Advances the scan cursor by one token during lookahead and compares its kind.
 * NOTE(review): some lines (else branches, the guard around the error-token
 * bookkeeping and the final return) are not visible in this listing.
 *
 * @param kind expected token kind
 * @return true when the scanned token's kind differs from kind
 * @throws LookaheadSuccess (jj_ls) when jj_la is 0 and the cursor has reached the scan limit
 */
private boolean jj_scan_token(int kind) {
584
// Cursor is at the scan limit: move both forward, fetching a new token if needed.
if (jj_scanpos == jj_lastpos) {
586
if (jj_scanpos.next == null) {
587
jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
589
jj_lastpos = jj_scanpos = jj_scanpos.next;
592
jj_scanpos = jj_scanpos.next;
595
// Count how many tokens the cursor is ahead of the current token.
int i = 0; Token tok = token;
596
while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
597
if (tok != null) jj_add_error_token(kind, i);
599
if (jj_scanpos.kind != kind) return true;
600
if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
605
/**
 * Get the next Token. Moves the current token forward, fetching from the token
 * manager and linking the new token in when the list has no successor yet.
 * NOTE(review): the remaining statements and the return are not visible in this listing.
 */
606
final public Token getNextToken() {
607
if (token.next != null) token = token.next;
608
else token = token.next = token_source.getNextToken();
614
/**
 * Get the specific Token. Walks index steps along the token list from a start
 * token held in t, fetching and linking new tokens as needed; the current
 * token is not advanced by the visible statements.
 * NOTE(review): the initialisation of t and the return are not visible in this listing.
 */
615
final public Token getToken(int index) {
617
for (int i = 0; i < index; i++) {
618
if (t.next != null) t = t.next;
619
else t = t.next = token_source.getNextToken();
624
/**
 * Determines the kind of the token following the current one and caches it in
 * the jj_ntk field. Fetches and links a new token when none is linked yet.
 *
 * @return the kind of the next token
 */
private int jj_ntk() {
625
// jj_nt is set to the already-linked successor, if any.
if ((jj_nt=token.next) == null)
626
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
628
return (jj_ntk = jj_nt.kind);
631
/** Expected-token sequences collected for the ParseException built in generateParseException(). */
private java.util.List<int[]> jj_expentries = new java.util.ArrayList<int[]>();
632
/** Scratch array for the entry currently being built. */
private int[] jj_expentry;
633
/** Indexes la1tokens in generateParseException(); starts at -1. */
private int jj_kind = -1;
634
/** Token kinds recorded during lookahead by jj_add_error_token(); at most 100 positions. */
private int[] jj_lasttokens = new int[100];
635
/** Number of valid entries in jj_lasttokens. */
private int jj_endpos;
637
/**
 * Records a token kind seen at a lookahead position, for error reporting.
 * When pos directly extends the recorded sequence the kind is appended.
 * Otherwise, if a sequence exists, it is copied into jj_expentry and compared
 * against the entries of jj_expentries, and the recording position is then
 * moved to pos.
 * NOTE(review): closing braces are not visible in this listing, so the exact
 * nesting of the add / break statements cannot be confirmed from here.
 *
 * @param kind token kind that was expected
 * @param pos  distance from the current token; positions of 100 or more are ignored
 */
private void jj_add_error_token(int kind, int pos) {
638
// jj_lasttokens holds only 100 entries.
if (pos >= 100) return;
639
if (pos == jj_endpos + 1) {
640
jj_lasttokens[jj_endpos++] = kind;
641
} else if (jj_endpos != 0) {
642
// Snapshot the sequence recorded so far.
jj_expentry = new int[jj_endpos];
643
for (int i = 0; i < jj_endpos; i++) {
644
jj_expentry[i] = jj_lasttokens[i];
646
// Compare the snapshot element by element with entries of the same length.
jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
647
int[] oldentry = (int[])(it.next());
648
if (oldentry.length == jj_expentry.length) {
649
for (int i = 0; i < jj_expentry.length; i++) {
650
if (oldentry[i] != jj_expentry[i]) {
651
continue jj_entries_loop;
654
jj_expentries.add(jj_expentry);
655
break jj_entries_loop;
658
// Restart recording at pos with this kind as the last element.
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
662
/**
 * Generate ParseException. Rebuilds jj_expentries from the token kinds flagged
 * in la1tokens, then builds a ParseException from the current token, the
 * expected sequences and tokenImage.
 * NOTE(review): the guard around the jj_kind access, the statement that sets
 * la1tokens[j] and the rescan call are not visible in this listing.
 *
 * @return the ParseException describing the current token and the expected token sequences
 */
663
public ParseException generateParseException() {
664
jj_expentries.clear();
665
// One flag per token kind (31 kinds).
boolean[] la1tokens = new boolean[31];
667
la1tokens[jj_kind] = true;
670
// Only choice points stamped with the current generation are examined.
for (int i = 0; i < 14; i++) {
671
if (jj_la1[i] == jj_gen) {
672
for (int j = 0; j < 32; j++) {
673
if ((jj_la1_0[i] & (1<<j)) != 0) {
679
// Each single-token entry becomes one expected sequence.
for (int i = 0; i < 31; i++) {
681
jj_expentry = new int[1];
683
jj_expentries.add(jj_expentry);
688
jj_add_error_token(0, 0);
689
// Convert the list to the int[][] form that ParseException takes.
int[][] exptokseq = new int[jj_expentries.size()][];
690
for (int i = 0; i < jj_expentries.size(); i++) {
691
exptokseq[i] = jj_expentries.get(i);
693
return new ParseException(token, exptokseq, tokenImage);
696
/**
 * Enable tracing.
 * NOTE(review): the body is not visible in this listing.
 */
697
final public void enable_tracing() {
700
/**
 * Disable tracing.
 * NOTE(review): the body is not visible in this listing.
 */
701
final public void disable_tracing() {
704
/**
 * Re-runs the saved lookahead attempts of both lookahead routines. For every
 * JJCalls record whose gen is newer than jj_gen, the scan state is restored
 * from the record and the matching scan routine (jj_3_1 or jj_3_2) is run.
 * LookaheadSuccess raised during a rescan is deliberately ignored.
 * NOTE(review): the loop over chained records, the switch header and the
 * flag handling are not visible in this listing.
 */
private void jj_rescan_token() {
706
for (int i = 0; i < 2; i++) {
708
JJCalls p = jj_2_rtns[i];
710
if (p.gen > jj_gen) {
711
// Restore lookahead limit and scan position from the record.
jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
713
case 0: jj_3_1(); break;
714
case 1: jj_3_2(); break;
719
} catch(LookaheadSuccess ls) { }
724
/**
 * Records a lookahead attempt for routine number index. Starts at the head
 * record for that routine and, while the record is newer than jj_gen, follows
 * the chain, appending a new JJCalls at the end if necessary. The chosen record
 * receives the generation stamp, the current token and the lookahead limit.
 * NOTE(review): the statement advancing p inside the loop is not visible in this listing.
 *
 * @param index slot in jj_2_rtns (0 for jj_2_1, 1 for jj_2_2)
 * @param xla   lookahead limit of the attempt
 */
private void jj_save(int index, int xla) {
725
JJCalls p = jj_2_rtns[index];
726
while (p.gen > jj_gen) {
727
if (p.next == null) { p = p.next = new JJCalls(); break; }
730
p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
733
static final class JJCalls {
740
// void handleException(Exception e) {
741
// System.out.println(e.toString()); // print the error message
742
// System.out.println("Skipping...");
745
// t = getNextToken();
746
// } while (t.kind != TagEnd);