1
package org.apache.lucene.analysis.wikipedia;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
24
%class WikipediaTokenizerImpl
27
%function getNextToken
33
public static final int ALPHANUM = WikipediaTokenizer.ALPHANUM_ID;
34
public static final int APOSTROPHE = WikipediaTokenizer.APOSTROPHE_ID;
35
public static final int ACRONYM = WikipediaTokenizer.ACRONYM_ID;
36
public static final int COMPANY = WikipediaTokenizer.COMPANY_ID;
37
public static final int EMAIL = WikipediaTokenizer.EMAIL_ID;
38
public static final int HOST = WikipediaTokenizer.HOST_ID;
39
public static final int NUM = WikipediaTokenizer.NUM_ID;
40
public static final int CJ = WikipediaTokenizer.CJ_ID;
41
public static final int INTERNAL_LINK = WikipediaTokenizer.INTERNAL_LINK_ID;
42
public static final int EXTERNAL_LINK = WikipediaTokenizer.EXTERNAL_LINK_ID;
43
public static final int CITATION = WikipediaTokenizer.CITATION_ID;
44
public static final int CATEGORY = WikipediaTokenizer.CATEGORY_ID;
45
public static final int BOLD = WikipediaTokenizer.BOLD_ID;
46
public static final int ITALICS = WikipediaTokenizer.ITALICS_ID;
47
public static final int BOLD_ITALICS = WikipediaTokenizer.BOLD_ITALICS_ID;
48
public static final int HEADING = WikipediaTokenizer.HEADING_ID;
49
public static final int SUB_HEADING = WikipediaTokenizer.SUB_HEADING_ID;
50
public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;
53
private int currentTokType;
54
private int numBalanced = 0;
55
private int positionInc = 1;
56
private int numLinkToks = 0;
57
//Any time we start on a new Wiki reserved token (category, link, etc.) this value will be 0; otherwise it will be the number of tokens seen
58
//this can be useful for detecting when a new reserved token is encountered
59
//see https://issues.apache.org/jira/browse/LUCENE-1133
60
private int numWikiTokensSeen = 0;
62
public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;
65
Returns the number of tokens seen inside a category or link, etc.
66
@return the number of tokens seen inside the context of wiki syntax.
68
public final int getNumWikiTokensSeen(){
69
return numWikiTokensSeen;
72
public final int yychar()
77
public final int getPositionIncrement(){
82
* Fills Lucene token with the current token text.
84
final void getText(CharTermAttribute t) {
85
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
88
final int setText(StringBuilder buffer){
89
int length = zzMarkedPos - zzStartRead;
90
buffer.append(zzBuffer, zzStartRead, length);
97
// basic word: a sequence of digits & letters
98
ALPHANUM = ({LETTER}|{DIGIT}|{KOREAN})+
100
// internal apostrophes: O'Reilly, you're, O'Reilly's
101
// use a post-filter to remove possessives
102
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
104
// acronyms: U.S.A., I.B.M., etc.
105
// use a post-filter to remove dots
106
ACRONYM = {ALPHA} "." ({ALPHA} ".")+
108
// company names like AT&T and Excite@Home.
109
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
112
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
115
HOST = {ALPHANUM} ((".") {ALPHANUM})+
117
// floating point, serial, model numbers, ip addresses, etc.
118
// every other segment must have at least one digit
119
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
120
| {DIGIT}+ {P} {DIGIT}+
121
| {HAS_DIGIT} {P} {ALPHANUM}
122
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
123
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
124
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
125
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
127
TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
130
P = ("_"|"-"|"/"|"."|",")
132
// at least one digit
141
LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
143
DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
145
KOREAN = [\uac00-\ud7af\u1100-\u11ff]
148
// NOTE: \u3100-\u312f, \u3040-\u309F and \u30A0-\u30FF are subranges of the leading \u3040-\u318f, so listing them again does not change the matched set.
CJ = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
150
WHITESPACE = \r\n | [ \r\n\t\f]
153
DOUBLE_BRACKET = "["{2}
154
DOUBLE_BRACKET_CLOSE = "]"{2}
155
DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
157
TWO_SINGLE_QUOTES = "'"{2}
159
CITATION_CLOSE = "</ref>"
160
INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
162
DOUBLE_BRACE = "{"{2}
163
DOUBLE_BRACE_CLOSE = "}"{2}
165
DOUBLE_EQUALS = "="{2}
168
%state CATEGORY_STATE
169
%state INTERNAL_LINK_STATE
170
%state EXTERNAL_LINK_STATE
172
%state TWO_SINGLE_QUOTES_STATE
173
%state THREE_SINGLE_QUOTES_STATE
174
%state FIVE_SINGLE_QUOTES_STATE
175
%state DOUBLE_EQUALS_STATE
176
%state DOUBLE_BRACE_STATE
181
<YYINITIAL>{ALPHANUM} {positionInc = 1; return ALPHANUM; }
182
<YYINITIAL>{APOSTROPHE} {positionInc = 1; return APOSTROPHE; }
183
<YYINITIAL>{ACRONYM} {positionInc = 1; return ACRONYM; }
184
<YYINITIAL>{COMPANY} {positionInc = 1; return COMPANY; }
185
<YYINITIAL>{EMAIL} {positionInc = 1; return EMAIL; }
186
<YYINITIAL>{NUM} {positionInc = 1; return NUM; }
187
<YYINITIAL>{HOST} {positionInc = 1; return HOST; }
188
<YYINITIAL>{CJ} {positionInc = 1; return CJ; }
192
//First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
193
//set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
194
//tokens within the link are incremented
195
{DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;}
196
{DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;}
197
{EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;}
198
{TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;}
199
{DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;}
200
{DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
201
{CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
203
. | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
206
<INTERNAL_LINK_STATE>{
207
//First {ALPHANUM} is always the link, set position to 0 for these
208
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
209
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
210
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
212
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
215
<EXTERNAL_LINK_STATE>{
216
//increment the link token, but then don't increment the tokens after that which are still in the link
217
("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
218
//link text token: positionInc is 0 while numLinkToks is still 0 (no link text token returned yet), 1 afterwards
{ALPHANUM} {positionInc = (numLinkToks == 0) ? 0 : 1; numWikiTokensSeen++; numLinkToks++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
219
"]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
220
{WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
224
{ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
225
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
227
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
230
<TWO_SINGLE_QUOTES_STATE>{
231
"'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;}
232
"'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;}
233
{ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/}
234
//we can have links inside, let those override
235
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
236
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
237
// NOTE(review): the first {EXTERNAL_LINK} rule in this file sets currentTokType = EXTERNAL_LINK_URL before yybegin(EXTERNAL_LINK_STATE); this one sets EXTERNAL_LINK, and the URL rule in EXTERNAL_LINK_STATE returns currentTokType unchanged -- confirm the difference is intended.
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
240
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
243
<THREE_SINGLE_QUOTES_STATE>{
244
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
245
//we can have links inside, let those override
246
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
247
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
248
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
251
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
255
<FIVE_SINGLE_QUOTES_STATE>{
256
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
257
//we can have links inside, let those override
258
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
259
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
260
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
263
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
266
<DOUBLE_EQUALS_STATE>{
267
"=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;}
268
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
269
{DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
271
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
274
<DOUBLE_BRACE_STATE>{
275
// NOTE(review): unlike the {ALPHANUM} rules of the other states, which do numWikiTokensSeen++, this one resets the counter to 0 on every token -- confirm that is intended.
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
276
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
277
{CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
279
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
283
"'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/}
284
"'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/}
285
"''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/}
286
"===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/}
287
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
288
//we can have links inside, let those override
289
{DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
290
{DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
291
{EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
294
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
296
.|{WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
303
// NOTE(review): the only {INTERNAL_LINK} macro definition visible in this file is a commented-out one further down -- confirm whether this rule is live.
{INTERNAL_LINK} { return currentTokType; }
305
{CITATION} { return currentTokType; }
306
{CATEGORY} { return currentTokType; }
308
{BOLD} { return currentTokType; }
309
{ITALICS} { return currentTokType; }
310
{BOLD_ITALICS} { return currentTokType; }
311
{HEADING} { return currentTokType; }
312
{SUB_HEADING} { return currentTokType; }
317
/** Ignore the rest */
318
. | {WHITESPACE}|{TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
321
//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
322
//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
323
//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
324
//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
325
//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
326
//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
327
//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
328
//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
329
//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
330
//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}