~slub.team/goobi-indexserver/3.x

« back to all changes in this revision

Viewing changes to lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex

  • Committer: Sebastian Meyer
  • Date: 2012-08-03 09:12:40 UTC
  • Revision ID: sebastian.meyer@slub-dresden.de-20120803091240-x6861b0vabq1xror
Remove Lucene and Solr source code and add patches instead
Fix Bug #985487: Auto-suggestion for the search interface

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
package org.apache.lucene.analysis.wikipedia;
2
 
 
3
 
/**
4
 
 * Licensed to the Apache Software Foundation (ASF) under one or more
5
 
 * contributor license agreements.  See the NOTICE file distributed with
6
 
 * this work for additional information regarding copyright ownership.
7
 
 * The ASF licenses this file to You under the Apache License, Version 2.0
8
 
 * (the "License"); you may not use this file except in compliance with
9
 
 * the License.  You may obtain a copy of the License at
10
 
 *
11
 
 *     http://www.apache.org/licenses/LICENSE-2.0
12
 
 *
13
 
 * Unless required by applicable law or agreed to in writing, software
14
 
 * distributed under the License is distributed on an "AS IS" BASIS,
15
 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 
 * See the License for the specific language governing permissions and
17
 
 * limitations under the License.
18
 
 */
19
 
 
20
 
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21
 
 
22
 
%%
23
 
 
24
 
%class WikipediaTokenizerImpl
25
 
%unicode 3.0
26
 
%integer
27
 
%function getNextToken
28
 
%pack
29
 
%char
30
 
 
31
 
%{
32
 
 
33
 
// Token type codes returned by getNextToken(); each one aliases the corresponding *_ID constant declared on WikipediaTokenizer.
public static final int ALPHANUM          = WikipediaTokenizer.ALPHANUM_ID;
34
 
public static final int APOSTROPHE        = WikipediaTokenizer.APOSTROPHE_ID;
35
 
public static final int ACRONYM           = WikipediaTokenizer.ACRONYM_ID;
36
 
public static final int COMPANY           = WikipediaTokenizer.COMPANY_ID;
37
 
public static final int EMAIL             = WikipediaTokenizer.EMAIL_ID;
38
 
public static final int HOST              = WikipediaTokenizer.HOST_ID;
39
 
public static final int NUM               = WikipediaTokenizer.NUM_ID;
40
 
public static final int CJ                = WikipediaTokenizer.CJ_ID;
41
 
public static final int INTERNAL_LINK     = WikipediaTokenizer.INTERNAL_LINK_ID;
42
 
public static final int EXTERNAL_LINK     = WikipediaTokenizer.EXTERNAL_LINK_ID;
43
 
public static final int CITATION          = WikipediaTokenizer.CITATION_ID;
44
 
public static final int CATEGORY          = WikipediaTokenizer.CATEGORY_ID;
45
 
public static final int BOLD              = WikipediaTokenizer.BOLD_ID;
46
 
public static final int ITALICS           = WikipediaTokenizer.ITALICS_ID;
47
 
public static final int BOLD_ITALICS      = WikipediaTokenizer.BOLD_ITALICS_ID;
48
 
public static final int HEADING           = WikipediaTokenizer.HEADING_ID;
49
 
public static final int SUB_HEADING       = WikipediaTokenizer.SUB_HEADING_ID;
50
 
public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;
51
 
 
52
 
 
53
 
// Token type handed back by the rules of the wiki-markup states; assigned by whichever rule opens the markup.
private int currentTokType;
54
 
// Toggled by the '' rule in YYINITIAL (0 -> 1 when quote markup opens, back to 0 otherwise); also reset to 0 when STRING sees a closing sequence or a link opener.
private int numBalanced = 0;
55
 
// Value reported by getPositionIncrement(); most rules set it to 1, the external-link rules set it to 0 for the first link-text token and for the closing ']'.
private int positionInc = 1;
56
 
// Number of ALPHANUM tokens returned so far inside the current external link; cleared when a link closes.
private int numLinkToks = 0;
57
 
//Whenever we start on a new Wiki reserved token (category, link, etc.) this value will be 0, otherwise it will be the number of tokens seen
58
 
//this can be useful for detecting when a new reserved token is encountered
59
 
//see https://issues.apache.org/jira/browse/LUCENE-1133
60
 
private int numWikiTokensSeen = 0;
61
 
 
62
 
public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;
63
 
 
64
 
/**
65
 
Returns how many tokens have been seen so far inside the current wiki construct (category, link, etc.).
66
 
@return the number of tokens seen inside the context of wiki syntax; 0 right after a rule has opened a new construct.
67
 
**/
68
 
public final int getNumWikiTokensSeen(){
69
 
  return numWikiTokensSeen;
70
 
}
71
 
 
72
 
/**
 * Exposes the scanner's {@code yychar} counter, which JFlex maintains because
 * this specification enables {@code %char}.
 *
 * @return the current value of {@code yychar}; per the JFlex manual this is the
 *         number of characters between the start of input and the start of
 *         the current match
 */
public final int yychar()
73
 
{
74
 
    return yychar;
75
 
}
76
 
 
77
 
/**
 * Returns the position increment recorded by the most recently executed rule.
 *
 * @return the current value of {@code positionInc}; the rules in this
 *         specification only ever assign 0 or 1
 */
public final int getPositionIncrement(){
78
 
  return positionInc;
79
 
}
80
 
 
81
 
/**
82
 
 * Fills Lucene token with the current token text, i.e. the characters of
 * zzBuffer from zzStartRead up to (but not including) zzMarkedPos.
 *
 * @param t term attribute that receives a copy of the matched characters
83
 
 */
84
 
final void getText(CharTermAttribute t) {
85
 
  t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
86
 
}
87
 
 
88
 
/**
 * Appends the text of the current match to the given builder. Despite the
 * name, existing builder content is kept; the match is added at the end.
 *
 * @param buffer builder that receives the matched characters
 * @return the number of characters appended (zzMarkedPos - zzStartRead)
 */
final int setText(StringBuilder buffer){
89
 
  // Length of the current match inside zzBuffer.
  int length = zzMarkedPos - zzStartRead;
90
 
  buffer.append(zzBuffer, zzStartRead, length);
91
 
  return length;
92
 
}
93
 
 
94
 
 
95
 
%}
96
 
 
97
 
// basic word: a sequence of digits & letters
98
 
ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+
99
 
 
100
 
// internal apostrophes: O'Reilly, you're, O'Reilly's
101
 
// use a post-filter to remove possessives
102
 
APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
103
 
 
104
 
// acronyms: U.S.A., I.B.M., etc.
105
 
// use a post-filter to remove dots
106
 
ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+
107
 
 
108
 
// company names like AT&T and Excite@Home.
109
 
COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
110
 
 
111
 
// email addresses
112
 
EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
113
 
 
114
 
// hostname
115
 
HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
116
 
 
117
 
// floating point, serial, model numbers, ip addresses, etc.
118
 
// every other segment must have at least one digit
119
 
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
120
 
           | {DIGIT}+ {P} {DIGIT}+
121
 
           | {HAS_DIGIT} {P} {ALPHANUM}
122
 
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
123
 
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
124
 
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
125
 
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
126
 
 
127
 
TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
128
 
 
129
 
// punctuation
130
 
P                = ("_"|"-"|"/"|"."|",")
131
 
 
132
 
// at least one digit
133
 
HAS_DIGIT  =
134
 
    ({LETTER}|{DIGIT})*
135
 
    {DIGIT}
136
 
    ({LETTER}|{DIGIT})*
137
 
 
138
 
ALPHA      = ({LETTER})+
139
 
 
140
 
 
141
 
LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
142
 
 
143
 
DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
144
 
 
145
 
KOREAN     = [\uac00-\ud7af\u1100-\u11ff]
146
 
 
147
 
// Chinese, Japanese
148
 
// U+3040-U+318F is listed once; it already spans the Hiragana (U+3040-U+309F), Katakana (U+30A0-U+30FF) and Bopomofo (U+3100-U+312F) ranges.
CJ         = [\u3040-\u318f\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
149
 
 
150
 
WHITESPACE = \r\n | [ \r\n\t\f]
151
 
 
152
 
//Wikipedia
153
 
DOUBLE_BRACKET = "["{2}
154
 
DOUBLE_BRACKET_CLOSE = "]"{2}
155
 
DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
156
 
EXTERNAL_LINK = "["
157
 
TWO_SINGLE_QUOTES = "'"{2}
158
 
CITATION = "<ref>"
159
 
CITATION_CLOSE = "</ref>"
160
 
INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
161
 
 
162
 
DOUBLE_BRACE = "{"{2}
163
 
DOUBLE_BRACE_CLOSE = "}"{2}
164
 
PIPE = "|"
165
 
DOUBLE_EQUALS = "="{2}
166
 
 
167
 
 
168
 
%state CATEGORY_STATE
169
 
%state INTERNAL_LINK_STATE
170
 
%state EXTERNAL_LINK_STATE
171
 
 
172
 
%state TWO_SINGLE_QUOTES_STATE
173
 
%state THREE_SINGLE_QUOTES_STATE
174
 
%state FIVE_SINGLE_QUOTES_STATE
175
 
%state DOUBLE_EQUALS_STATE
176
 
%state DOUBLE_BRACE_STATE
177
 
%state STRING
178
 
 
179
 
%%
180
 
 
181
 
// Plain-text tokens outside wiki markup: every match is returned with its own type and a position increment of 1.
<YYINITIAL>{ALPHANUM}                                                     {positionInc = 1; return ALPHANUM; }
182
 
<YYINITIAL>{APOSTROPHE}                                                   {positionInc = 1; return APOSTROPHE; }
183
 
<YYINITIAL>{ACRONYM}                                                      {positionInc = 1; return ACRONYM; }
184
 
<YYINITIAL>{COMPANY}                                                      {positionInc = 1; return COMPANY; }
185
 
<YYINITIAL>{EMAIL}                                                        {positionInc = 1; return EMAIL; }
186
 
<YYINITIAL>{NUM}                                                          {positionInc = 1; return NUM; }
187
 
<YYINITIAL>{HOST}                                                         {positionInc = 1; return HOST; }
188
 
<YYINITIAL>{CJ}                                                           {positionInc = 1; return CJ; }
189
 
 
190
 
//wikipedia
191
 
// Openers for wiki markup. Each rule resets numWikiTokensSeen, sets positionInc to 1 and switches to the state
// that scans the construct's contents; the markup itself produces no token (the actions break instead of returning).
<YYINITIAL>{
192
 
  //First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
193
 
  //set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
194
 
  //tokens within the link are incremented
195
 
  {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;}
196
 
  {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;}
197
 
  // A single '[' starts an external link; the type is EXTERNAL_LINK_URL so the URL matched next is reported as such.
  {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;}
198
 
  // '' only enters the quote state when numBalanced is 0; otherwise it just clears numBalanced and stays in YYINITIAL.
  {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;}
199
 
  {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;}
200
 
  // Both "{{" and "<ref>" are scanned in DOUBLE_BRACE_STATE and reported with the CITATION type.
  {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
201
 
  {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
202
 
//ignore
203
 
  . | {WHITESPACE} |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
204
 
}
205
 
 
206
 
// Contents of [[...]]: every ALPHANUM is returned with the type chosen by the opener until "]]" returns to YYINITIAL.
<INTERNAL_LINK_STATE>{
207
 
//First {ALPHANUM} is always the link, set position to 0 for these
// NOTE(review): no rule in this state assigns 0 to positionInc; the ALPHANUM rule leaves it untouched, so the
// value set by the previously executed rule is what gets reported. Confirm whether the comment above is still accurate.
208
 
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
209
 
  {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
210
 
  {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
211
 
  //ignore
212
 
  . | {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
213
 
}
214
 
 
215
 
// Contents of [...]: an http(s) URL followed by the link text, terminated by ']'.
<EXTERNAL_LINK_STATE>{
216
 
//increment the link token, but then don't increment the tokens after that which are still in the link
217
 
  // The URL is returned with whatever type the opening rule stored in currentTokType, at position increment 1.
  ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
218
 
  // Link text: the first word (numLinkToks == 0) gets increment 0, later words get 1; all are typed EXTERNAL_LINK.
  {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
219
 
  "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
220
 
  {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
221
 
}
222
 
 
223
 
// Contents of [[Category:...]]: every ALPHANUM is returned with the current token type and counted in
// numWikiTokensSeen; "]]" returns to YYINITIAL.
<CATEGORY_STATE>{
224
 
  {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
225
 
  {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
226
 
  //ignore
227
 
  . | {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
228
 
}
229
 
//italics
230
 
// Entered after '' in YYINITIAL. Further quotes upgrade the markup ('' + ' -> bold, '' + ''' -> bold italics);
// the first word otherwise starts an italics run that continues in STRING.
<TWO_SINGLE_QUOTES_STATE>{
231
 
  "'" {currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;}
232
 
   "'''" {currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;}
233
 
   {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/}
234
 
   //we can have links inside, let those override
235
 
   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
236
 
   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
237
 
   // NOTE(review): the YYINITIAL '[' opener stores EXTERNAL_LINK_URL, this one stores EXTERNAL_LINK, so a URL
   // matched after this opener is reported as EXTERNAL_LINK. Confirm the difference is intended.
   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
238
 
 
239
 
   //ignore
240
 
  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
241
 
}
242
 
//bold
243
 
// Entered once three quotes have been seen (currentTokType is BOLD at that point). The first word is returned
// with the current type and the rest of the run is scanned in STRING.
<THREE_SINGLE_QUOTES_STATE>{
244
 
  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
245
 
  //we can have links inside, let those override
246
 
   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
247
 
   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
248
 
   // NOTE(review): stores EXTERNAL_LINK whereas the YYINITIAL '[' opener stores EXTERNAL_LINK_URL; confirm intended.
   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
249
 
 
250
 
   //ignore
251
 
  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
252
 
 
253
 
}
254
 
//bold italics
255
 
// Entered once five quotes have been seen (currentTokType is BOLD_ITALICS at that point). The first word is
// returned with the current type and the rest of the run is scanned in STRING.
<FIVE_SINGLE_QUOTES_STATE>{
256
 
  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
257
 
  //we can have links inside, let those override
258
 
   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
259
 
   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
260
 
   // NOTE(review): stores EXTERNAL_LINK whereas the YYINITIAL '[' opener stores EXTERNAL_LINK_URL; confirm intended.
   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
261
 
 
262
 
   //ignore
263
 
  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
264
 
}
265
 
 
266
 
// Entered after "==" in YYINITIAL. A third '=' turns the construct into a sub-heading handled by STRING;
// otherwise words are returned as HEADING until the closing "==".
<DOUBLE_EQUALS_STATE>{
267
 
 "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;}
268
 
 {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
269
 
 {DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
270
 
  //ignore
271
 
  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
272
 
}
273
 
 
274
 
// Contents of "{{...}}" and "<ref>...</ref>": words are returned with the current token type until either
// closing sequence returns to YYINITIAL.
<DOUBLE_BRACE_STATE>{
275
 
  // NOTE(review): every other markup state increments numWikiTokensSeen per token; this rule assigns 0 instead,
  // so getNumWikiTokensSeen() never rises above 0 here. Confirm whether that is intended.
  {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
276
 
  {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
277
 
  {CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
278
 
   //ignore
279
 
  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
280
 
}
281
 
 
282
 
// Body of a bold / italics / bold-italics / sub-heading run. Words keep the type chosen by the opener; any of the
// closing sequences below ends the run and returns to YYINITIAL, whichever markup opened it.
<STRING> {
283
 
  "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/}
284
 
  "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/}
285
 
  "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/}
286
 
  "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/}
287
 
  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
288
 
  //we can have links inside, let those override
289
 
   {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
290
 
   {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
291
 
   // NOTE(review): stores EXTERNAL_LINK whereas the YYINITIAL '[' opener stores EXTERNAL_LINK_URL; confirm intended.
   {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
292
 
 
293
 
 
294
 
  // A '|' is returned as a token of the current type without touching numWikiTokensSeen or positionInc.
  {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
295
 
 
296
 
  .|{WHITESPACE}                                              { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
297
 
}
298
 
 
299
 
 
300
 
 
301
 
 
302
 
/*
303
 
{INTERNAL_LINK}                                                { return currentTokType; }
304
 
 
305
 
{CITATION}                                                { return currentTokType; }
306
 
{CATEGORY}                                                { return currentTokType; }
307
 
 
308
 
{BOLD}                                                { return currentTokType; }
309
 
{ITALICS}                                                { return currentTokType; }
310
 
{BOLD_ITALICS}                                                { return currentTokType; }
311
 
{HEADING}                                                { return currentTokType; }
312
 
{SUB_HEADING}                                                { return currentTokType; }
313
 
 
314
 
*/
315
 
//end wikipedia
316
 
 
317
 
/** Ignore the rest */
318
 
// Declared without a state prefix; per JFlex semantics for %state (inclusive) states this catch-all is also active
// in the states above, including those that define no '.' rule of their own (e.g. EXTERNAL_LINK_STATE).
. | {WHITESPACE}|{TAGS}                                                { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
319
 
 
320
 
 
321
 
//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
322
 
//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
323
 
//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
324
 
//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
325
 
//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
326
 
//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
327
 
//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
328
 
//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
329
 
//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
330
 
//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}