1
package org.apache.lucene.queryParser.standard.parser;
4
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import java.util.Locale;
22
import org.apache.lucene.messages.MessageImpl;
23
import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
24
import org.apache.lucene.queryParser.core.parser.EscapeQuerySyntax;
25
import org.apache.lucene.queryParser.core.util.UnescapedCharSequence;
29
public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax {
31
private static final char[] wildcardChars = { '*', '?' };
33
private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" };
35
private static final String[] escapableTermChars = { "\"", "<", ">", "=",
36
"!", "(", ")", "^", "[", "{", ":", "]", "}", "~" };
38
// TODO: check what to do with these "*", "?", "\\"
39
private static final String[] escapableQuotedChars = { "\"" };
40
private static final String[] escapableWhiteChars = { " ", "\t", "\n", "\r",
41
"\f", "\b", "\u3000" };
42
private static final String[] escapableWordTokens = { "AND", "OR", "NOT",
43
"TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };
45
private static final CharSequence escapeChar(CharSequence str, Locale locale) {
46
if (str == null || str.length() == 0)
49
CharSequence buffer = str;
51
// regular escapable Char for terms
52
for (int i = 0; i < escapableTermChars.length; i++) {
53
buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
57
// First Character of a term as more escaping chars
58
for (int i = 0; i < escapableTermExtraFirstChars.length; i++) {
59
if (buffer.charAt(0) == escapableTermExtraFirstChars[i].charAt(0)) {
60
buffer = "\\" + buffer.charAt(0)
61
+ buffer.subSequence(1, buffer.length());
69
private final CharSequence escapeQuoted(CharSequence str, Locale locale) {
70
if (str == null || str.length() == 0)
73
CharSequence buffer = str;
75
for (int i = 0; i < escapableQuotedChars.length; i++) {
76
buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
82
private static final CharSequence escapeTerm(CharSequence term, Locale locale) {
86
// Escape single Chars
87
term = escapeChar(term, locale);
88
term = escapeWhiteChar(term, locale);
90
// Escape Parser Words
91
for (int i = 0; i < escapableWordTokens.length; i++) {
92
if (escapableWordTokens[i].equalsIgnoreCase(term.toString()))
99
* replace with ignore case
102
* string to get replaced
104
* the old character sequence in lowercase
106
* the new character to prefix sequence1 in return string.
107
* @return the new String
109
private static CharSequence replaceIgnoreCase(CharSequence string,
110
CharSequence sequence1, CharSequence escapeChar, Locale locale) {
111
if (escapeChar == null || sequence1 == null || string == null)
112
throw new NullPointerException();
115
int count = string.length();
116
int sequence1Length = sequence1.length();
117
if (sequence1Length == 0) {
118
StringBuilder result = new StringBuilder((count + 1)
119
* escapeChar.length());
120
result.append(escapeChar);
121
for (int i = 0; i < count; i++) {
122
result.append(string.charAt(i));
123
result.append(escapeChar);
125
return result.toString();
129
StringBuilder result = new StringBuilder();
130
char first = sequence1.charAt(0);
131
int start = 0, copyStart = 0, firstIndex;
132
while (start < count) {
133
if ((firstIndex = string.toString().toLowerCase(locale).indexOf(first,
136
boolean found = true;
137
if (sequence1.length() > 1) {
138
if (firstIndex + sequence1Length > count)
140
for (int i = 1; i < sequence1Length; i++) {
141
if (string.toString().toLowerCase(locale).charAt(firstIndex + i) != sequence1
149
result.append(string.toString().substring(copyStart, firstIndex));
150
result.append(escapeChar);
151
result.append(string.toString().substring(firstIndex,
152
firstIndex + sequence1Length));
153
copyStart = start = firstIndex + sequence1Length;
155
start = firstIndex + 1;
158
if (result.length() == 0 && copyStart == 0)
160
result.append(string.toString().substring(copyStart));
161
return result.toString();
165
* escape all tokens that are part of the parser syntax on a given string
168
* string to get replaced
170
* locale to be used when performing string compares
171
* @return the new String
173
private static final CharSequence escapeWhiteChar(CharSequence str,
175
if (str == null || str.length() == 0)
178
CharSequence buffer = str;
180
for (int i = 0; i < escapableWhiteChars.length; i++) {
181
buffer = replaceIgnoreCase(buffer, escapableWhiteChars[i].toLowerCase(),
187
public CharSequence escape(CharSequence text, Locale locale, Type type) {
188
if (text == null || text.length() == 0)
191
// escape wildcards and the escape char (this has to be perform before
193
// since we need to preserve the UnescapedCharSequence and escape the
194
// original escape chars
195
if (text instanceof UnescapedCharSequence) {
196
text = ((UnescapedCharSequence) text).toStringEscaped(wildcardChars);
198
text = new UnescapedCharSequence(text).toStringEscaped(wildcardChars);
201
if (type == Type.STRING) {
202
return escapeQuoted(text, locale);
204
return escapeTerm(text, locale);
209
* Returns a String where the escape char has been removed, or kept only once
210
* if there was a double escape.
212
* Supports escaped unicode characters, e. g. translates <code>A</code> to
216
public static UnescapedCharSequence discardEscapeChar(CharSequence input)
217
throws ParseException {
218
// Create char array to hold unescaped char sequence
219
char[] output = new char[input.length()];
220
boolean[] wasEscaped = new boolean[input.length()];
222
// The length of the output can be less than the input
223
// due to discarded escape chars. This variable holds
224
// the actual length of the output
227
// We remember whether the last processed character was
228
// an escape character
229
boolean lastCharWasEscapeChar = false;
231
// The multiplier the current unicode digit must be multiplied with.
232
// E. g. the first digit must be multiplied with 16^3, the second with
234
int codePointMultiplier = 0;
236
// Used to calculate the codepoint of the escaped unicode character
239
for (int i = 0; i < input.length(); i++) {
240
char curChar = input.charAt(i);
241
if (codePointMultiplier > 0) {
242
codePoint += hexToInt(curChar) * codePointMultiplier;
243
codePointMultiplier >>>= 4;
244
if (codePointMultiplier == 0) {
245
output[length++] = (char) codePoint;
248
} else if (lastCharWasEscapeChar) {
249
if (curChar == 'u') {
250
// found an escaped unicode character
251
codePointMultiplier = 16 * 16 * 16;
253
// this character was escaped
254
output[length] = curChar;
255
wasEscaped[length] = true;
258
lastCharWasEscapeChar = false;
260
if (curChar == '\\') {
261
lastCharWasEscapeChar = true;
263
output[length] = curChar;
269
if (codePointMultiplier > 0) {
270
throw new ParseException(new MessageImpl(
271
QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
274
if (lastCharWasEscapeChar) {
275
throw new ParseException(new MessageImpl(
276
QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
279
return new UnescapedCharSequence(output, wasEscaped, 0, length);
282
/** Returns the numeric value of the hexadecimal character */
283
private static final int hexToInt(char c) throws ParseException {
284
if ('0' <= c && c <= '9') {
286
} else if ('a' <= c && c <= 'f') {
288
} else if ('A' <= c && c <= 'F') {
291
throw new ParseException(new MessageImpl(
292
QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));