1
package org.apache.lucene.analysis.hi;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import static org.apache.lucene.analysis.util.StemmerUtil.*;
23
* Normalizer for Hindi.
25
* Normalizes text to remove some differences in spelling variations.
27
* Implements the Hindi-language specific algorithm specified in:
28
* <i>Word normalization in Indian languages</i>
29
* Prasad Pingali and Vasudeva Varma.
30
* http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
32
* with the following additions from <i>Hindi CLIR in Thirty Days</i>
33
* Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
34
* http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
36
* <li>Internal zero-width joiners and zero-width non-joiners are removed
37
* <li>In addition to chandrabindu, NA+halant is normalized to anusvara
41
public class HindiNormalizer {
43
* Normalize an input buffer of Hindi text
45
* @param s input buffer
46
* @param len length of input buffer
47
* @return length of input buffer after normalization
49
public int normalize(char s[], int len) {
51
for (int i = 0; i < len; i++) {
55
if (i + 1 < len && s[i + 1] == '\u094D') {
57
len = delete(s, i + 1, len);
60
// candrabindu -> bindu
66
len = delete(s, i, len);
102
// zwj/zwnj -> delete
105
len = delete(s, i, len);
110
len = delete(s, i, len);
113
// chandra/short -> replace
133
// long -> short ind. vowels
155
// long -> short dep. vowels