2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.lucene.analysis.cn.smart.hhmm;
20
import java.io.UnsupportedEncodingException;
24
* SmartChineseAnalyzer abstract dictionary implementation.
27
* Contains methods for dealing with GB2312 encoding.
29
* @lucene.experimental
31
abstract class AbstractDictionary {
33
* First Chinese Character in GB2312 (15 * 94)
34
* Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
36
public static final int GB2312_FIRST_CHAR = 1410;
39
* Last Chinese Character in GB2312 (87 * 94).
40
* Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
42
public static final int GB2312_CHAR_NUM = 87 * 94;
45
* Dictionary data contains 6768 Chinese characters with frequency statistics.
47
public static final int CHAR_NUM_IN_FILE = 6768;
49
// =====================================================
50
// code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
51
// B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
52
// B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
53
// B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
54
// B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
55
// B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
56
// B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
57
// =====================================================
59
// GB2312 character set:
67
// 08 63 Phonetic Symbols
68
// 09 76 Drawing Symbols
70
// 16-55 3755 Plane 1, in pinyin order
71
// 56-87 3008 Plane 2, in radical/stroke order
73
// ======================================================
77
* Transcode from GB2312 ID to Unicode
80
* GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
81
* Some regions are unassigned (reserved).
84
* @param ccid GB2312 id
85
* @return unicode String
87
public String getCCByGB2312Id(int ccid) {
88
if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
90
int cc1 = ccid / 94 + 161;
91
int cc2 = ccid % 94 + 161;
92
byte[] buffer = new byte[2];
93
buffer[0] = (byte) cc1;
94
buffer[1] = (byte) cc2;
96
String cchar = new String(buffer, "GB2312");
98
} catch (UnsupportedEncodingException e) {
104
* Transcode from Unicode to GB2312
106
* @param ch input character in Unicode, or character in Basic Latin range.
107
* @return position in GB2312
109
public short getGB2312Id(char ch) {
111
byte[] buffer = Character.toString(ch).getBytes("GB2312");
112
if (buffer.length != 2) {
113
// Should be a two-byte character
116
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
117
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
118
// Therefore, each code page only has 16*6-2=94 characters.
119
return (short) (b0 * 94 + b1);
120
} catch (UnsupportedEncodingException e) {
127
* 64-bit FNV-1a style hash function (uses the 64-bit FNV prime and offset basis)
129
* @param c input character
132
public long hash1(char c) {
133
final long p = 1099511628211L;
134
long hash = 0xcbf29ce484222325L;
135
hash = (hash ^ (c & 0x00FF)) * p;
136
hash = (hash ^ (c >> 8)) * p;
146
* 64-bit FNV-1a style hash function (uses the 64-bit FNV prime and offset basis)
148
* @param carray character array
151
public long hash1(char carray[]) {
152
final long p = 1099511628211L;
153
long hash = 0xcbf29ce484222325L;
154
for (int i = 0; i < carray.length; i++) {
156
hash = (hash ^ (d & 0x00FF)) * p;
157
hash = (hash ^ (d >> 8)) * p;
160
// hash += hash << 13;
161
// hash ^= hash >> 7;
162
// hash += hash << 3;
163
// hash ^= hash >> 17;
164
// hash += hash << 5;
169
* djb2 hash algorithm, this algorithm (k=33) was first reported by dan
170
* bernstein many years ago in comp.lang.c. another version of this algorithm
171
* (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
172
* the magic of number 33 (why it works better than many other constants,
173
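* prime or not) has never been adequately explained.
* NOTE(review): in the code below {@code +} binds tighter than {@code &} and {@code >>}, so the mask
* and the shift are applied to the whole running sum rather than to the character alone. This differs
* from textbook djb2; confirm whether existing data depends on these hash values before changing it.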
178
public int hash2(char c) {
182
hash = ((hash << 5) + hash) + c & 0x00FF;
183
hash = ((hash << 5) + hash) + c >> 8;
189
* djb2 hash algorithm, this algorithm (k=33) was first reported by dan
190
* bernstein many years ago in comp.lang.c. another version of this algorithm
191
* (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
192
* the magic of number 33 (why it works better than many other constants,
193
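* prime or not) has never been adequately explained.
* NOTE(review): in the code below {@code +} binds tighter than {@code &} and {@code >>}, so the mask
* and the shift are applied to the whole running sum rather than to the character alone. This differs
* from textbook djb2; confirm whether existing data depends on these hash values before changing it.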
195
* @param carray character array
198
public int hash2(char carray[]) {
202
for (int i = 0; i < carray.length; i++) {
204
hash = ((hash << 5) + hash) + d & 0x00FF;
205
hash = ((hash << 5) + hash) + d >> 8;