2
***********************************************************************
4
* Copyright (C) 2006, International Business Machines Corporation and
5
* others. All Rights Reserved.
7
***********************************************************************
11
* This tool produces the character usage frequency statistics for the Big5
12
* Chinese charset, for use by the ICU charset detectors.
14
* usage: java BIG5Tool [-d] [-sjis] [directory path]
16
* -d: Produce the data in a form to be exported to the ICU implementation
17
* Default is to produce an informative dump.
19
* -sjis Do Shift_JIS. The structure of sjis is very similar to Big5.
22
* Source directory for the text files to be analyzed.
23
* All files in the specified directory must be in the Big5 encoding.
27
package com.ibm.icu.dev.tool.charsetdet.mbcs;
30
import java.io.FileInputStream;
31
import java.util.ArrayList;
32
import java.util.Arrays;
33
import java.util.HashMap;
34
import java.util.List;
37
public class BIG5Tool {
39
// The file buffer and file data length need to be out in class member variables
40
// so that the code lifted from charSet detection for scanning the multi-byte chars
41
// can see them conveniently.
42
byte [] buf = new byte[1000000];
45
boolean option_d = false; // data option. Produce exportable data
46
boolean option_v = true; // verbose informaional output.
47
boolean sjis = false; // True if input text files are Shift_JIS encoded.
51
public static void main(String[] args) {
52
BIG5Tool This = new BIG5Tool();
58
void Main(String[] args) {
62
// Command Line Option Handling
64
String dirName = null;
65
for (i=0; i<args.length; i++) {
66
if (args[i].equals("-d")) {
71
if (args[i].equals("-sjis")) {
75
if (args[i].startsWith("-")) {
76
System.err.println("Unrecognized option: " + args[i]);
79
if (dirName == null) {
82
System.err.println("Unrecognized option: " + dirName);
86
if (dirName == null) {
91
// Verify that the specified directory exists.
93
File dir = new File(dirName);
94
if (dir.isDirectory() == false) {
95
System.err.println("\"" + dirName + "\" is not a directory");
103
// Collect statistics from all ordinary files in a specified directory.
105
void processDir(File dir) {
106
int totalMbcsChars = 0;
107
HashMap m = new HashMap(10000);
110
System.out.println(dir.getName());
111
File[] files = dir.listFiles();
112
for (i=0; i<files.length; i++) {
114
if (files[i].isFile()) {
115
FileInputStream is = new FileInputStream(files[i]);
116
fileSize = is.read(buf);
118
System.out.println(files[i].getPath());
119
System.out.println(" " + fileSize + " bytes.");
121
iteratedChar ichar = new iteratedChar();
123
int fileMbcsChars = 0;
126
while (nextChar(ichar)) {
127
if (ichar.error == true) {
132
if (ichar.charValue > 255) {
136
if (ichar.charValue <= 255) {
137
// Don't keep occurrence statistics for the single byte range
142
// Occurrence-frequency statistics are accumulated in a map.
144
ChEl keyEl = new ChEl(ichar.charValue, 0);
145
ChEl valEl = (ChEl)m.get(keyEl);
153
System.out.println(" " + fileChars + " Chars");
154
System.out.println(" " + fileMbcsChars + " mbcs Chars");
155
System.out.println(" " + errs + " errors");
156
System.out.println("\n");
160
catch (Exception e) {
161
System.err.println("Exception:" + e);
167
// We've processed through all of the files.
168
// sort and dump out the frequency statistics.
170
Object [] encounteredChars = m.values().toArray();
171
Arrays.sort(encounteredChars);
172
int cumulativeChars = 0;
173
int cumulativePercent = 0;
175
System.out.println("# <char code> <occurences> <Cumulative %>");
176
for (i=0; i<encounteredChars.length; i++) {
177
ChEl c = (ChEl)encounteredChars[i];
178
cumulativeChars += c.occurences;
179
cumulativePercent = cumulativeChars*100/totalMbcsChars;
180
System.out.println(i + " " + Integer.toHexString(c.charCode) + " "
181
+ c.occurences + " " + cumulativePercent);
186
// Output the list of characters formatted for pasting into a
187
// Java source code array initializer.
188
// Resort into order based on the character code value, not
189
// on frequency of occurrence.
191
List charList = new ArrayList();
193
for (i=0; i<100 && cumulativePercent<50; i++) {
194
ChEl c = (ChEl)encounteredChars[i];
195
cumulativeChars += c.occurences;
196
cumulativePercent = cumulativeChars*100/totalMbcsChars;
197
charList.add(new Integer(c.charCode));
199
Object [] sortedChars = charList.toArray();
200
Arrays.sort(sortedChars);
202
System.out.print(" {");
203
for (i=0; i<sortedChars.length; i++) {
205
System.out.print(", ");
207
System.out.print("\n ");
210
int cp = ((Integer)sortedChars[i]).intValue();
211
System.out.print("0x" + Integer.toHexString(cp));
213
System.out.println("};");
218
// This is a little class containing a
219
// multi-byte character value and an occurrence count for that char.
220
// Instances of this class are kept in the collection that accumulates statistics
222
// WARNING: this class's natural ordering (from Comparable) and equals()
225
static class ChEl implements Comparable {
234
// Equals needs to work with a map, with the charCode as the key.
235
// For insertion/lookup, we care about the char code only, not the occurrence count.
236
public boolean equals(Object other) {
237
ChEl o = (ChEl)other;
238
return o.charCode == this.charCode;
241
// Hashcode needs to be compatible with equals
242
// We're using this in a hashMap!
243
public int hashCode() {
247
// We want to be able to sort the results by frequency of occurrence.
248
// Compare backwards. We want most frequent chars first.
249
public int compareTo(Object other) {
250
ChEl o = (ChEl)other;
251
return (this.occurences> o.occurences? -1 :
252
(this.occurences==o.occurences? 0 : 1));
258
// iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs
259
// Pulls out one logical char according to the rules of EUC encoding.
262
int charValue = 0; // The char value is a value from the encoding.
263
// Its meaning is not well defined, other than
264
// different encodings
267
boolean error = false;
268
boolean done = false;
279
if (nextIndex >= fileSize) {
283
int byteValue = (int)buf[nextIndex++] & 0x00ff;
289
boolean nextChar(iteratedChar it) {
290
it.index = it.nextIndex;
296
firstByte = it.charValue = it.nextByte();
298
// Ran off the end of the input data
302
if (firstByte <= 0x0080 ||
303
(sjis && firstByte>=0x00a0 && firstByte< 0x00e0) ||
304
(sjis && firstByte>=0x00fd && firstByte<=0x00ff)) {
309
secondByte = it.nextByte();
310
it.charValue = (it.charValue << 8) | secondByte;
312
if (secondByte < 0x40 ||
313
secondByte == 0x007f ||
314
secondByte == 0x00ff ||
315
sjis && secondByte >= 0x00fd) {
320
System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
324
return (it.done == false);