2
**********************************************************************
3
* Copyright (C) 2005-2008, International Business Machines
4
* Corporation and others. All Rights Reserved.
5
**********************************************************************
11
#include "unicode/utypes.h"
13
#if !UCONFIG_NO_CONVERSION
19
// "Character" iterated character class.
20
// Recognizers for specific mbcs encodings make their "characters" available
21
// by providing a nextChar() function that fills in an instance of IteratedChar
22
// with the next char from the input.
23
// The returned characters are not converted to Unicode, but remain as the raw
24
// bytes (concatenated into an int) from the codepage data.
26
// For Asian charsets, use the raw input rather than the input that has been
27
// stripped of markup. Detection only considers multi-byte chars, effectively
28
// stripping markup anyway, and double byte chars do occur in markup too.
30
// Holds one "character" pulled from multi-byte input. Per the notes above,
// the value is kept as raw codepage bytes and is not converted to Unicode.
class IteratedChar : public UMemory
33
uint32_t charValue; // 1-4 bytes from the raw input data
42
// NOTE(review): presumably hands back the next raw byte of `det`; the
// end-of-input convention is not visible in this header - confirm in the .cpp.
int32_t nextByte(InputText* det);
46
// Abstract base for the multi-byte charset recognizers declared in this header.
// getName(), getLanguage(), match() and nextChar() are all pure here; the
// encoding-specific subclasses below provide them.
class CharsetRecog_mbcs : public CharsetRecognizer {
50
* Test the match of this charset with the input text data
51
* which is obtained via the CharsetDetector object.
53
* @param det The CharsetDetector, which contains the input text
54
* to be checked for being in this charset.
55
* @return Two values packed into one int (Damn java, anyhow)
57
* bits 0-7: the match confidence, ranging from 0-100
59
* bits 8-15: The match reason, an enum-like value.
61
// NOTE(review): the doc above describes `det` as a CharsetDetector, but the
// declared type is InputText*. `commonChars` / `commonCharsLen` are an array of
// 16-bit values and its length; they are not described above - presumably the
// charset's frequently used characters, TODO confirm against the implementation.
int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen);
65
virtual ~CharsetRecog_mbcs();
68
* Get the IANA name of this charset.
69
* @return the charset name.
72
// The next three members are declared pure (= 0) at this level.
const char *getName() const = 0;
73
const char *getLanguage() const = 0;
74
int32_t match(InputText* det) = 0;
77
* Get the next character (however many bytes it is) from the input data
78
* Subclasses for specific charset encodings must implement this function
79
* to get characters according to the rules of their encoding scheme.
81
* This function is not a method of class IteratedChar only because
82
* that would require a lot of extra derived classes, which is awkward.
83
* @param it The IteratedChar "struct" into which the returned char is placed.
84
* @param det The charset detector, which is needed to get at the input byte data
85
* being iterated over.
86
* @return True if a character was returned, false at end of input.
88
// NOTE(review): the doc above names the second parameter `det`; the
// declaration calls it `textIn` and its type is InputText*.
virtual UBool nextChar(IteratedChar *it, InputText *textIn) = 0;
94
* Shift-JIS charset recognizer.
97
// Derives directly from CharsetRecog_mbcs and declares all four members that
// the base leaves pure: nextChar(), match(), getName() and getLanguage().
class CharsetRecog_sjis : public CharsetRecog_mbcs {
99
virtual ~CharsetRecog_sjis();
101
// Shift-JIS specific character iteration; returns UBool like the base declaration.
UBool nextChar(IteratedChar *it, InputText *det);
103
int32_t match(InputText *det);
105
const char *getName() const;
106
const char *getLanguage() const;
112
* EUC charset recognizers. One abstract class that provides the common function
113
* for getting the next character according to the EUC encoding scheme,
114
* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
117
// Intermediate base for the EUC recognizers. It is still abstract:
// getName(), getLanguage() and match() are re-declared pure, and only
// nextChar() is declared without "= 0" at this level.
class CharsetRecog_euc : public CharsetRecog_mbcs
120
virtual ~CharsetRecog_euc();
122
const char *getName() const = 0;
123
const char *getLanguage() const = 0;
125
int32_t match(InputText* det) = 0;
128
* Get the next character value for EUC based encodings.
129
* Character "value" is simply the raw bytes that make up the character
130
* packed into an int.
132
// Shared by the EUC subclasses below, which do not redeclare it in the lines visible here.
UBool nextChar(IteratedChar *it, InputText *det);
136
* The charset recognizer for EUC-JP. A singleton instance of this class
137
* is created and kept by the public CharsetDetector class
139
// Declares getName(), getLanguage() and match(), which CharsetRecog_euc
// leaves pure. nextChar() is not redeclared in the lines visible here.
class CharsetRecog_euc_jp : public CharsetRecog_euc
142
virtual ~CharsetRecog_euc_jp();
144
const char *getName() const;
145
const char *getLanguage() const;
147
int32_t match(InputText *det);
151
* The charset recognizer for EUC-KR. A singleton instance of this class
152
* is created and kept by the public CharsetDetector class
154
// Declares getName(), getLanguage() and match(), which CharsetRecog_euc
// leaves pure. nextChar() is not redeclared in the lines visible here.
class CharsetRecog_euc_kr : public CharsetRecog_euc
157
virtual ~CharsetRecog_euc_kr();
159
const char *getName() const;
160
const char *getLanguage() const;
162
int32_t match(InputText *det);
167
* Big5 charset recognizer.
170
// Derives directly from CharsetRecog_mbcs and declares all four members that
// the base leaves pure: nextChar(), getName(), getLanguage() and match().
class CharsetRecog_big5 : public CharsetRecog_mbcs
173
virtual ~CharsetRecog_big5();
175
// Big5 specific character iteration; returns UBool like the base declaration.
UBool nextChar(IteratedChar* it, InputText* det);
177
const char *getName() const;
178
const char *getLanguage() const;
180
int32_t match(InputText *det);
186
* GB-18030 recognizer. Uses simplified Chinese statistics.
189
// Derives directly from CharsetRecog_mbcs and declares all four members that
// the base leaves pure: nextChar(), getName(), getLanguage() and match().
class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
192
virtual ~CharsetRecog_gb_18030();
194
// GB-18030 specific character iteration; returns UBool like the base declaration.
UBool nextChar(IteratedChar* it, InputText* det);
196
const char *getName() const;
197
const char *getLanguage() const;
199
int32_t match(InputText *det);
205
#endif /* __CSRMBCS_H */