1
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* ***** BEGIN LICENSE BLOCK *****
3
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
5
* The contents of this file are subject to the Netscape Public License
6
* Version 1.1 (the "License"); you may not use this file except in
7
* compliance with the License. You may obtain a copy of the License at
8
* http://www.mozilla.org/NPL/
10
* Software distributed under the License is distributed on an "AS IS" basis,
11
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
* for the specific language governing rights and limitations under the
15
* The Original Code is mozilla.org code.
17
* The Initial Developer of the Original Code is
18
* Netscape Communications Corporation.
19
* Portions created by the Initial Developer are Copyright (C) 1998
20
* the Initial Developer. All Rights Reserved.
25
* Alternatively, the contents of this file may be used under the terms of
26
* either the GNU General Public License Version 2 or later (the "GPL"), or
27
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28
* in which case the provisions of the GPL or the LGPL are applicable instead
29
* of those above. If you wish to allow use of your version of this file only
30
* under the terms of either the GPL or the LGPL, and not to allow others to
31
* use your version of this file under the terms of the NPL, indicate your
32
* decision by deleting the provisions above and replace them with the notice
33
* and other provisions required by the GPL or the LGPL. If you do not delete
34
* the provisions above, a recipient may use your version of this file under
35
* the terms of any one of the NPL, the GPL or the LGPL.
37
* ***** END LICENSE BLOCK ***** */
42
#include "nsSBCharSetProber.h"
43
#include "nsSBCSGroupProber.h"
46
// Constructor: populates mProbers[0] through mProbers[9], each with a newly
// allocated nsSingleByteCharSetProber built from the address of one
// single-byte language model.
// NOTE(review): NUM_OF_SBCS_PROBERS presumably has to match the number of
// slots populated here - confirm against the header.
nsSBCSGroupProber::nsSBCSGroupProber()
48
mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
49
mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
50
mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
51
mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
52
mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
53
mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
54
mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
55
mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
56
mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
57
mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
59
// Keep the latin2 probers disabled until a latin1 prober is available;
60
// otherwise all latin1 text would be detected as latin2 because the two are so similar.
61
//mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
62
//mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
67
// Destructor: walks every prober slot from 0 up to NUM_OF_SBCS_PROBERS.
// NOTE(review): presumably releases each mProbers[i] allocated by the
// constructor - confirm against the loop body.
nsSBCSGroupProber::~nsSBCSGroupProber()
69
for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
76
// Returns the charset name reported by the prober at index mBestGuess.
// Per the inline comments below, when no answer exists yet and no charset
// looks positive, a default guess is used instead.
const char* nsSBCSGroupProber::GetCharSetName()
78
//if we have no answer yet
82
//no charset seems positive
84
//we will use default.
87
// Delegate to whichever prober is currently the best guess.
return mProbers[mBestGuess]->GetCharSetName();
90
// Returns the group to its starting state: every prober slot is flagged
// active again and the active count is set back to the full slot count.
void nsSBCSGroupProber::Reset(void)
92
for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
95
mIsActive[i] = PR_TRUE;
97
// All slots are active again, so the count equals the slot total.
mActiveNum = NUM_OF_SBCS_PROBERS;
102
// This filter applies to all scripts that do not use Latin (English) letters.
//
// aBuf, aLen : input bytes and their count.
// newBuf     : receives a buffer of aLen bytes obtained from PR_MALLOC.
//              NOTE(review): presumably the caller is responsible for
//              freeing it - confirm against the call site.
// newLen     : receives the number of bytes written into *newBuf.
//
// The input is scanned as segments delimited by ASCII characters that are
// not letters; a segment is copied to the output only when meetMSB is set
// and the segment is non-empty.
103
PRBool nsSBCSGroupProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
105
//do filtering to reduce load to probers
107
// prevPtr marks the start of the current segment, curPtr the scan position.
char *prevPtr, *curPtr;
109
// NOTE(review): per the comments below, meetMSB records that the current
// segment contains an upper-ASCII (high-bit) byte - confirm where it is set.
PRBool meetMSB = PR_FALSE;
110
// The output can never be longer than the input, so aLen bytes suffice.
newptr = *newBuf = (char*)PR_MALLOC(aLen);
114
for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
120
else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
122
//current char is a symbol, most likely a punctuation. we treat it as segment delimiter
123
if (meetMSB && curPtr > prevPtr)
124
//this segment contains more than single symbol, and it has upper ascii, we need to keep it
126
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
131
else //ignore current segment (either because it is just a symbol or just an English word)
135
// Flush the trailing segment that was not closed by a delimiter.
if (meetMSB && curPtr > prevPtr)
136
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
138
// Bytes written = distance from the start of the output buffer.
newLen = newptr - *newBuf;
143
#ifdef NO_ENGLISH_CONTAMINATION
144
// This filter applies to all scripts that do use Latin (English) letters.
// It is compiled only when NO_ENGLISH_CONTAMINATION is defined.
//
// aBuf, aLen : input bytes and their count.
// newBuf     : receives a buffer of aLen bytes obtained from PR_MALLOC.
//              NOTE(review): presumably the caller is responsible for
//              freeing it - confirm against the call site.
// newLen     : receives the number of bytes written into *newBuf.
//
// Segments are delimited by characters that have the high bit clear and are
// not ASCII letters; a segment is copied to the output only when it is
// non-empty and isInTag is false.
145
PRBool nsSBCSGroupProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
147
//do filtering to reduce load to probers
149
// prevPtr marks the start of the current segment, curPtr the scan position.
char *prevPtr, *curPtr;
150
// NOTE(review): presumably tracks whether the scan is between '<' and the
// matching close of a markup tag - confirm where it is updated.
PRBool isInTag = PR_FALSE;
152
// The output can never be longer than the input, so aLen bytes suffice.
newptr = *newBuf = (char*)PR_MALLOC(aLen);
156
for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
160
else if (*curPtr == '<')
163
// Delimiter test: high bit clear and outside both A-Z and a-z.
if (!(*curPtr & 0x80) &&
164
(*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
166
if (curPtr > prevPtr && !isInTag) //current segment contains more than just a symbol
167
// and it is not inside a tag, keep it
169
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
178
// If the current segment contains more than just a symbol
179
// and it is not inside a tag then keep it.
180
if (curPtr > prevPtr && !isInTag)
181
while (prevPtr < curPtr)
182
*newptr++ = *prevPtr++;
184
// Bytes written = distance from the start of the output buffer.
newLen = newptr - *newBuf;
188
#endif //NO_ENGLISH_CONTAMINATION
190
// Feeds one chunk of input to the member probers.
//
// aBuf, aLen : raw input bytes and their count.
//
// The chunk is first passed through FilterWithoutEnglishLetters; the
// filtered buffer (newBuf1 / newLen1) is what the probers receive. A prober
// that answers eNotMe has its mIsActive flag cleared.
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
197
//apply the filter to the original buffer to obtain a new buffer;
198
//depending on the script, each prober is fed the new buffer
199
//produced by the appropriate filter
200
FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1);
202
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
206
st = mProbers[i]->HandleData(newBuf1, newLen1);
213
else if (st == eNotMe)
215
// This prober has ruled itself out; mark its slot inactive.
mIsActive[i] = PR_FALSE;
230
// Reports the group's confidence as a float.
// Two fixed values are returned on short-circuit paths (0.99 "sure yes",
// 0.01 "sure no"); otherwise each prober slot is queried for its own
// confidence.
// NOTE(review): presumably the highest per-prober confidence is tracked in
// bestConf and returned - confirm against the full loop body.
float nsSBCSGroupProber::GetConfidence(void)
233
float bestConf = 0.0, cf;
238
return (float)0.99; //sure yes
240
return (float)0.01; //sure no
242
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
246
cf = mProbers[i]->GetConfidence();
259
// Prints a status report with printf: a header line, then for each prober
// slot either an "inactive" line or that prober's own DumpStatus() output,
// and finally the best-match charset name with the group confidence.
nsSBCSGroupProber::DumpStatus()
264
// Fetch the group confidence up front; it is printed in the summary line.
cf = GetConfidence();
265
printf("SBCS Group Prober --------begin status \r\n");
266
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
269
// NOTE(review): the format string has a single %s but two arguments are
// passed, so the trailing `i` is never printed; "cofidence" in the emitted
// text is also a typo for "confidence".
printf("[%s] is inactive(ie. cofidence is too low).\r\n", mProbers[i]->GetCharSetName(), i);
271
mProbers[i]->DumpStatus();
273
printf("SBCS Group found best match [%s] confidence %f.\r\n",
274
mProbers[mBestGuess]->GetCharSetName(), cf);