1
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* ***** BEGIN LICENSE BLOCK *****
3
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
5
* The contents of this file are subject to the Netscape Public License
6
* Version 1.1 (the "License"); you may not use this file except in
7
* compliance with the License. You may obtain a copy of the License at
8
* http://www.mozilla.org/NPL/
10
* Software distributed under the License is distributed on an "AS IS" basis,
11
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
* for the specific language governing rights and limitations under the
15
* The Original Code is mozilla.org code.
17
* The Initial Developer of the Original Code is
18
* Netscape Communications Corporation.
19
* Portions created by the Initial Developer are Copyright (C) 1998
20
* the Initial Developer. All Rights Reserved.
25
* Alternatively, the contents of this file may be used under the terms of
26
* either the GNU General Public License Version 2 or later (the "GPL"), or
27
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28
* in which case the provisions of the GPL or the LGPL are applicable instead
29
* of those above. If you wish to allow use of your version of this file only
30
* under the terms of either the GPL or the LGPL, and not to allow others to
31
* use your version of this file under the terms of the NPL, indicate your
32
* decision by deleting the provisions above and replace them with the notice
33
* and other provisions required by the GPL or the LGPL. If you do not delete
34
* the provisions above, a recipient may use your version of this file under
35
* the terms of any one of the NPL, the GPL or the LGPL.
37
* ***** END LICENSE BLOCK ***** */
39
* A character set converter from HZ to Unicode.
42
* @created 08/Sept/1999
43
* @author Yueheng Xu, Yueheng.Xu@intel.com
45
* Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ
46
* encoded Chinese chars, as defined in RFC1843 available at
47
* http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
48
* and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
50
* In an effort to match the similar extended capability of Microsoft Internet Explorer
51
* 5.0, we also accept 8-bit GB encoded chars mixed into an HZ string.
52
* But this should not be a recommended practice for HTML authors.
54
* The conversion priority is as follows: first convert 8-bit GB code; then,
55
* consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
56
* state ( default to ASCII state ) of the string, each 7-bit char is converted as an
57
* ASCII, or two 7-bit chars are converted into a Chinese character.
62
#include "nsUCvCnDll.h"
63
#include "nsHZToUnicode.h"
66
//----------------------------------------------------------------------
67
// Class nsHZToUnicode [implementation]
69
//----------------------------------------------------------------------
70
// Subclassing of nsTablesDecoderSupport class [implementation]
73
// Decoder modes stored in mHZState.
// HZ_STATE_ASCII is the mode the constructor starts the decoder in.
#define HZ_STATE_ASCII 2
74
// NOTE(review): HZ_STATE_TILD is not referenced in the visible part of this
// file -- confirm whether it is still needed.
#define HZ_STATE_TILD 3
80
// Constructs the decoder: passes 1 to the nsBufferDecoderSupport base
// constructor and puts the decoder in ASCII mode.
nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
82
mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state
84
// Overrides the ConvertNoBuff() in nsUCvCnSupport.cpp.
//
// Walks the source bytes in a loop and writes Unicode characters through
// aDest:
//   - a byte with the high bit set is decoded, together with the byte that
//     follows it, as an 8-bit GB pair via mUtil.GBKCharToUnicode();
//   - a '~' lead byte introduces an HZ escape sequence, which may switch
//     mHZState between HZ_STATE_GB and HZ_STATE_ASCII;
//   - any other 7-bit byte is decoded according to the current mHZState.
//
// aSrcLength   read on entry as the number of source bytes to walk.
// aDestLength  read as the output capacity; set to iDestlen (presumably the
//              number of characters written) near the end of the function.
//
// res is set to NS_OK_UDEC_MOREOUTPUT when iDestlen reaches *aDestLength.
//
// NOTE(review): parts of this function (remaining parameters, local
// declarations, braces, pointer advances, the return statement) are not
// visible in this excerpt -- confirm the details against the full file.
85
NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
89
PRInt32 * aDestLength)
92
PRInt32 iSrcLength = *aSrcLength;
97
for (i=0;i<iSrcLength;i++)
99
// Stop when the output count has reached the caller-supplied capacity.
if ( iDestlen >= (*aDestLength) )
101
res = NS_OK_UDEC_MOREOUTPUT;
104
if ( *aSrc & 0x80 ) // if it is an 8-bit byte
106
// The source is an 8-bit GB code
// NOTE(review): this reads aSrc[1]; no check that a second byte is
// available is visible in this excerpt -- confirm.
107
*aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
115
// otherwise, it is a 7-bit byte
116
// The source will be an ASCII or a 7-bit HZ code depending on ch1
119
if (ch1 == HZLEAD1 ) // if it is led by '~'
125
// we are switching to HZ state
126
mHZState = HZ_STATE_GB;
132
// we are switching to ASCII state
133
mHZState = HZ_STATE_ASCII;
138
// we got a '~~'; process it like an ASCII char, with no state change
140
*aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
147
// we got a "~\n"; it was once taken to mean "maintain double-byte mode across lines, ignore the '~' itself"
148
// mHZState = HZ_STATE_GB;
149
// I find that "~\n" should be interpreted as line continuation without mode change
150
// It should not be interpreted as line continuation with double-byte mode on
154
// undefined ESC sequences '~X' are ignored since this is an illegal combination
158
continue;// go to the next loop iteration
164
// the following chars are HZ
// Both 7-bit bytes get their high bit set before the GBK lookup.
// NOTE(review): this reads aSrc[1]; no check that a second byte is
// available is visible in this excerpt -- confirm.
165
*aDest = mUtil.GBKCharToUnicode(aSrc[0]|0x80, aSrc[1]|0x80);
173
// the default behavior is also ASCII-like
174
// when the source is an ASCII char
175
*aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
183
// Report the output count back through aDestLength.
*aDestLength = iDestlen;