2
charset.py - Module for converting character sets
3
(c) by Michael Stroeder <michael@stroeder.com>
5
This module is distributed under the terms of the
6
GPL (GNU GENERAL PUBLIC LICENSE) Version 2
7
(see http://www.gnu.org/copyleft/gpl.html)
14
# Character alphabet for encrypted passwords (see module crypt),
# spelled out group by group: punctuation and digits, lower case, upper case.
crypt_alphabet = (
    './0123456789'
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
)
# Number of characters in the alphabet above.
crypt_alphabet_len = len(crypt_alphabet)
21
returns 1 if s is plain ASCII
24
pos=0 ; s_len = len(s)
25
while ((ord(s[pos]) & 0x80) == 0) and (pos<s_len-1):
30
return (ord(s[pos]) & 0x80) == 0
35
def escapeHTML(s,escape_html_chars='&;<>":={}()'):
37
Escape all characters with a special meaning in HTML
38
to appropriate character tags
40
result = ''; escape_html_chars_list = list(escape_html_chars)
42
if c in escape_html_chars:
43
result=result+'&#%d;'%ord(c)
51
Convert ISO-8859-1 to UTF-8 encoded Unicode
59
new = new+chr(0xC0 | (0x03 & (c >> 6)))+chr(0x80 | (0x3F & c))
63
# UTF8len is indexed by the top six bits of a byte, i.e. (byte >> 2) & 0x3F,
# and yields the total length of the UTF-8 sequence that byte introduces:
#   0x00-0x7F -> 1   (single-byte character)
#   0x80-0xBF -> 0   (continuation byte, does not start a sequence)
#   0xC0-0xDF -> 2,  0xE0-0xEF -> 3,  0xF0-0xF7 -> 4,
#   0xF8-0xFB -> 5,  0xFC-0xFF -> 6
UTF8len = (
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6,
)

# UTF8mask is indexed by a length taken from UTF8len and gives the bit mask
# that extracts the payload bits from the first byte of such a sequence.
# Index 0 holds 0x3F, the same six-bit mask applied to continuation bytes.
UTF8mask = (0x3F, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01)
72
Convert UTF-8 encoded Unicode to ISO-8859-1
80
clen = UTF8len[(c >> 2) & 0x3F]
81
u = c & UTF8mask[clen]
86
while clen and ind < slen:
89
if (c & 0xC0) == 0x80:
90
u = (u << 6) | (c & 0x3F)
104
Convert UTF-8 encoded Unicode to HTML-4 character representation
112
clen = UTF8len[(c >> 2) & 0x3F]
113
u = c & UTF8mask[clen]
118
while clen and ind < slen:
121
if (c & 0xC0) == 0x80:
122
u = (u << 6) | (c & 0x3F)
130
new = new + '&#%d;' % u
136
Convert ISO-8859-1 to HTML-4 character representation
144
new = new + '&#%d;' % (c)
150
Convert ISO-8859-1 to T.61 character representation
158
new = '%s\\x%X' % (new,ord(ch))
164
Convert T.61 character representation to ISO-8859-1
167
slashpos = string.find(s,'\\x')
169
if (s[slashpos]==0) or (s[slashpos]>0 and s[slashpos-1]!='\\'):
170
new = new+s[0:slashpos]+chr(string.atoi(s[slashpos+2:slashpos+4],16))
173
new = new+s[0:slashpos-1]
175
slashpos = string.find(s,'\\x')
181
Convert T.61 character representation to HTML-4 character representation
184
slashpos = string.find(s,'\\x')
186
if (s[slashpos]==0) or (s[slashpos]>0 and s[slashpos-1]!='\\'):
187
new = new+s[0:slashpos]+'&#%d;' % string.atoi(s[slashpos+2:slashpos+4],16)
190
new = new+s[0:slashpos-1]
192
slashpos = string.find(s,'\\x')
198
Convert ISO-8859-1 to BMPString
204
new = '%s\\x00%s' % (new,ch)
206
new = '%s\\x00\\x%X%s' % (new,ord(ch),ch)
212
Convert BMPString to ISO-8859-1
214
return t612iso(string.replace(s,'\\x00',''))
219
Convert BMPString to HTML-4 character representation
221
return t612html4(string.replace(s,'\\x00',''))
235
'ISO-8859-1' : utf2iso
239
def recode(s,source,target):
241
Convert from/to known character set / encoding
246
return recode_func[string.upper(source)][string.upper(target)](s)