1
# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).
3
import stringprep, re, codecs
4
from unicodedata import ucd_3_2_0 as unicodedata
7
# Pattern used to split a domain name into labels.  It matches any one of
# four code points: U+002E (full stop), U+3002 (ideographic full stop),
# U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
11
# Unicode copy of ace_prefix, decoded as ASCII, so that unicode labels can
# be tested with startswith() against a value of the same string type.
# NOTE(review): the assignment of ace_prefix is not visible in this view --
# confirm it is defined before this statement runs.
uace_prefix = unicode(ace_prefix, "ascii")
13
# NOTE(review): the enclosing `def` line, the loops that bind `c`, and the
# statements that create `newlabel` are not present in this view.  These
# statements look like the body of the routine called elsewhere in this
# file as nameprep(label) -- confirm against the complete file.
# This assumes query strings, so AllowUnassigned is true
18
# Table B.1 membership test for the current character.  What happens on a
# match is not visible here.
if stringprep.in_table_b1(c):
21
# Collect the table B.2 mapping of the current character.
newlabel.append(stringprep.map_table_b2(c))
22
# Re-assemble the mapped characters into a single unicode string.
label = u"".join(newlabel)
25
# NFKC normalisation.  `unicodedata` here is the Unicode 3.2.0 database
# (imported as ucd_3_2_0 at the top of the file), not the current one.
label = unicodedata.normalize("NFKC", label)
29
# Prohibited-character check: a character found in any of the stringprep
# tables C.1.2, C.2.2, C.3, C.4, C.5, C.6, C.7, C.8 or C.9 is rejected
# with UnicodeError.
if stringprep.in_table_c12(c) or \
30
stringprep.in_table_c22(c) or \
31
stringprep.in_table_c3(c) or \
32
stringprep.in_table_c4(c) or \
33
stringprep.in_table_c5(c) or \
34
stringprep.in_table_c6(c) or \
35
stringprep.in_table_c7(c) or \
36
stringprep.in_table_c8(c) or \
37
stringprep.in_table_c9(c):
38
raise UnicodeError("Invalid character %r" % c)
41
# One table D.1 (RandALCat) test result per character of the label; the
# result is indexed below, so it must be a sequence.
RandAL = map(stringprep.in_table_d1, label)
44
# There is a RandAL char in the string. Must perform further
# checks (the numbered requirements below).
# NOTE(review): the condition guarding these checks is not visible here.
46
# 1) The characters in section 5.8 MUST be prohibited.
47
# This is table C.8, which was already checked
48
# 2) If a string contains any RandALCat character, the string
49
# MUST NOT contain any LCat character.
50
# filter() keeps the characters that are in table D.2 (LCat); a non-empty
# result means requirement 2 is violated.
if filter(stringprep.in_table_d2, label):
51
raise UnicodeError("Violation of BIDI requirement 2")
53
# 3) If a string contains any RandALCat character, a
54
# RandALCat character MUST be the first character of the
55
# string, and a RandALCat character MUST be the last
56
# character of the string.
57
# Only the first and last entries of the per-character D.1 results matter.
if not RandAL[0] or not RandAL[-1]:
58
raise UnicodeError("Violation of BIDI requirement 3")
65
# NOTE(review): the enclosing `def` line, the try/except/else scaffolding
# and the `return` statements are not present in this view.  These
# statements look like the body of the routine called elsewhere in this
# file as ToASCII(label) -- confirm against the complete file.
# First attempt: encode the label directly as ASCII.
label = label.encode("ascii")
69
# Skip to step 3: UseSTD3ASCIIRules is false, so
71
# Length test: true for lengths 1 through 63 inclusive.
# NOTE(review): the statement executed when the test succeeds is not
# visible; the raise below reports the failing case.
if 0 < len(label) < 64:
73
raise UnicodeError("label empty or too long")
76
# Run the label through the string-preparation routine.
label = nameprep(label)
78
# Step 3: UseSTD3ASCIIRules is false
81
# Second attempt at a plain ASCII encoding, now on the prepared label.
label = label.encode("ascii")
86
# Same 1..63 length test as above.
if 0 < len(label) < 64:
88
raise UnicodeError("label empty or too long")
90
# Step 5: Check ACE prefix
91
# The label is still unicode here, hence the unicode form of the prefix.
if label.startswith(uace_prefix):
92
raise UnicodeError("Label starts with ACE prefix")
94
# Step 6: Encode with PUNYCODE
95
label = label.encode("punycode")
97
# Step 7: Prepend ACE prefix
98
label = ace_prefix + label
101
# The 1..63 length test is applied again to the prefixed, encoded label.
if 0 < len(label) < 64:
103
raise UnicodeError("label empty or too long")
105
# Convert a single label to unicode, following the numbered steps in the
# comments below.
#
# label: the label to convert; step 1 tests whether it is a `str`.
# Returns: unicode(label, "ascii") when the label does not start with
#     ace_prefix; otherwise, per the step 8 comment, the punycode-decoded
#     text from step 5.
# Raises: UnicodeError for an invalid character in the label, or when the
#     decoded text does not encode back to the (lower-cased) input.
# NOTE(review): several control-flow lines (the branches around steps 1-2,
# the try/except clauses and the final return) are not present in this view.
def ToUnicode(label):
106
# Step 1: Check for ASCII
107
# `str` is the byte-string type here (the file also uses `unicode`).
if isinstance(label, str):
111
# Try to reduce the label to a plain ASCII byte string.
label = label.encode("ascii")
116
# Step 2: Perform nameprep
117
label = nameprep(label)
118
# It doesn't say this, but apparently, it should be ASCII now
120
label = label.encode("ascii")
122
# NOTE(review): presumably raised from an except clause around the encode
# above; that clause is not visible here.
raise UnicodeError("Invalid character in IDN label")
123
# Step 3: Check for ACE prefix
124
# No prefix means the label is not punycode-encoded; hand it back as
# unicode unchanged.
if not label.startswith(ace_prefix):
125
return unicode(label, "ascii")
127
# Step 4: Remove ACE prefix
128
label1 = label[len(ace_prefix):]
130
# Step 5: Decode using PUNYCODE
131
result = label1.decode("punycode")
133
# Step 6: Apply ToASCII
134
# Re-encode the decoded text so it can be compared with the input.
label2 = ToASCII(result)
136
# Step 7: Compare the result of step 6 with the one of step 3
137
# label2 will already be in lower case.
138
# Only the input side is lower-cased before comparing.
if label.lower() != label2:
139
# Both values are attached to the exception as extra arguments.
raise UnicodeError("IDNA does not round-trip", label, label2)
141
# Step 8: return the result of step 5
146
# Stateless codec: encode() applies ToASCII to each label of the input and
# decode() applies ToUnicode to each label.
# NOTE(review): the loops over `labels`, the initialisation of `result` and
# `trailing_dot`, and some branch lines are not present in this view.
class Codec(codecs.Codec):
147
# Encode `input`; only errors='strict' is accepted.
# Returns a 2-tuple: the labels joined with "." plus trailing_dot, and
# len(input).
def encode(self,input,errors='strict'):
149
if errors != 'strict':
150
# IDNA is quite clear that implementations must be strict
151
raise UnicodeError("unsupported error handling "+errors)
157
# Split on any of the separator characters matched by `dots`.
labels = dots.split(input)
158
# An empty last element means the input ended with a separator.
# NOTE(review): the handling of that case is not visible here.
if labels and len(labels[-1])==0:
164
result.append(ToASCII(label))
166
return ".".join(result)+trailing_dot, len(input)
168
# Decode `input`; only errors='strict' is accepted.
# Returns a 2-tuple: the unicode labels joined with u"." plus
# trailing_dot, and len(input).
def decode(self,input,errors='strict'):
170
if errors != 'strict':
171
raise UnicodeError("Unsupported error handling "+errors)
176
# IDNA allows decoding to operate on Unicode strings, too.
177
if isinstance(input, unicode):
178
# Unicode input: split on any of the separator characters.
labels = dots.split(input)
180
# Must be ASCII string
182
# The converted value is discarded; the call is made only so that
# non-ASCII input raises an error.
unicode(input, "ascii")
183
# Byte-string input is split on the plain "." only.
labels = input.split(".")
185
# An empty last element means the input ended with a separator.
# NOTE(review): the handling of that case is not visible here.
if labels and len(labels[-1]) == 0:
193
result.append(ToUnicode(label))
195
return u".".join(result)+trailing_dot, len(input)
197
# Incremental encoder built on codecs.BufferedIncrementalEncoder.
# NOTE(review): the loop over `labels` and the initialisation of `result`,
# `size` and `trailing_dot` are not present in this view.
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
198
# Encode the labels of `input`; only errors='strict' is accepted.
# Returns a 2-tuple (result, size): the ToASCII-converted labels joined
# with "." plus trailing_dot, and the running `size` count.
# NOTE(review): `final` is not used in any visible line -- presumably it
# decides whether the last label is held back; confirm.
def _buffer_encode(self, input, errors, final):
199
if errors != 'strict':
200
# IDNA is quite clear that implementations must be strict
201
raise UnicodeError("unsupported error handling "+errors)
206
# Split on any of the separator characters matched by `dots`.
labels = dots.split(input)
213
# Keep potentially unfinished label until the next call
221
result.append(ToASCII(label))
227
result = ".".join(result) + trailing_dot
228
# The trailing separator, when there is one, is counted in `size` too.
size += len(trailing_dot)
229
return (result, size)
231
# Incremental decoder built on codecs.BufferedIncrementalDecoder.
# NOTE(review): the loop over `labels` and the initialisation of `result`,
# `size` and `trailing_dot` are not present in this view.
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
232
# Decode the labels of `input`; only errors='strict' is accepted.
# Returns a 2-tuple (result, size): the ToUnicode-converted labels joined
# with u"." plus trailing_dot, and the running `size` count.
# NOTE(review): `final` is not used in any visible line -- presumably it
# decides whether the last label is held back; confirm.
def _buffer_decode(self, input, errors, final):
233
if errors != 'strict':
234
raise UnicodeError("Unsupported error handling "+errors)
239
# IDNA allows decoding to operate on Unicode strings, too.
240
if isinstance(input, unicode):
241
# Unicode input: split on any of the separator characters.
labels = dots.split(input)
243
# Must be ASCII string
245
# The converted value is discarded; the call is made only so that
# non-ASCII input raises an error.
unicode(input, "ascii")
246
# Byte-string input is split on the plain "." only.
labels = input.split(".")
254
# Keep potentially unfinished label until the next call
262
result.append(ToUnicode(label))
267
result = u".".join(result) + trailing_dot
268
# The trailing separator, when there is one, is counted in `size` too.
size += len(trailing_dot)
269
return (result, size)
271
# Stream writer that combines this module's Codec with codecs.StreamWriter.
# NOTE(review): the class body is not present in this view.
class StreamWriter(Codec,codecs.StreamWriter):
274
# Stream reader that combines this module's Codec with codecs.StreamReader.
# NOTE(review): the class body is not present in this view.
class StreamReader(Codec,codecs.StreamReader):
277
### encodings module API
280
return codecs.CodecInfo(
282
encode=Codec().encode,
283
decode=Codec().decode,
284
incrementalencoder=IncrementalEncoder,
285
incrementaldecoder=IncrementalDecoder,
286
streamwriter=StreamWriter,
287
streamreader=StreamReader,