1
# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).
3
import stringprep, re, codecs
4
from unicodedata import ucd_3_2_0 as unicodedata
7
# Character class matching any one of the four code points that are
# treated as a label separator when splitting a domain name into labels.
dots = re.compile(
    "["
    "\u002E"  # FULL STOP
    "\u3002"  # IDEOGRAPHIC FULL STOP
    "\uFF0E"  # FULLWIDTH FULL STOP
    "\uFF61"  # HALFWIDTH IDEOGRAPHIC FULL STOP
    "]"
)
13
# This assumes query strings, so AllowUnassigned is true
18
if stringprep.in_table_b1(c):
21
newlabel.append(stringprep.map_table_b2(c))
22
label = "".join(newlabel)
25
label = unicodedata.normalize("NFKC", label)
29
if stringprep.in_table_c12(c) or \
30
stringprep.in_table_c22(c) or \
31
stringprep.in_table_c3(c) or \
32
stringprep.in_table_c4(c) or \
33
stringprep.in_table_c5(c) or \
34
stringprep.in_table_c6(c) or \
35
stringprep.in_table_c7(c) or \
36
stringprep.in_table_c8(c) or \
37
stringprep.in_table_c9(c):
38
raise UnicodeError("Invalid character %r" % c)
41
RandAL = [stringprep.in_table_d1(x) for x in label]
44
# There is a RandAL char in the string. Must perform further
46
# 1) The characters in section 5.8 MUST be prohibited.
47
# This is table C.8, which was already checked
48
# 2) If a string contains any RandALCat character, the string
49
# MUST NOT contain any LCat character.
50
if any(stringprep.in_table_d2(x) for x in label):
51
raise UnicodeError("Violation of BIDI requirement 2")
53
# 3) If a string contains any RandALCat character, a
54
# RandALCat character MUST be the first character of the
55
# string, and a RandALCat character MUST be the last
56
# character of the string.
57
if not RandAL[0] or not RandAL[-1]:
58
raise UnicodeError("Violation of BIDI requirement 3")
65
label = label.encode("ascii")
69
# Skip to step 3: UseSTD3ASCIIRules is false, so
71
if 0 < len(label) < 64:
73
raise UnicodeError("label empty or too long")
76
label = nameprep(label)
78
# Step 3: UseSTD3ASCIIRules is false
81
label = label.encode("ascii")
86
if 0 < len(label) < 64:
88
raise UnicodeError("label empty or too long")
90
# Step 5: Check ACE prefix
91
if label.startswith(sace_prefix):
92
raise UnicodeError("Label starts with ACE prefix")
94
# Step 6: Encode with PUNYCODE
95
label = label.encode("punycode")
97
# Step 7: Prepend ACE prefix
98
label = ace_prefix + label
101
if 0 < len(label) < 64:
103
raise UnicodeError("label empty or too long")
105
def ToUnicode(label):
106
# Step 1: Check for ASCII
107
if isinstance(label, bytes):
111
label = label.encode("ascii")
116
# Step 2: Perform nameprep
117
label = nameprep(label)
118
# The spec (RFC 3490) doesn't say this, but apparently the label should be ASCII now
120
label = label.encode("ascii")
122
raise UnicodeError("Invalid character in IDN label")
123
# Step 3: Check for ACE prefix
124
if not label.startswith(ace_prefix):
125
return str(label, "ascii")
127
# Step 4: Remove ACE prefix
128
label1 = label[len(ace_prefix):]
130
# Step 5: Decode using PUNYCODE
131
result = label1.decode("punycode")
133
# Step 6: Apply ToASCII
134
label2 = ToASCII(result)
136
# Step 7: Compare the result of step 6 with the one of step 3
137
# label2 will already be in lower case.
138
if str(label, "ascii").lower() != str(label2, "ascii"):
139
raise UnicodeError("IDNA does not round-trip", label, label2)
141
# Step 8: return the result of step 5
146
class Codec(codecs.Codec):
147
def encode(self, input, errors='strict'):
149
if errors != 'strict':
150
# IDNA is quite clear that implementations must be strict
151
raise UnicodeError("unsupported error handling "+errors)
157
result = input.encode('ascii')
158
except UnicodeEncodeError:
161
# ASCII name: fast path
162
labels = result.split(b'.')
163
for label in labels[:-1]:
164
if not (0 < len(label) < 64):
165
raise UnicodeError("label empty or too long")
166
if len(labels[-1]) >= 64:
167
raise UnicodeError("label too long")
168
return result, len(input)
171
labels = dots.split(input)
172
if labels and not labels[-1]:
181
result.extend(ToASCII(label))
182
return bytes(result+trailing_dot), len(input)
184
def decode(self, input, errors='strict'):
186
if errors != 'strict':
187
raise UnicodeError("Unsupported error handling "+errors)
192
# IDNA allows decoding to operate on Unicode strings, too.
193
if not isinstance(input, bytes):
194
# XXX obviously wrong, see #3232
197
if ace_prefix not in input:
200
return input.decode('ascii'), len(input)
201
except UnicodeDecodeError:
204
labels = input.split(b".")
206
if labels and len(labels[-1]) == 0:
214
result.append(ToUnicode(label))
216
return ".".join(result)+trailing_dot, len(input)
218
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
219
def _buffer_encode(self, input, errors, final):
220
if errors != 'strict':
221
# IDNA is quite clear that implementations must be strict
222
raise UnicodeError("unsupported error handling "+errors)
227
labels = dots.split(input)
234
# Keep potentially unfinished label until the next call
246
result.extend(ToASCII(label))
249
result += trailing_dot
250
size += len(trailing_dot)
251
return (bytes(result), size)
253
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
254
def _buffer_decode(self, input, errors, final):
255
if errors != 'strict':
256
raise UnicodeError("Unsupported error handling "+errors)
261
# IDNA allows decoding to operate on Unicode strings, too.
262
if isinstance(input, str):
263
labels = dots.split(input)
265
# Must be ASCII string
266
input = str(input, "ascii")
267
labels = input.split(".")
275
# Keep potentially unfinished label until the next call
283
result.append(ToUnicode(label))
288
result = ".".join(result) + trailing_dot
289
size += len(trailing_dot)
290
return (result, size)
292
class StreamWriter(Codec,codecs.StreamWriter):
295
class StreamReader(Codec,codecs.StreamReader):
298
### encodings module API
301
return codecs.CodecInfo(
303
encode=Codec().encode,
304
decode=Codec().decode,
305
incrementalencoder=IncrementalEncoder,
306
incrementaldecoder=IncrementalDecoder,
307
streamwriter=StreamWriter,
308
streamreader=StreamReader,