1
1
# Author: David Goodger
2
2
# Contact: goodger@users.sourceforge.net
3
# Revision: $Revision: 1.30 $
4
# Date: $Date: 2004/03/28 15:39:27 $
3
# Revision: $Revision: 3654 $
4
# Date: $Date: 2005-07-03 17:02:15 +0200 (Sun, 03 Jul 2005) $
5
5
# Copyright: This module has been placed in the public domain.
31
31
default_source_path = None
33
def __init__(self, source=None, source_path=None, encoding=None):
33
def __init__(self, source=None, source_path=None, encoding=None,
34
error_handler='strict'):
34
35
self.encoding = encoding
35
36
"""Text encoding for the input source."""
38
self.error_handler = error_handler
39
"""Text decoding error handler."""
37
41
self.source = source
38
42
"""The source of input data."""
43
47
if not source_path:
44
48
self.source_path = self.default_source_path
50
self.successful_encoding = None
51
"""The encoding that successfully decoded the source data."""
46
53
def __repr__(self):
47
54
return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
61
68
locale.setlocale(locale.LC_ALL, '')
63
if (self.encoding and self.encoding.lower() == 'unicode'
64
or isinstance(data, UnicodeType)):
66
encodings = [self.encoding, 'utf-8']
68
encodings.append(locale.nl_langinfo(locale.CODESET))
72
encodings.append(locale.getlocale()[1])
76
encodings.append(locale.getdefaultlocale()[1])
79
encodings.append('latin-1')
70
if self.encoding and self.encoding.lower() == 'unicode':
71
assert isinstance(data, UnicodeType), (
72
'input encoding is "unicode" '
73
'but input is not a unicode object')
74
if isinstance(data, UnicodeType):
75
# Accept unicode even if self.encoding != 'unicode'.
77
encodings = [self.encoding]
79
# Apply heuristics only if no encoding is explicitly given.
80
encodings.append('utf-8')
82
encodings.append(locale.nl_langinfo(locale.CODESET))
86
encodings.append(locale.getlocale()[1])
90
encodings.append(locale.getdefaultlocale()[1])
93
encodings.append('latin-1')
80
96
for enc in encodings:
84
return unicode(data, enc)
85
except (UnicodeError, LookupError):
100
decoded = unicode(data, enc, self.error_handler)
101
self.successful_encoding = enc
102
# Return decoded, removing BOMs.
103
return decoded.replace(u'\ufeff', u'')
104
except (UnicodeError, LookupError), error:
106
if error is not None:
107
error_details = '\n(%s: %s)' % (error.__class__.__name__, error)
87
108
raise UnicodeError(
88
'Unable to decode input data. Tried the following encodings: %s.'
89
% ', '.join([repr(enc) for enc in encodings if enc]))
109
'Unable to decode input data. Tried the following encodings: '
111
% (', '.join([repr(enc) for enc in encodings if enc]),
92
115
class Output(TransformSpec):
121
144
% (self.__class__, self.destination, self.destination_path))
123
146
def write(self, data):
147
"""`data` is a Unicode string, to be encoded by `self.encode`."""
124
148
raise NotImplementedError
126
150
def encode(self, data):
127
151
if self.encoding and self.encoding.lower() == 'unicode':
152
assert isinstance(data, UnicodeType), (
153
'the encoding given is "unicode" but the output is not '
156
if not isinstance(data, UnicodeType):
157
# Non-unicode (e.g. binary) output.
130
return data.encode(self.encoding, self.error_handler)
161
return data.encode(self.encoding, self.error_handler)
163
# ValueError is raised if there are unencodable chars
164
# in data and the error_handler isn't found.
165
if self.error_handler == 'xmlcharrefreplace':
166
# We are using xmlcharrefreplace with a Python
167
# version that doesn't support it (2.1 or 2.2), so
168
# we emulate its behavior.
169
return ''.join([self.xmlcharref_encode(char)
174
def xmlcharref_encode(self, char):
175
"""Emulate Python 2.3's 'xmlcharrefreplace' encoding error handler."""
177
return char.encode(self.encoding, 'strict')
179
return '&#%i;' % ord(char)
133
182
class FileInput(Input):
139
188
def __init__(self, source=None, source_path=None,
140
encoding=None, autoclose=1, handle_io_errors=1):
189
encoding=None, error_handler='strict',
190
autoclose=1, handle_io_errors=1):
143
193
- `source`: either a file-like object (which is read directly), or
144
194
`None` (which implies `sys.stdin` if no `source_path` given).
145
195
- `source_path`: a path to a file, which is opened and then read.
196
- `encoding`: the expected text encoding of the input file.
197
- `error_handler`: the encoding error handler to use.
146
198
- `autoclose`: close automatically after read (boolean); always
147
199
false if `sys.stdin` is the source.
200
- `handle_io_errors`: summarize I/O errors here, and exit?
149
Input.__init__(self, source, source_path, encoding)
202
Input.__init__(self, source, source_path, encoding, error_handler)
150
203
self.autoclose = autoclose
151
204
self.handle_io_errors = handle_io_errors
152
205
if source is None:
297
356
def write(self, data):
298
357
"""Do nothing ([don't even] send data to the bit bucket)."""
361
class DocTreeInput(Input):
364
Adapter for document tree input.
366
The document tree must be passed in the ``source`` parameter.
369
default_source_path = 'doctree input'
372
"""Return the document tree."""