1
/* ------------------------------------------------------------------------
3
Python Codec Registry and support functions
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
7
Copyright (c) Corporation for National Research Initiatives.
9
------------------------------------------------------------------------ */
14
/* --- Codec Registry ----------------------------------------------------- */
16
/* Import the standard encodings package which will register the first
17
codec search function.
19
This is done in a lazy way so that the Unicode implementation does
20
not downgrade startup time of scripts not needing it.
22
ImportErrors are silently ignored by this function. Only one try is
27
static int _PyCodecRegistry_Init(void); /* Forward */
29
/* Register search_function as a codec search function by appending it
   to the current interpreter's codec_search_path list.

   A non-callable argument sets TypeError ("argument must be callable").
   The value returned on the success path is whatever PyList_Append()
   returns. */
int PyCodec_Register(PyObject *search_function)
31
PyInterpreterState *interp = PyThreadState_GET()->interp;
32
/* Lazy setup: the registry is only initialized while no search path
   exists yet; a non-zero result from the initializer is treated as
   failure by this condition. */
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
34
if (search_function == NULL) {
38
if (!PyCallable_Check(search_function)) {
39
PyErr_SetString(PyExc_TypeError, "argument must be callable");
42
return PyList_Append(interp->codec_search_path, search_function);
48
/* Convert a string to a normalized Python string: all characters are
49
converted to lower case, spaces are replaced with underscores. */
52
/* Build a new Python string of the same length as the C string
   `string`, processing it one character at a time.

   Inputs longer than PY_SSIZE_T_MAX set OverflowError
   ("string is too large"). */
PyObject *normalizestring(const char *string)
55
size_t len = strlen(string);
59
/* strlen() yields a size_t; guard before using it as an object size. */
if (len > PY_SSIZE_T_MAX) {
60
PyErr_SetString(PyExc_OverflowError, "string is too large");
64
/* Allocate the result with an unfilled buffer of len bytes... */
v = PyString_FromStringAndSize(NULL, len);
67
/* ...and obtain a pointer to that buffer. */
p = PyString_AS_STRING(v);
68
for (i = 0; i < len; i++) {
69
register char ch = string[i];
73
/* Py_CHARMASK is applied before the value reaches tolower(). */
ch = tolower(Py_CHARMASK(ch));
79
/* Lookup the given encoding and return a tuple providing the codec
82
The encoding string is converted, before the lookup, to all lower-case
83
characters. This makes encodings looked up through this mechanism
84
effectively case-insensitive.
86
If no codec is found, a LookupError is set and NULL returned.
88
As a side effect, this tries to load the encodings package, if not
89
yet done. This is part of the lazy load strategy for the encodings
94
/* Find the codec entry for `encoding`.

   Visible steps, in order:
     1. a NULL encoding is special-cased;
     2. the registry is initialized on demand;
     3. the name is normalized and interned;
     4. codec_search_cache is consulted;
     5. each function in codec_search_path is called with a 1-tuple
        holding the normalized name;
     6. a result that is not a 4-tuple sets TypeError;
     7. a usable result is stored in codec_search_cache.

   LookupError messages exist both for "no codec search functions
   registered" and for "unknown encoding". */
PyObject *_PyCodec_Lookup(const char *encoding)
96
PyInterpreterState *interp;
97
PyObject *result, *args = NULL, *v;
100
if (encoding == NULL) {
105
interp = PyThreadState_GET()->interp;
106
/* Lazy setup, same guard as in PyCodec_Register(). */
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
109
/* Convert the encoding to a normalized Python string: all
110
characters are converted to lower case, spaces and hyphens are
111
replaced with underscores. */
112
v = normalizestring(encoding);
115
/* v may be replaced by the interned string object. */
PyString_InternInPlace(&v);
117
/* First, try to lookup the name in the registry dictionary */
118
result = PyDict_GetItem(interp->codec_search_cache, v);
119
/* Cache hit. */
if (result != NULL) {
125
/* Next, scan the search functions in order of registration */
126
args = PyTuple_New(1);
129
/* The normalized name is the single argument passed to every search
   function. */
PyTuple_SET_ITEM(args,0,v);
131
len = PyList_Size(interp->codec_search_path);
135
PyErr_SetString(PyExc_LookupError,
136
"no codec search functions registered: "
137
"can't find encoding");
141
for (i = 0; i < len; i++) {
144
func = PyList_GetItem(interp->codec_search_path, i);
147
result = PyEval_CallObject(func, args);
150
/* A Py_None result is tested separately from the 4-tuple check below
   (presumably "not found by this search function"; verify). */
if (result == Py_None) {
154
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155
PyErr_SetString(PyExc_TypeError,
156
"codec search functions must return 4-tuples");
163
/* XXX Perhaps we should cache misses too ? */
164
PyErr_Format(PyExc_LookupError,
165
"unknown encoding: %s", encoding);
169
/* Cache and return the result */
170
/* NOTE(review): the return value of PyDict_SetItem() is discarded by
   this statement, so a failed cache insert is not detected here. */
PyDict_SetItem(interp->codec_search_cache, v, result);
180
/* Build the argument tuple used when calling a codec function.

   The tuple has one item (object) when errors is NULL and two items
   (object, string created from errors) otherwise. */
PyObject *args_tuple(PyObject *object,
185
/* (errors != NULL) evaluates to 0 or 1, which sizes the tuple. */
args = PyTuple_New(1 + (errors != NULL));
189
PyTuple_SET_ITEM(args,0,object);
193
/* Slot 1 only exists in the two-item case. */
v = PyString_FromString(errors);
198
PyTuple_SET_ITEM(args, 1, v);
203
/* Helper function to get a codec item */
206
/* Look up the codec tuple for `encoding` with _PyCodec_Lookup() and
   pick the entry at position `index` from it. */
PyObject *codec_getitem(const char *encoding, int index)
211
codecs = _PyCodec_Lookup(encoding);
214
/* PyTuple_GET_ITEM is the unchecked macro accessor; callers in this
   file pass the constants 0 and 1. */
v = PyTuple_GET_ITEM(codecs, index);
220
/* Helper function to create an incremental codec. */
223
/* Look up the codec entry for `encoding`, fetch the attribute named
   `attrname` from it and call that attribute to create the codec
   object. */
PyObject *codec_getincrementalcodec(const char *encoding,
225
const char *attrname)
227
PyObject *codecs, *ret, *inccodec;
229
codecs = _PyCodec_Lookup(encoding);
232
/* The factory is an attribute of the lookup result, not a tuple item. */
inccodec = PyObject_GetAttrString(codecs, attrname);
234
if (inccodec == NULL)
237
/* Two call forms: with errors as a single string argument, or with no
   arguments. Presumably selected by whether errors is NULL -- the
   selecting condition is not visible here; confirm. */
ret = PyObject_CallFunction(inccodec, "s", errors);
239
ret = PyObject_CallFunction(inccodec, NULL);
244
/* Helper function to create a stream codec. */
247
/* Look up the codec entry for `encoding`, take the item at position
   `index` from it and call that item with `stream` to create the
   stream codec object. */
PyObject *codec_getstreamcodec(const char *encoding,
252
PyObject *codecs, *streamcodec, *codeccls;
254
codecs = _PyCodec_Lookup(encoding);
258
/* Callers in this file pass index 2 (reader) or 3 (writer). */
codeccls = PyTuple_GET_ITEM(codecs, index);
260
/* Two call forms: (stream, errors) or (stream) alone. Presumably
   selected by whether errors is NULL -- the selecting condition is not
   visible here; confirm. */
streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
262
streamcodec = PyObject_CallFunction(codeccls, "O", stream);
267
/* Convenience APIs to query the Codec registry.
269
All APIs return a codec object with incremented refcount.
273
/* Return item 0 of the codec entry registered for `encoding`
   (delegates to codec_getitem()). */
PyObject *PyCodec_Encoder(const char *encoding)
275
return codec_getitem(encoding, 0);
278
/* Return item 1 of the codec entry registered for `encoding`
   (delegates to codec_getitem()). */
PyObject *PyCodec_Decoder(const char *encoding)
280
return codec_getitem(encoding, 1);
283
/* Create an incremental encoder for `encoding` by delegating to
   codec_getincrementalcodec() with the attribute name
   "incrementalencoder". */
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
286
return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
289
/* Create an incremental decoder for `encoding` by delegating to
   codec_getincrementalcodec() with the attribute name
   "incrementaldecoder". */
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
292
return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
295
/* Create a stream reader for `encoding` on top of `stream`; the factory
   is item 2 of the codec entry (see codec_getstreamcodec()). */
PyObject *PyCodec_StreamReader(const char *encoding,
299
return codec_getstreamcodec(encoding, stream, errors, 2);
302
/* Create a stream writer for `encoding` on top of `stream`; the factory
   is item 3 of the codec entry (see codec_getstreamcodec()). */
PyObject *PyCodec_StreamWriter(const char *encoding,
306
return codec_getstreamcodec(encoding, stream, errors, 3);
309
/* Encode an object (e.g. a Unicode object) using the given encoding
310
and return the resulting encoded object (usually a Python string).
312
errors is passed to the encoder factory as argument if non-NULL. */
314
/* Visible flow: fetch the encoder for `encoding`, build the call
   arguments from object and errors, call the encoder, verify that it
   returned a 2-tuple (TypeError otherwise) and take item 0 of that
   tuple as the encoded object. */
PyObject *PyCodec_Encode(PyObject *object,
315
const char *encoding,
318
PyObject *encoder = NULL;
319
PyObject *args = NULL, *result = NULL;
322
encoder = PyCodec_Encoder(encoding);
326
/* (object,) or (object, errors), depending on errors being NULL. */
args = args_tuple(object, errors);
330
result = PyEval_CallObject(encoder,args);
334
if (!PyTuple_Check(result) ||
335
PyTuple_GET_SIZE(result) != 2) {
336
PyErr_SetString(PyExc_TypeError,
337
"encoder must return a tuple (object,integer)");
340
/* Only the first tuple item is extracted. */
v = PyTuple_GET_ITEM(result,0);
342
/* We don't check or use the second (integer) entry. */
356
/* Decode an object (usually a Python string) using the given encoding
357
and return an equivalent object (e.g. a Unicode object).
359
errors is passed to the decoder factory as argument if non-NULL. */
361
/* Visible flow: fetch the decoder for `encoding`, build the call
   arguments from object and errors, call the decoder, verify that it
   returned a 2-tuple (TypeError otherwise) and take item 0 of that
   tuple as the decoded object. */
PyObject *PyCodec_Decode(PyObject *object,
362
const char *encoding,
365
PyObject *decoder = NULL;
366
PyObject *args = NULL, *result = NULL;
369
decoder = PyCodec_Decoder(encoding);
373
/* (object,) or (object, errors), depending on errors being NULL. */
args = args_tuple(object, errors);
377
result = PyEval_CallObject(decoder,args);
380
if (!PyTuple_Check(result) ||
381
PyTuple_GET_SIZE(result) != 2) {
382
PyErr_SetString(PyExc_TypeError,
383
"decoder must return a tuple (object,integer)");
386
/* Only the first tuple item is extracted. */
v = PyTuple_GET_ITEM(result,0);
388
/* We don't check or use the second (integer) entry. */
402
/* Register the error handling callback function error under the name
403
name. This function will be called by the codec when it encounters
404
unencodable characters/undecodable bytes and doesn't know the
405
callback name, when name is specified as the error parameter
406
in the call to the encode/decode function.
407
Return 0 on success, -1 on error */
408
/* Store the callable `error` under the key `name` in the interpreter's
   codec_error_registry dictionary.

   A non-callable `error` sets TypeError ("handler must be callable").
   The success path returns the result of PyDict_SetItemString(). */
int PyCodec_RegisterError(const char *name, PyObject *error)
410
PyInterpreterState *interp = PyThreadState_GET()->interp;
411
/* The lazy-init guard tests codec_search_path, not
   codec_error_registry. */
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
413
if (!PyCallable_Check(error)) {
414
PyErr_SetString(PyExc_TypeError, "handler must be callable");
417
/* The cast drops const to match the parameter type expected by
   PyDict_SetItemString(). */
return PyDict_SetItemString(interp->codec_error_registry,
418
(char *)name, error);
421
/* Lookup the error handling callback function registered under the
422
name error. As a special case NULL can be passed, in which case
423
the error handling callback for strict encoding will be returned. */
424
/* Fetch the handler stored under `name` in the interpreter's
   codec_error_registry dictionary. An unknown name is reported as
   LookupError ("unknown error handler name '...'"). */
PyObject *PyCodec_LookupError(const char *name)
426
PyObject *handler = NULL;
428
PyInterpreterState *interp = PyThreadState_GET()->interp;
429
/* Lazy setup, keyed on codec_search_path as elsewhere in this file. */
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
434
handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
436
/* %.400s limits how much of the name is copied into the message. */
PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
442
/* Set a TypeError naming the class of `exc`: used by the error
   callbacks in this file when they receive an exception they do not
   handle. The class name is obtained as str(exc.__class__.__name__). */
static void wrong_exception_type(PyObject *exc)
444
PyObject *type = PyObject_GetAttrString(exc, "__class__");
446
PyObject *name = PyObject_GetAttrString(type, "__name__");
449
PyObject *string = PyObject_Str(name);
451
/* The TypeError is only formatted when the name could be converted. */
if (string != NULL) {
452
PyErr_Format(PyExc_TypeError,
453
"don't know how to handle %.400s in error callback",
454
PyString_AS_STRING(string));
461
/* Error callback that raises: when `exc` is an exception instance it is
   set as the current error under its own class. The TypeError
   "codec must pass exception instance" is presumably the alternative
   for any other object -- the `else` is not visible here; confirm. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
463
if (PyExceptionInstance_Check(exc))
464
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
466
PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
471
#ifdef Py_USING_UNICODE
472
/* Error callback that produces an empty replacement.

   For Unicode encode, decode and translate errors the end position of
   the error is read into `end`; the result is built with the format
   "(u#n)" from a zero-length unicode buffer and `end`. Any other
   exception is reported through wrong_exception_type(). */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
475
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
476
if (PyUnicodeEncodeError_GetEnd(exc, &end))
479
else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
480
if (PyUnicodeDecodeError_GetEnd(exc, &end))
483
else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
484
if (PyUnicodeTranslateError_GetEnd(exc, &end))
488
wrong_exception_type(exc);
491
/* ouch: passing NULL, 0, pos gives None instead of u'' */
492
/* &end is used purely as a non-NULL buffer pointer here; the length
   argument is 0. */
return Py_BuildValue("(u#n)", &end, 0, end);
496
/* Error callback that substitutes a replacement for the offending
   range, with one branch per exception type:

     - UnicodeEncodeError:    a new unicode object of end-start
                              characters is allocated and filled by a
                              loop; result is (res, end).
     - UnicodeDecodeError:    a single Py_UNICODE_REPLACEMENT_CHARACTER;
                              result is built with "(u#n)" and end.
     - UnicodeTranslateError: end-start characters, written as
                              Py_UNICODE_REPLACEMENT_CHARACTER; result
                              is (res, end).

   Any other exception is reported through wrong_exception_type(). */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
503
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
506
if (PyUnicodeEncodeError_GetStart(exc, &start))
508
if (PyUnicodeEncodeError_GetEnd(exc, &end))
510
/* One output character per input character in [start, end). */
res = PyUnicode_FromUnicode(NULL, end-start);
513
for (p = PyUnicode_AS_UNICODE(res), i = start;
516
restuple = Py_BuildValue("(On)", res, end);
520
else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
521
/* This `res` is a single Py_UNICODE, unlike the object pointer used
   in the other two branches. */
Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
522
if (PyUnicodeDecodeError_GetEnd(exc, &end))
524
return Py_BuildValue("(u#n)", &res, 1, end);
526
else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
529
if (PyUnicodeTranslateError_GetStart(exc, &start))
531
if (PyUnicodeTranslateError_GetEnd(exc, &end))
533
res = PyUnicode_FromUnicode(NULL, end-start);
536
for (p = PyUnicode_AS_UNICODE(res), i = start;
538
*p = Py_UNICODE_REPLACEMENT_CHARACTER;
539
restuple = Py_BuildValue("(On)", res, end);
544
wrong_exception_type(exc);
549
/* Error callback for UnicodeEncodeError that replaces the offending
   range [start, end) of the error's object with generated text
   (presumably XML character references, per the function name).

   Two passes over the range: the first starts ressize at 0 and the
   allocation below uses ressize, so it sizes the result; the second
   writes the replacement through outp, emitting decimal digits. The
   result is (res, end). Any other exception is reported through
   wrong_exception_type(). */
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
551
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
561
if (PyUnicodeEncodeError_GetStart(exc, &start))
563
if (PyUnicodeEncodeError_GetEnd(exc, &end))
565
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
567
startp = PyUnicode_AS_UNICODE(object);
568
/* Pass 1: walk the offending characters to size the output. */
for (p = startp+start, ressize = 0; p < startp+end; ++p) {
577
/* Part of the size computation depends on the Py_UNICODE width. */
#ifndef Py_UNICODE_WIDE
589
/* allocate replacement */
590
res = PyUnicode_FromUnicode(NULL, ressize);
595
/* generate replacement */
596
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
597
p < startp+end; ++p) {
619
#ifndef Py_UNICODE_WIDE
625
else if (*p<100000) {
629
else if (*p<1000000) {
639
/* Emit the next decimal digit of c for the current base. */
*outp++ = '0' + c/base;
645
restuple = Py_BuildValue("(On)", res, end);
651
wrong_exception_type(exc);
656
/* Lookup table mapping a 4-bit value (0..15) to its lowercase
   hexadecimal digit as a Py_UNICODE character; indexed with
   (c >> n) & 0xf by PyCodec_BackslashReplaceErrors(). */
static Py_UNICODE hexdigits[] = {
657
'0', '1', '2', '3', '4', '5', '6', '7',
658
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
661
/* Error callback for UnicodeEncodeError that replaces the offending
   range [start, end) of the error's object with hexadecimal text built
   from the hexdigits table (presumably backslash escapes, per the
   function name).

   Two passes over the range: the first sizes the output (ressize), the
   second writes hex digits through outp. The result is (res, end). Any
   other exception is reported through wrong_exception_type(). */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
663
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
673
if (PyUnicodeEncodeError_GetStart(exc, &start))
675
if (PyUnicodeEncodeError_GetEnd(exc, &end))
677
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
679
startp = PyUnicode_AS_UNICODE(object);
680
/* Pass 1: size the output. */
for (p = startp+start, ressize = 0; p < startp+end; ++p) {
681
/* Characters >= 0x10000 are only tested for on wide builds. */
#ifdef Py_UNICODE_WIDE
682
if (*p >= 0x00010000)
692
res = PyUnicode_FromUnicode(NULL, ressize);
695
/* Pass 2: write the replacement text. */
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
696
p < startp+end; ++p) {
699
#ifdef Py_UNICODE_WIDE
700
if (c >= 0x00010000) {
702
/* Upper six nibbles of a 32-bit value, most significant first. */
*outp++ = hexdigits[(c>>28)&0xf];
703
*outp++ = hexdigits[(c>>24)&0xf];
704
*outp++ = hexdigits[(c>>20)&0xf];
705
*outp++ = hexdigits[(c>>16)&0xf];
706
*outp++ = hexdigits[(c>>12)&0xf];
707
*outp++ = hexdigits[(c>>8)&0xf];
713
/* Upper two nibbles of a 16-bit value (a separate branch from the
   one above). */
*outp++ = hexdigits[(c>>12)&0xf];
714
*outp++ = hexdigits[(c>>8)&0xf];
718
/* The two least significant nibbles. */
*outp++ = hexdigits[(c>>4)&0xf];
719
*outp++ = hexdigits[c&0xf];
722
restuple = Py_BuildValue("(On)", res, end);
728
wrong_exception_type(exc);
734
/* Adapter with a (self, exc) signature that forwards exc to
   PyCodec_StrictErrors(); self is not used by the visible statement. */
static PyObject *strict_errors(PyObject *self, PyObject *exc)
736
return PyCodec_StrictErrors(exc);
740
#ifdef Py_USING_UNICODE
741
/* Adapter with a (self, exc) signature that forwards exc to
   PyCodec_IgnoreErrors(); self is not used by the visible statement. */
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
743
return PyCodec_IgnoreErrors(exc);
747
/* Adapter with a (self, exc) signature that forwards exc to
   PyCodec_ReplaceErrors(); self is not used by the visible statement. */
static PyObject *replace_errors(PyObject *self, PyObject *exc)
749
return PyCodec_ReplaceErrors(exc);
753
/* Adapter with a (self, exc) signature that forwards exc to
   PyCodec_XMLCharRefReplaceErrors(); self is not used by the visible
   statement. */
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
755
return PyCodec_XMLCharRefReplaceErrors(exc);
759
/* Adapter with a (self, exc) signature that forwards exc to
   PyCodec_BackslashReplaceErrors(); self is not used by the visible
   statement. */
static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
761
return PyCodec_BackslashReplaceErrors(exc);
765
static int _PyCodecRegistry_Init(void)
780
#ifdef Py_USING_UNICODE
800
"xmlcharrefreplace_errors",
801
xmlcharrefreplace_errors,
808
"backslashreplace_errors",
809
backslashreplace_errors,
816
PyInterpreterState *interp = PyThreadState_GET()->interp;
820
if (interp->codec_search_path != NULL)
823
interp->codec_search_path = PyList_New(0);
824
interp->codec_search_cache = PyDict_New();
825
interp->codec_error_registry = PyDict_New();
827
if (interp->codec_error_registry) {
828
for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
829
PyObject *func = PyCFunction_New(&methods[i].def, NULL);
832
Py_FatalError("can't initialize codec error registry");
833
res = PyCodec_RegisterError(methods[i].name, func);
836
Py_FatalError("can't initialize codec error registry");
840
if (interp->codec_search_path == NULL ||
841
interp->codec_search_cache == NULL ||
842
interp->codec_error_registry == NULL)
843
Py_FatalError("can't initialize codec registry");
845
mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
847
if (PyErr_ExceptionMatches(PyExc_ImportError)) {
848
/* Ignore ImportErrors... this is done so that
849
distributions can disable the encodings package. Note
850
that other errors are not masked, e.g. SystemErrors
851
raised to inform the user of an error in the Python
852
configuration are still reported back to the user. */