# ~vcs-imports/kupfer/master-new
# -*- encoding: UTF-8 -*-

import locale
from unicodedata import normalize, category

def _folditems():
	_folding_table = {
		# general non-decomposing characters
		# FIXME: This is not complete
		u"ł" : u"l",
		u"œ" : u"oe",
		u"ð" : u"d",
		u"þ" : u"th",
		u"ß" : u"ss",
		# germano-scandinavic canonical transliterations
		u"ü" : u"ue",
		u"å" : u"aa",
		u"ä" : u"ae",
		u"æ" : u"ae",
		u"ö" : u"oe",
		u"ø" : u"oe",
	}

	for c, rep in _folding_table.iteritems():
		yield (ord(c.upper()), rep.title())
		yield (ord(c), rep)

folding_table = dict(_folditems())

def tounicode(utf8str):
	"""Decode the UTF-8 bytestring @utf8str into a `unicode` object

	None becomes the empty string and a `unicode` argument is handed
	back untouched. Bytes that are not valid UTF-8 are substituted
	instead of raising, so all callers share the same error handling.
	"""
	if utf8str is None:
		return u""
	if not isinstance(utf8str, unicode):
		return utf8str.decode("UTF-8", "replace")
	return utf8str

def toutf8(ustr):
	"""Encode unicode @ustr as a UTF-8 `str`

	An argument that already is a `str` is returned as it is.
	Routing conversions through here keeps the error handling the
	same everywhere.
	"""
	return ustr if isinstance(ustr, str) else ustr.encode("UTF-8")

def fromlocale(lstr):
	"""Decode the locale-encoded bytestring @lstr into a unicode string

	The codec is the locale's preferred encoding (looked up without
	calling setlocale); undecodable bytes are replaced, not raised on.
	"""
	assert isinstance(lstr, str)
	return lstr.decode(
		locale.getpreferredencoding(do_setlocale=False), "replace")

def tolocale(ustr):
	"""Encode unicode @ustr as a bytestring in the locale's encoding

	The codec is the locale's preferred encoding (looked up without
	calling setlocale). Encoding is strict: a character the locale
	cannot represent raises UnicodeEncodeError.
	"""
	assert isinstance(ustr, unicode)
	return ustr.encode(locale.getpreferredencoding(do_setlocale=False))


def tofolded(ustr):
	u"""Return @ustr reduced to unaccented base characters

	Characters listed in the folding table are first swapped for their
	basic latin replacements. The result is then NFKD-decomposed and
	every nonspacing mark (category Mn) is dropped.

	>>> tofolded(u"Wyłącz")
	u'Wylacz'
	>>> tofolded(u"naïveté")
	u'naivete'

	Other scripts keep their own letters and only lose the diacritics.

	>>> print tofolded(u"Ἑλλάς")
	Ελλας
	"""
	replaced = ustr.translate(folding_table)
	decomposed = normalize("NFKD", replaced)
	return u"".join(ch for ch in decomposed if category(ch) != 'Mn')

if __name__ == '__main__':
	# Self-test entry point: run the doctests embedded in this module.
	import sys
	# NOTE(review): the reload() presumably exists to regain access to
	# sys.setdefaultencoding, which Python 2 hides after startup -- confirm.
	reload(sys)
	# NOTE(review): a UTF-8 default encoding looks like it is needed so the
	# doctest that prints non-ASCII text can run -- confirm.
	sys.setdefaultencoding("UTF-8")

	import doctest
	doctest.testmod()