1
# -*- coding: utf-8 -*-
2
# Read Unihan.txt and save its readings into a SQLite database (unihan.db).
4
from sqlalchemy import (Table, Integer, Float, Unicode, Column, MetaData,
5
ForeignKey, Boolean, String, Date, UniqueConstraint,
7
from sqlalchemy import (create_engine)
8
from sqlalchemy.orm import mapper, sessionmaker, relation, backref, \
9
object_session as _object_session
10
from sqlalchemy.sql import select, text, and_
11
from sqlalchemy.exceptions import DBAPIError
18
# Column definitions: an integer primary key, two text columns named
# "mandarin" and "cantonese", and an integer "grade" column.
# NOTE(review): the enclosing Table(...) call is not visible in this excerpt,
# and UnicodeText is not among the names visibly imported above -- confirm.
Column("id", Integer, primary_key=True),
19
Column("mandarin", UnicodeText),
20
Column("cantonese", UnicodeText),
21
Column("grade", Integer),
24
# SQLite engine backed by the file unihan.db; SQL echoing is turned off.
engine = create_engine("sqlite:///unihan.db",
25
echo=False, strategy='threadlocal')
26
# Session factory bound to the engine above.
# NOTE(review): the remaining sessionmaker arguments are not visible here.
session = sessionmaker(bind=engine,
29
# Create every table registered on `metadata` in the database.
# NOTE(review): `metadata` is defined outside the visible excerpt.
metadata.create_all(engine)
33
# Convert tone-number pinyin codes (e.g. "ni3") to accented pinyin
34
##########################################################################
35
# code from Donald Chai
38
# Accented forms of each pinyin vowel, indexed by tone number.
# Index 0 and index 5 are the unmarked vowel; 1-4 carry the tone mark.
# 'v' is the ASCII stand-in for u-umlaut.
accenttable = {
    'a': [u'a', u'ā', u'á', u'ǎ', u'à', u'a'],
    'e': [u'e', u'ē', u'é', u'ě', u'è', u'e'],
    'i': [u'i', u'ī', u'í', u'ǐ', u'ì', u'i'],
    'o': [u'o', u'ō', u'ó', u'ǒ', u'ò', u'o'],
    'u': [u'u', u'ū', u'ú', u'ǔ', u'ù', u'u'],
    'v': [u'ü', u'ǖ', u'ǘ', u'ǚ', u'ǜ', u'ü'],
}


def convert(word):
    '''Converts a pinyin word to unicode

    Takes one lowercase pinyin syllable followed by a tone digit
    (e.g. "ni3") and returns it with the tone mark placed on the
    proper vowel.

    word -- the syllable; u-umlaut may be written as "v" or as the
            u-umlaut character itself.

    Returns the accented syllable.  The input is returned unchanged when
    it has no vowel, no tone digit, or a tone digit that has no entry in
    accenttable.  Anything following the tone digit is not part of the
    result, because only the matched initial, vowel group and final are
    reassembled.
    '''
    # convert ü to v for now to make life easier
    syllable = re.sub(u'\xfc|\xc3\xbc', 'v', word)

    # split into initial consonants, vowel group, final and tone digit;
    # raw string so \D and \d reach the regex engine untouched
    mo = re.match(r'([qwrtypsdfghjklzxcbnm]*)([aeiouv]*)(\D*)(\d?)',
                  syllable)
    init, vowel, final, tone = mo.groups()

    # do nothing if no vowel or tone
    if vowel == '' or tone == '':
        return word

    tone = int(tone)
    # the pattern accepts any digit, but each table row has a fixed number
    # of entries; leave the word alone rather than raise IndexError
    if tone >= len(accenttable[vowel[-1]]):
        return word

    if len(vowel) == 1:
        vowel = accenttable[vowel][tone]
    elif vowel[-2] in 'iuv':
        # i, u and ü (v) act as medials here: put over last
        vowel = vowel[:-1] + accenttable[vowel[-1]][tone]
    else:
        # put over second to last
        vowel = vowel[:-2] + accenttable[vowel[-2]][tone] + vowel[-1]

    # any 'v' that did not receive the tone mark is still the ASCII
    # stand-in; turn it back into a plain ü
    vowel = vowel.replace('v', accenttable['v'][0])
    return init + vowel + final
71
##########################################################################
75
# Read Unihan.txt (decoded as UTF-8) line by line and collect values per
# character in `kanji`.
# NOTE(review): `kanji` is initialised outside the visible excerpt, and the
# file object returned by codecs.open is never explicitly closed here.
for line in codecs.open("Unihan.txt", encoding="utf-8"):
77
# Each line is split on tabs into exactly three fields:
# u (code point), f (field name) and v (value).
(u, f, v) = line.strip().split("\t")
80
# Guard on the "U+" code-point prefix.
# NOTE(review): the body of this check is not visible in this excerpt.
if not u.startswith("U+"):
82
# Parse the hexadecimal digits after "U+" into an integer key.
n = int(u.replace("U+",""), 16)
86
# The value may hold several whitespace-separated readings; each one is
# passed through convert() and the results are re-joined with spaces.
# NOTE(review): the condition guarding this branch is not visible;
# presumably it tests f against the Mandarin field name -- confirm.
kanji[n]['mandarin'] = " ".join([convert(w) for w in v.split()])
87
elif f == "kCantonese":
88
# Cantonese values are stored as-is, without conversion.
kanji[n]['cantonese'] = v
89
elif f == "kGradeLevel":
90
# Grade level is stored as an integer.
kanji[n]['grade'] = int(v)
93
# Build one row dict per entry of `kanji`, keeping only entries that have
# at least one truthy mandarin, cantonese or grade value.
# NOTE(review): the opening of this list (and any other keys of the row
# dict) is not visible in this excerpt.
'mandarin': v.get('mandarin'),
94
'cantonese': v.get('cantonese'),
95
'grade': v.get('grade') } for (k,v) in kanji.items()
96
if v.get('mandarin') or v.get('cantonese') or v.get('grade')]
97
# Insert all rows with a single execute call.
# NOTE(review): the name `dict` shadows the builtin, and `s` is defined
# outside the visible excerpt -- presumably a session instance; confirm.
s.execute(unihanTable.insert(), dict)