2
from models import Word, GramInfo, corrGramInfo
3
from django.core.exceptions import ObjectDoesNotExist
5
import xml.dom, xml.dom.minidom
7
# Exception type raised in this file when grammatical data cannot be mapped:
# an attribute value fails to resolve ("Invalid %s: %s (word %s)") or the part
# of speech field is not recognised ("Invalid part of speech: ...").
class InconsistentGramDataException(Exception):
8
# Takes a single `value` argument; raise sites in this file pass a message
# string. NOTE(review): the constructor body is not visible in this excerpt.
def __init__(self, value):
13
# Looks up a neighbouring Word by doing arithmetic on the numeric suffix of
# `word.id` and querying Word by primary key.
#   word             -- object whose `id` has the form "<prefix>.<number>"
#   allowPunctuation -- when False, hits whose isPunctuation is true are skipped
#   reverse          -- when True, the suffix is decreased instead of increased
#   quiet            -- when False, ObjectDoesNotExist is raised after the loop
# NOTE(review): several lines of this function (the initialisation of `i` and
# `maxretries`, the loop increment, and whatever is returned when `quiet` is
# true) are missing from this excerpt. `re` is used here but no `import re` is
# visible; `unicode` is the Python 2 builtin.
def getNextWord(word, allowPunctuation=True, reverse=False, quiet=False):
14
# Split the id into everything before the last dot (group 1) and the trailing
# digits (group 2). `m` is None if the id does not match; no guard is visible.
m = re.match(r"^(.+)\.(\d+)$", word.id)
17
# Step direction: +1 by default, -1 when `reverse` is set.
k = 1 if not reverse else -1
18
# Keep probing candidate ids while `i` stays below `maxretries`.
while (i < maxretries):
19
# Candidate id: same prefix, numeric suffix shifted by i steps in direction k.
wid = m.group(1) + "." + unicode(int(m.group(2)) + i*k)
20
q = Word.objects.filter(pk = wid)
21
# Return the first hit, unless it is punctuation and punctuation is disallowed.
if q and (allowPunctuation or not q[0].isPunctuation): return q[0]
23
# Reached when no return happened above: fail loudly unless `quiet` is set.
if not quiet: raise ObjectDoesNotExist
25
def replaceCharsByDict(unistr, frm, to):
    """Return a copy of ``unistr`` with characters translated position-wise.

    Every character of ``unistr`` that occurs in ``frm`` is replaced by the
    character at the same position in ``to``; all other characters are kept
    unchanged.  If a character occurs more than once in ``frm``, its first
    occurrence decides the replacement.

    Args:
        unistr: the text to translate.
        frm: sequence of characters to look for.
        to: sequence of replacement characters, same length as ``frm``.

    Returns:
        The translated text as a unicode string.

    Raises:
        ValueError: if ``frm`` and ``to`` differ in length.
    """
    if len(frm) != len(to):
        # The original raised a bare string, which is not a valid exception
        # object; raise a real exception carrying the same message instead.
        raise ValueError("frm != to")

    # Build the lookup table once; setdefault keeps the first pairing for a
    # repeated source character, matching a left-to-right search of ``frm``.
    mapping = {}
    for source_char, target_char in zip(frm, to):
        mapping.setdefault(source_char, target_char)

    return u''.join(mapping.get(char, char) for char in unistr)
32
# Translates the textual grammatical description in `gramDataCSV` into integer
# attribute codes set on `g` and `corrg`.
#   gramDataCSV -- indexable sequence of strings; index 1 is read as the part
#                  of speech, indices 2..6 as attribute values (which index
#                  holds which attribute depends on the part of speech)
#   word        -- object whose `src` is quoted in error messages
# Raises InconsistentGramDataException for an unknown part of speech (and, in
# setGramAttribute, for an attribute value that cannot be resolved).
# NOTE(review): indentation and many lines of this function are missing from
# this excerpt (including the creation of `g`, the branch/helper headers and
# whatever is returned or saved), so the nesting of the statements below
# cannot be read off the text. Comments describe individual statements only.
def generateGramData(gramDataCSV, word):
35
# Second target object; receives the part after "/" of two-part values.
# `g`, the first target, is created on a line not shown (presumably GramInfo).
corrg = corrGramInfo()
40
# Helper: resolves `data` through the lookup table `gramdict` and stores the
# resulting code under attribute name `attrname`.
def setGramAttribute(attrname, data, gramdict):
41
#Sets grammatical attribute for both GramInfo and corrGramInfo
42
if data == u"вин/род":
43
# Animate is a special case
# NOTE(review): the handling of this special case is not visible here.
47
# Some value names themselves contain "/"; swap it for "%" so the split below
# only cuts at the separator. The lookup tables use the "%" spellings.
data = data.replace(u"а/имп", u"а%имп").replace(u"р/скл", u"р%скл").replace(u"н/б",u"н%б")
48
dataspl = data.split("/")
50
# First part goes to g.
setattr(g, attrname, gramdict[dataspl[0]])
52
# Second part goes to corrg (the guard for one-part values is not shown).
setattr(corrg, attrname, gramdict[dataspl[1]])
55
# Presumably the handler for a failed table lookup; the enclosing
# try/except is not visible -- TODO confirm.
raise InconsistentGramDataException("Invalid %s: %s (word %s)" % (attrname, data, word.src))
57
# Lookup tables: textual value as it appears in the CSV -> integer code that
# setGramAttribute stores on the attribute.
dTypes = {"a": 1, "ja": 2, "o":3, "jo":4, "u":5, u"i": 6, "en": 7, "men": 8, "es": 9, "ent": 10,
58
"er":11, "uu": 12, u"личн": 13, u"м": 14, u"тв": 15, u"р%скл": 16}
59
cases = {u"им": 1, u"род": 2, u"дат": 3, u"вин": 4, u"тв": 5, u"мест": 6, u"зв": 7}
60
verbTenses = {u"н%б": 1, u"аорпр": 2, u"аорсигм": 3, u"аоргл": 4, u"аорнов": 5, u"а%имп": 6, u"имп": 7,
61
u"перф": 8, u"плюскв": 9, u"буд": 10, u"буд1": 11, u"буд2": 12, u"прош": 13}
62
numbers = {u"ед": 1, u"дв": 2, u"мн": 3, u"0": 0}
63
persons = {u"1": 1, u"2": 2, u"3": 3}
64
verbClasses = {u"1": 1, u"2": 2, u"3": 3, u"4": 4, u"5": 5}
65
verbRoles = {u"св": 1, u"пр-св": 2, u"пр": 3, u"инф": 4}
66
genders = {u"м": 1, u"ж": 2, u"ср": 3, u"0": 0}
67
participleTenses = {u"наст": 1, u"прош": 2}
68
moods = {u"изъяв": 1, u"повел": 2, u"сосл": 3}
72
# Attribute group: declension type, case, number, gender from fields 2..5.
# The header introducing this group (helper or branch) is not visible.
# The declension field is first passed through replaceCharsByDict; presumably
# this normalises look-alike Cyrillic letters to the Latin ones used by the
# dTypes keys -- verify the two literals.
gramDataCSV[2] = replaceCharsByDict(gramDataCSV[2],u'ао', u'ao')
73
setGramAttribute("declensionType", gramDataCSV[2], dTypes)
74
setGramAttribute("case", gramDataCSV[3], cases)
75
setGramAttribute("number", gramDataCSV[4], numbers)
76
setGramAttribute("gender", gramDataCSV[5], genders)
79
# Attribute group: declension type, participle tense, case, number, gender
# from fields 2..6 (header not visible).
gramDataCSV[2] = replaceCharsByDict(gramDataCSV[2],u'ао', u'ao')
80
setGramAttribute("declensionType", gramDataCSV[2], dTypes)
81
setGramAttribute("participleTense", gramDataCSV[3], participleTenses)
82
setGramAttribute("case", gramDataCSV[4], cases)
83
setGramAttribute("number", gramDataCSV[5], numbers)
84
setGramAttribute("gender", gramDataCSV[6], genders)
87
# Attribute group starting with mood (field 2) and verb tense (field 3); the
# remaining fields are interpreted according to the tense code just stored.
setGramAttribute("mood", gramDataCSV[2], moods)
89
setGramAttribute("verbTense", gramDataCSV[3], verbTenses)
90
# Tense codes 2..7 and 10 (see verbTenses): person and number follow.
if g.verbTense in range(2,8) or g.verbTense == 10:
91
setGramAttribute("person", gramDataCSV[4], persons)
92
setGramAttribute("number", gramDataCSV[5], numbers)
93
# Tense code 13 (u"прош"): gender and number follow instead of person.
elif g.verbTense == 13:
94
setGramAttribute("gender", gramDataCSV[4], genders)
95
setGramAttribute("number", gramDataCSV[5], numbers)
96
# Tense code 1 (u"н%б"): person, number and additionally the verb class.
elif g.verbTense == 1:
97
setGramAttribute("person", gramDataCSV[4], persons)
98
setGramAttribute("number", gramDataCSV[5], numbers)
99
setGramAttribute("verbClass", gramDataCSV[6], verbClasses)
101
# NOTE(review): the branch headers around the next statements are missing;
# field 6 is read as the verb role here, and the role code then selects which
# of the following attribute pairs applies.
setGramAttribute("verbRole", gramDataCSV[6], verbRoles)
103
setGramAttribute("person", gramDataCSV[4], persons)
104
setGramAttribute("number", gramDataCSV[5], numbers)
105
# Role codes 2 and 3 (u"пр-св", u"пр"): gender and number.
if g.verbRole in (2,3):
106
setGramAttribute("gender", gramDataCSV[4], genders)
107
setGramAttribute("number", gramDataCSV[5], numbers)
109
pass #it's infinitive, we already have everything needed
112
# Same attribute sequence as the tense-dependent part above, but reading
# fields one index lower (3..5); the enclosing headers are not visible.
setGramAttribute("person", gramDataCSV[3], persons)
113
setGramAttribute("number", gramDataCSV[4], numbers)
114
setGramAttribute("verbClass", gramDataCSV[5], verbClasses)
117
setGramAttribute("verbRole", gramDataCSV[5], verbRoles)
119
setGramAttribute("person", gramDataCSV[3], persons)
120
setGramAttribute("number", gramDataCSV[4], numbers)
122
setGramAttribute("gender", gramDataCSV[3], genders)
123
setGramAttribute("number", gramDataCSV[4], numbers)
126
# Dispatch on the part-of-speech field. The tags are presumably Russian
# part-of-speech abbreviations, some with a "/"-suffixed variant. The bodies of
# most branches are not visible in this excerpt.
if gramDataCSV[1] == u"сущ":
129
elif gramDataCSV[1] == u"мест":
131
gramDataCSV[2] = replaceCharsByDict(gramDataCSV[2],u'ао', u'ao')
132
setGramAttribute("declensionType", gramDataCSV[2], dTypes)
133
# Declension code 13 is u"личн" in dTypes; those entries carry a person
# (or the marker u"возвр") in field 3 instead of starting with the case.
if g.declensionType == 13:
134
if gramDataCSV[3] == u"возвр":
136
setGramAttribute("case", gramDataCSV[4], cases)
138
setGramAttribute("person", gramDataCSV[3], persons)
139
setGramAttribute("case", gramDataCSV[4], cases)
140
setGramAttribute("number", gramDataCSV[5], numbers)
142
# Other declension codes: case, number, gender from fields 3..5.
setGramAttribute("case", gramDataCSV[3], cases)
143
setGramAttribute("number", gramDataCSV[4], numbers)
144
setGramAttribute("gender", gramDataCSV[5], genders)
145
elif gramDataCSV[1] == u"прил":
148
elif gramDataCSV[1] == u"прил/ср":
151
elif gramDataCSV[1] == u"числ":
154
elif gramDataCSV[1] == u"числ/п":
157
elif gramDataCSV[1] == u"прич":
161
elif gramDataCSV[1] == u"прич/в":
165
elif gramDataCSV[1] == u"гл":
169
elif gramDataCSV[1] == u"гл/в":
173
elif gramDataCSV[1] == u"инф":
176
elif gramDataCSV[1] == u"инф/в":
179
elif gramDataCSV[1] == u"суп":
181
elif gramDataCSV[1] == u"нар":
183
elif gramDataCSV[1] == u"пред":
185
elif gramDataCSV[1] == u"посл":
187
elif gramDataCSV[1] == u"союз":
189
elif gramDataCSV[1] == u"част":
191
elif gramDataCSV[1] == u"межд":
193
# Any other tag is rejected.
else: raise InconsistentGramDataException("Invalid part of speech: '" + gramDataCSV[1] + "'")
199
# Queries corrGramInfo for rows whose primary key matches `word`; the result
# is a queryset, not a single object.
# NOTE(review): the lines around this statement are missing from the excerpt,
# so it is unclear which function it belongs to and how the result is used.
corrg = corrGramInfo.objects.filter(pk = word)
204
# Parses `xmlfile` with minidom and walks the nested div1 > div2 > div3 >
# div4 > l structure, filling the fields of a `word` object from each child
# element of a line.
#   xmlfile -- file name or file object accepted by xml.dom.minidom.parse
#   mscript -- value stored on every word as `word.manuscript`
# NOTE(review): indentation, all loop headers, the creation of `word`, the
# computation of `positioninline` and any save call are missing from this
# excerpt, and the function may continue past its last visible line.
def ParseXMLIntoDB(xmlfile, mscript):
205
doc = xml.dom.minidom.parse(xmlfile)
209
# Each nesting level supplies one coordinate, read from its attributes:
# div1 "n" -> part, div2 "n" -> page, div3 "type" -> front flag,
# div4 "n" -> column, l "n" -> line.
div1Els = doc.getElementsByTagName("div1")
212
part = int(div1.getAttribute("n"))
213
div2Els = div1.getElementsByTagName("div2")
215
page = int(div2.getAttribute("n"))
216
div3Els = div2.getElementsByTagName("div3")
218
# True when the div3 is typed "front"; any other type value gives False.
front = (div3.getAttribute("type") == "front")
219
div4Els = div3.getElementsByTagName("div4")
221
column = int(div4.getAttribute("n"))
222
lEls = div4.getElementsByTagName("l")
224
line = int(l.getAttribute("n"))
226
# Keep only element children of the line; text and other node types are dropped.
childEls = [el for el in l.childNodes if isinstance(el, xml.dom.minidom.Element)]
229
word.positioninline = positioninline
236
word.manuscript = mscript
237
# Word-like elements: a bare <w>, or a <name>/<add>/<num> wrapper around one.
if ce.tagName in (u"w", u"name", u"add", u"num"):
238
# For a wrapper, continue with the first <w> found inside it.
if ce.tagName != "w": ce = ce.getElementsByTagName("w")[0]
239
word.id = ce.getAttribute("xml:id")
240
orig = ce.getElementsByTagName('orig')[0]
241
# The slices below cut the opening and closing tag off the serialised element
# to keep its inner XML ("<sic>" is 5 characters, "</sic>" is 6, and so on).
# They assume the tag is written without attributes.
if len(orig.getElementsByTagName('sic')) > 0:
243
word.orig = orig.getElementsByTagName('sic')[0].toxml()[5:-6]
244
if len(orig.getElementsByTagName('corr')) > 0:
245
word.corr = orig.getElementsByTagName('corr')[0].toxml()[6:-7]
247
# Presumably the branch for an <orig> without <sic>: take its whole inner XML.
word.orig = ce.getElementsByTagName('orig')[0].toxml()[6:-7]
248
# Flags derived from the parent element's tag (the wrapper, when <w> sits
# directly inside one).
if ce.parentNode.tagName == 'name': word.name = True
249
if ce.parentNode.tagName == 'add': word.add = True
250
if ce.parentNode.tagName == 'num':
251
# Numeric value comes from the wrapper's "value" attribute.
word.value = int(ce.parentNode.getAttribute("value"))
253
word.reg = ce.getElementsByTagName('reg')[0].toxml()[5:-6]
254
word.src = ce.getElementsByTagName('src')[0].toxml()[5:-6]
255
# <c> elements are stored as punctuation: orig, src and reg all receive the
# data of the element's first child node.
elif ce.tagName == u"c":
256
word.id = ce.getAttribute("xml:id")
257
word.orig = ce.childNodes[0].data
258
word.src = ce.childNodes[0].data
259
word.reg = ce.childNodes[0].data
260
word.isPunctuation = True
262
# Presumably the fallback for any other tag: report it on stdout.
print "Unexpected xml: " + ce.toxml()