~vasily-aa/scat/trunk

« back to all changes in this revision

Viewing changes to SearchApp/utils.py

  • Committer: Gigla
  • Date: 2011-05-21 10:53:49 UTC
  • Revision ID: mail@v-alexeev.ru-20110521105349-ewr38nl1a9gu38kl
First commit

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#coding=utf-8
 
2
from models import Word, GramInfo, corrGramInfo
 
3
from django.core.exceptions import ObjectDoesNotExist
 
4
import re
 
5
import xml.dom, xml.dom.minidom
 
6
 
 
7
class InconsistentGramDataException(Exception):
    """Raised when a grammatical description cannot be interpreted.

    The message passed to the constructor is kept in ``value`` and is
    returned unchanged when the exception is converted to a string.
    """

    def __init__(self, value):
        # Keep the message on the instance; __str__ hands it back as-is.
        self.value = value

    def __str__(self):
        message = self.value
        return message
 
12
 
 
13
def getNextWord(word, allowPunctuation=True, reverse=False, quiet=False):
    """Return the nearest following (or preceding) Word of ``word``.

    Word ids are expected to look like ``<prefix>.<number>``; neighbours are
    looked up by adding or subtracting small offsets to the numeric part.

    word             -- object whose ``id`` is used as the starting point
    allowPunctuation -- when false, candidates whose ``isPunctuation`` is
                        true are skipped
    reverse          -- when true, search towards smaller numbers
    quiet            -- when true, return None instead of raising if
                        nothing is found

    Raises ObjectDoesNotExist when no neighbour is found and ``quiet`` is
    false.
    """
    m = re.match(r"^(.+)\.(\d+)$", word.id)
    if m is None:
        # An id without a trailing ".<number>" has no neighbour to compute;
        # treat it like "not found" instead of failing on m.group().
        if quiet:
            return None
        raise ObjectDoesNotExist

    prefix = m.group(1)
    number = int(m.group(2))
    step = -1 if reverse else 1
    maxretries = 5

    # Offsets 1 .. maxretries - 1 are probed (four candidates at most).
    for offset in range(1, maxretries):
        wid = u"%s.%d" % (prefix, number + offset * step)
        q = Word.objects.filter(pk=wid)
        if q and (allowPunctuation or not q[0].isPunctuation):
            return q[0]

    if not quiet:
        raise ObjectDoesNotExist
    return None
 
24
 
 
25
def replaceCharsByDict(unistr, frm, to):
    """Return ``unistr`` with each character found in ``frm`` replaced.

    The character at position i of ``frm`` is replaced by the character at
    position i of ``to``; characters not present in ``frm`` are copied
    unchanged.  If a character occurs more than once in ``frm`` its first
    position wins.

    Raises ValueError when ``frm`` and ``to`` differ in length.
    """
    if len(frm) != len(to):
        # A bare string is not a valid exception object, so raise a real one.
        raise ValueError("frm != to")

    mapping = {}
    for source, target in zip(frm, to):
        # setdefault keeps the first occurrence, like frm.index() would.
        mapping.setdefault(source, target)

    return u''.join(mapping.get(char, char) for char in unistr)
 
31
 
 
32
def generateGramData(gramDataCSV, word):
    """Build and save the grammatical description of ``word``.

    gramDataCSV -- mutable sequence of text fields.  Field 1 is the
                   part-of-speech label; which further fields are read
                   depends on that label (see the handlers below).  Field 0
                   is not read here.  For declinable parts of speech field 2
                   is normalised in place (Cyrillic "а"/"о" become Latin
                   "a"/"o").
    word        -- object the description belongs to; its ``src`` attribute
                   is quoted in error messages.

    A field may hold two values separated by "/": the first one is stored on
    a GramInfo object and the second one on a corrGramInfo object
    (presumably the corrected reading -- confirm against the models).

    The GramInfo object is always saved.  The corrGramInfo object is saved
    only if at least one field supplied a second value; otherwise any
    corrGramInfo already stored for ``word`` is deleted.

    Raises InconsistentGramDataException for an unknown part of speech, an
    unknown field value, or a row that is too short for its part of speech.
    """
    g = GramInfo()
    g.word = word
    corrg = corrGramInfo()
    corrg.word = word
    corrg.used = False

    def field(index):
        # Read one column; a short row is bad input data, so report it with
        # the same exception type as every other data problem.
        try:
            return gramDataCSV[index]
        except IndexError:
            raise InconsistentGramDataException(
                "Missing field %d (word %s)" % (index, word.src))

    def setGramAttribute(attrname, data, gramdict):
        # Sets grammatical attribute for both GramInfo and corrGramInfo
        if data == u"вин/род":
            # Animate is a special case: stored as case 4 with the animate
            # flag instead of being split into two alternative values.
            g.animate = True
            g.case = 4
            return
        # These labels contain a literal "/" that is not a separator, so it
        # is swapped for "%" before splitting; the lookup tables use the
        # "%" spelling.
        data = data.replace(u"а/имп", u"а%имп").replace(u"р/скл", u"р%скл").replace(u"н/б", u"н%б")
        dataspl = data.split("/")
        try:
            setattr(g, attrname, gramdict[dataspl[0]])
            if len(dataspl) > 1:
                setattr(corrg, attrname, gramdict[dataspl[1]])
                corrg.used = True
        except KeyError:
            raise InconsistentGramDataException("Invalid %s: %s (word %s)" % (attrname, data, word.src))

    # Lookup tables: text label -> numeric code stored on the model.
    dTypes = {"a": 1, "ja": 2, "o": 3, "jo": 4, "u": 5, u"i": 6, "en": 7, "men": 8, "es": 9, "ent": 10,
              "er": 11, "uu": 12, u"личн": 13, u"м": 14, u"тв": 15, u"р%скл": 16}
    cases = {u"им": 1, u"род": 2, u"дат": 3, u"вин": 4, u"тв": 5, u"мест": 6, u"зв": 7}
    verbTenses = {u"н%б": 1, u"аорпр": 2, u"аорсигм": 3, u"аоргл": 4, u"аорнов": 5, u"а%имп": 6, u"имп": 7,
                  u"перф": 8, u"плюскв": 9, u"буд": 10, u"буд1": 11, u"буд2": 12, u"прош": 13}
    numbers = {u"ед": 1, u"дв": 2, u"мн": 3, u"0": 0}
    persons = {u"1": 1, u"2": 2, u"3": 3}
    verbClasses = {u"1": 1, u"2": 2, u"3": 3, u"4": 4, u"5": 5}
    verbRoles = {u"св": 1, u"пр-св": 2, u"пр": 3, u"инф": 4}
    genders = {u"м": 1, u"ж": 2, u"ср": 3, u"0": 0}
    participleTenses = {u"наст": 1, u"прош": 2}
    moods = {u"изъяв": 1, u"повел": 2, u"сосл": 3}

    def setDeclensionType():
        # Normalise Cyrillic "а"/"о" to Latin "a"/"o" (written back into the
        # row) so the value matches the Latin keys of dTypes.
        gramDataCSV[2] = replaceCharsByDict(field(2), u'ао', u'ao')
        setGramAttribute("declensionType", gramDataCSV[2], dTypes)

    def setPersonNumber(start):
        # person at ``start``, number in the following field
        setGramAttribute("person", field(start), persons)
        setGramAttribute("number", field(start + 1), numbers)

    def setGenderNumber(start):
        # gender at ``start``, number in the following field
        setGramAttribute("gender", field(start), genders)
        setGramAttribute("number", field(start + 1), numbers)

    def setCaseNumberGender(start):
        # case, number and gender in three consecutive fields
        setGramAttribute("case", field(start), cases)
        setGramAttribute("number", field(start + 1), numbers)
        setGramAttribute("gender", field(start + 2), genders)

    def NounAdjNum():
        # Nouns, adjectives and numerals: declension, case, number, gender.
        setDeclensionType()
        setCaseNumberGender(3)

    def Pronoun():
        setDeclensionType()
        if g.declensionType != 13:
            setCaseNumberGender(3)
        elif field(3) == u"возвр":
            g.reflexive = True
            setGramAttribute("case", field(4), cases)
        else:
            setGramAttribute("person", field(3), persons)
            setGramAttribute("case", field(4), cases)
            setGramAttribute("number", field(5), numbers)

    def Particip():
        # Participles: declension, participle tense, case, number, gender.
        setDeclensionType()
        setGramAttribute("participleTense", field(3), participleTenses)
        setCaseNumberGender(4)

    def Verb():
        setGramAttribute("mood", field(2), moods)
        if g.mood == 1:
            setGramAttribute("verbTense", field(3), verbTenses)
            if g.verbTense in (2, 3, 4, 5, 6, 7, 10):
                setPersonNumber(4)
            elif g.verbTense == 13:
                setGenderNumber(4)
            elif g.verbTense == 1:
                setPersonNumber(4)
                setGramAttribute("verbClass", field(6), verbClasses)
            else:
                # Remaining tenses: the role of the word (field 6) decides
                # which agreement fields are present.
                setGramAttribute("verbRole", field(6), verbRoles)
                if g.verbRole == 1:
                    setPersonNumber(4)
                if g.verbRole in (2, 3):
                    setGenderNumber(4)
                # verbRole 4: it's infinitive, we already have everything needed
        elif g.mood == 2:
            # imperative
            setPersonNumber(3)
            setGramAttribute("verbClass", field(5), verbClasses)
        elif g.mood == 3:
            # subjunctive
            setGramAttribute("verbRole", field(5), verbRoles)
            if g.verbRole == 1:
                setPersonNumber(3)
            if g.verbRole == 3:
                setGenderNumber(3)

    # part-of-speech label -> (POS code, reflexive flag or None to leave it
    # untouched, handler for the remaining fields or None)
    partsOfSpeech = {
        u"сущ": (1, None, NounAdjNum),
        u"мест": (2, None, Pronoun),
        u"прил": (3, None, NounAdjNum),
        u"прил/ср": (4, None, NounAdjNum),
        u"числ": (5, None, NounAdjNum),
        u"числ/п": (6, None, NounAdjNum),
        u"прич": (7, False, Particip),
        u"прич/в": (7, True, Particip),
        u"гл": (8, False, Verb),
        u"гл/в": (8, True, Verb),
        u"инф": (9, False, None),
        u"инф/в": (9, True, None),
        u"суп": (10, None, None),
        u"нар": (11, None, None),
        u"пред": (12, None, None),
        u"посл": (13, None, None),
        u"союз": (14, None, None),
        u"част": (15, None, None),
        u"межд": (16, None, None),
    }

    posLabel = field(1)
    entry = partsOfSpeech.get(posLabel)
    if entry is None:
        raise InconsistentGramDataException("Invalid part of speech: '" + posLabel + "'")

    posCode, reflexive, handler = entry
    g.POS = posCode
    if reflexive is not None:
        g.reflexive = reflexive
    if handler is not None:
        handler()

    g.save()
    if corrg.used:
        corrg.save()
    else:
        # No second values in this row: drop whatever was stored earlier.
        stale = corrGramInfo.objects.filter(pk=word)
        if stale:
            stale.delete()
 
202
                        
 
203
                        
 
204
def ParseXMLIntoDB(xmlfile, mscript):
    """Parse ``xmlfile`` and save one Word per token found in it.

    xmlfile -- file name or file object, as accepted by
               xml.dom.minidom.parse
    mscript -- value stored in the ``manuscript`` attribute of every word

    The document is walked as div1 / div2 / div3 / div4 / l.  The ``n``
    attributes of div1, div2, div4 and l give part, page, column and line;
    a div3 whose ``type`` is "front" sets the ``front`` flag.  Every element
    child of an ``l`` is one position in the line:

    * ``w`` (directly, or the first ``w`` inside ``name``, ``add`` or
      ``num``) becomes a word;
    * ``c`` becomes a punctuation mark;
    * anything else is reported on standard output and skipped, but still
      occupies its position in the line.
    """
    def innerXML(node):
        # Markup between the start and end tag of ``node``.  Serialising the
        # children avoids slicing by tag length, which breaks as soon as the
        # element carries an attribute.
        return u"".join(child.toxml() for child in node.childNodes)

    def populate(word, ce):
        # Fill ``word`` from one child element of a line.  Returns False if
        # the element is not a recognised token.
        if ce.tagName in (u"w", u"name", u"add", u"num"):
            if ce.tagName != "w":
                ce = ce.getElementsByTagName("w")[0]
            word.id = ce.getAttribute("xml:id")

            orig = ce.getElementsByTagName('orig')[0]
            sicEls = orig.getElementsByTagName('sic')
            if len(sicEls) > 0:
                # <orig> holds a <sic> reading, optionally with a <corr>.
                word.sic = True
                word.orig = innerXML(sicEls[0])
                corrEls = orig.getElementsByTagName('corr')
                if len(corrEls) > 0:
                    word.corr = innerXML(corrEls[0])
            else:
                word.orig = innerXML(orig)

            wrapper = ce.parentNode
            if wrapper.tagName == 'name':
                word.name = True
            if wrapper.tagName == 'add':
                word.add = True
            if wrapper.tagName == 'num':
                word.value = int(wrapper.getAttribute("value"))
                word.num = True

            word.reg = innerXML(ce.getElementsByTagName('reg')[0])
            word.src = innerXML(ce.getElementsByTagName('src')[0])
            return True

        if ce.tagName == u"c":
            word.id = ce.getAttribute("xml:id")
            text = ce.childNodes[0].data
            word.orig = text
            word.src = text
            word.reg = text
            word.isPunctuation = True
            return True

        return False

    doc = xml.dom.minidom.parse(xmlfile)
    doc.normalize()

    for div1 in doc.getElementsByTagName("div1"):
        part = int(div1.getAttribute("n"))
        for div2 in div1.getElementsByTagName("div2"):
            page = int(div2.getAttribute("n"))
            for div3 in div2.getElementsByTagName("div3"):
                front = (div3.getAttribute("type") == "front")
                for div4 in div3.getElementsByTagName("div4"):
                    column = int(div4.getAttribute("n"))
                    for l in div4.getElementsByTagName("l"):
                        line = int(l.getAttribute("n"))
                        # Only element children count; text nodes between
                        # them are ignored.
                        childEls = [el for el in l.childNodes
                                    if isinstance(el, xml.dom.minidom.Element)]
                        for index, ce in enumerate(childEls):
                            word = Word()
                            word.positioninline = index + 1
                            word.line = line
                            word.page = page
                            word.column = column
                            word.part = part
                            word.front = front
                            word.manuscript = mscript

                            if not populate(word, ce):
                                # Do not store a Word that has no id or text.
                                print("Unexpected xml: " + ce.toxml())
                                continue

                            word.save()
 
265