~jtv/corpusfiltergraph/cross-python

« back to all changes in this revision

Viewing changes to trunk/lib/corpusfg/graphs/sa-champollion/aligner-champollion.py

  • Committer: tahoar
  • Date: 2012-05-02 15:46:23 UTC
  • Revision ID: svn-v4:bc069b21-dff4-4e29-a776-06a4e04bad4e::266
new layout. need to update code to use the new layout

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#! /usr/bin/env python
 
2
# -*- coding: utf8 -*-
 
3
 
 
4
#===============================================================================
 
5
# Author: Tom Hoar
 
6
#===============================================================================
 
7
 
 
8
#version:
 
9
#4.0.264 - updated to fully cover 'lm' keys
 
10
 
 
11
import os
 
12
import sys
 
13
import subprocess
 
14
import logging
 
15
import common as cf
 
16
 
 
17
# Module logger. Its dotted name is built from the running script's base name
# (sys.argv[0] without directory or extension), the literal components
# 'manager' and 'filtergraph', and this module's __name__.
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
 
18
 
 
19
class aligner(object):
        '''Filtergraph stage that re-aligns source/target buffers with champollion.

        open() resolves the script, interpreter and resource paths and builds
        the option table; run() writes the current buffers to temp files, runs
        the champollion script on them and refills the output buffers from the
        reported alignments; flush() and close() are no-ops.
        '''

        # ARGS key:
        # -d dictf : use dictf as the translation dictionary
        # -s xstop : use words in file xstop as X stop words
        # -c n     : number of Y chars for each X char
        # -n       : disallow 1-3, 3-1, 1-4, 4-1 alignments
        # -f       : faster performance, lower accuracy
        # -a       : align all and merge omission
        # -l       : use on-disk database to reduce memory usage

        # default configuration for this stage
        cfg = {
                'args': {},
                'args1': '-c',
                'args2': '0.634',
                'args3': '-s',
                'args4': 'stoplist',
                'args5': '-d',
                'args6': 'dictionary',
                'exeoptions': '',
                'invertx2y': False,
                'kw': {
                        'bufsize': 0,
                        'executable': '4,perl',
                        'stdin': 'PIPE',
                        'stdout': 'PIPE',
                        'stderr': 'PIPE',
                        'preexec_fn': None,
                        'close_fds': False,
                        'shell': False,
                        'cwd': None,
                        'env': None,
                        'universal_newlines': False,
                        'startupinfo': None,
                        'creationflags': 0,
                        },
                'script': '2,bin%schampollion'%(os.sep),
                'encoding': 'utf8',
                'lowmem': False,
                'simple-only': False,
                'fast': False,
                'no-omissions': False,
                'version': '4.0.264',
                }
        # class-level defaults; open() rebinds the mutable ones per instance
        args = {}
        encoding = 'utf8'
        invertx2y = False
        kw = {}
        script = '2,bin%schampollion'%(os.sep)
        isopen = False
        p = object
        errors = []

        def open(self,parent,cfg):
                '''Prepare the stage from its configuration.

                parent -- owning object; used as self.p when self.p has not
                          already been bound by the caller
                cfg    -- configuration dict with the keys shown in aligner.cfg

                Problems (missing script, interpreter or resources folder) are
                appended to self.errors and logged; nothing is returned.
                '''
                # self.p defaults to the `object` type, which has none of the
                # attributes used below; fall back to the supplied parent
                if self.p is object:
                        self.p = parent
                # rebind the containers so that this instance no longer mutates
                # the dict/list shared through the class attributes
                self.args = {}
                self.errors = list(self.errors)

                args = dict(cfg['args'])
                self.kw = dict(cfg['kw'])
                if cfg['script']:
                        self.script = self.p.decodepath('%s,%s'%(__name__.split(',')[1],cfg['script']),True)
                        if not self.script:
                                self.errors.append([__name__,'missing',cfg['script']])
                                logger.error('%s\t%s',*self.errors[-1][1:])
                        args[0] = self.script
                        if self.script:
                                # working folder and CTK are both two levels above the script
                                ctk = os.path.dirname(os.path.dirname(self.script))
                                self.kw['cwd'] = ctk
                                self.kw['env'] = dict(os.environ,**{'CTK': ctk})
                else:
                        self.script = None
                # convert args dict to args list
                args = [str(args[k]) for k in sorted(args.keys())]
                if 'executable' in cfg['kw'] and cfg['kw']['executable']:
                        self.kw['executable'] = self.p.decodepath('%s,%s'%(__name__.split(',')[1],cfg['kw']['executable']),True)
                        if not self.kw['executable']:
                                self.errors.append([__name__,'missing',cfg['kw']['executable']])
                                logger.error('%s\t%s',*self.errors[-1][1:])
                        args.insert(0,cfg['exeoptions'] if cfg['exeoptions'] else '')
                else:
                        self.kw['executable'] = None
                # the configuration spells the pipe constant as the string 'PIPE'
                for stream in ('stdin','stdout','stderr'):
                        self.kw[stream] = subprocess.PIPE if cfg['kw'][stream] == 'PIPE' else cfg['kw'][stream]
                self.encoding = 'utf8' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']

                self.invertx2y = cfg['invertx2y']
                self.resources = self.p.decodepath(','.join(__name__.split(',')[:3]+['3']+[__name__.split(',')[-1]]))
                if not os.path.exists(self.resources):
                        self.errors.append([__name__,'missing',self.resources])
                        logger.warning('%s\t%s',*self.errors[-1][1:])

                # fold the flat list into option -> value pairs; an empty option
                # name (the exeoptions slot) is stored under key 0
                # NOTE(review): the list has an odd length when no executable is
                # configured, which raises IndexError here - confirm that an
                # executable is always required
                for i in range(0,len(args),2):
                        self.args[0 if not args[i] else args[i]] = args[i+1]
                # value-less switches
                if cfg['lowmem']:
                        self.args['-l'] = None
                if cfg['simple-only']:
                        self.args['-n'] = None
                if cfg['fast']:
                        self.args['-f'] = None
                if cfg['no-omissions']:
                        self.args['-a'] = None

        def run(self,ktgt):
                '''Re-align the buffers for key ktgt with champollion.

                Returns None when the target buffer holds no text or when the
                alignment completed, 249 when the extract buffers for the
                source or target key are missing, and -2 when the writer
                reports a failure or no alignments are produced.
                '''
                if not any(line.strip() for line in self.p.cfoutput[ktgt]['tempbuff']): return
                # derive the source key and the 'lm' variants of both keys
                ksrc = list(ktgt)
                ksrc[cf.rdrlang] = ktgt[cf.srclang]
                ksrc = tuple(ksrc)
                ktgtlm = list(ktgt)
                ktgtlm[cf.kind] = ktgtlm[cf.kind].replace(u'tm',u'lm',1)
                ktgtlm = tuple(ktgtlm)
                ksrclm = list(ksrc)
                ksrclm[cf.kind] = ksrclm[cf.kind].replace(u'tm',u'lm',1)
                ksrclm = tuple(ksrclm)

                # test if special-split.py module used properly in the graph
                if not ksrc in self.p.cfextract or not ktgt in self.p.cfextract:
                        logger.error('%s\t%s',*['missing','\"[%s]\" \"filterX=0,special-split\" not before \"filterY=2,%s'%(ktgt[cf.srclang],__name__)])
                        return 249

                # work on a copy so per-run changes never leak into self.args
                dargs = dict(self.args)

                # set keys and indexes for x2y or y2x
                if self.invertx2y:
                        kx,kxlm = ktgt,ktgtlm
                        ky,kylm = ksrc,ksrclm
                        xlang,ylang = cf.rdrlang,cf.srclang
                        # NOTE(review): the ratio is inverted as 1-c; if -c is a
                        # Y-per-X character ratio, 1/c may be intended - confirm
                        dargs['-c'] = str(1-float(dargs['-c']))
                else:
                        kx,kxlm = ksrc,ksrclm
                        ky,kylm = ktgt,ktgtlm
                        xlang,ylang = cf.srclang,cf.rdrlang

                # add language-pair extension to the dictionary name; without a
                # dictionary file the option is dropped and -c is forced to 0.500
                dictionary = os.sep.join([self.resources,os.extsep.join([dargs['-d'],'-'.join([ktgt[xlang],ktgt[ylang]])])])
                if os.path.exists(dictionary):
                        dargs['-d'] = dictionary
                else:
                        del(dargs['-d'])
                        dargs['-c'] = '0.500'
                # add X language extension to the stoplist name; the option is
                # only kept when that file really exists
                stoplist = os.sep.join([self.resources,os.extsep.join([dargs['-s'],ktgt[xlang]])])
                if os.path.exists(stoplist):
                        dargs['-s'] = stoplist
                else:
                        del(dargs['-s'])
                # key 0 (the script slot) goes first, then options by name;
                # the explicit sort key avoids comparing 0 with strings
                args = []
                for k in sorted(dargs.keys(),key=lambda k: (k != 0, k if k else '')):
                        if dargs[k]:
                                args.extend([k if k else '', dargs[k]])
                        else:
                                args.extend([k if k else '',])

                # add x,y inputs and output paths (note: '-' sends output to STDOUT)
#               falign = os.sep.join([self.p.tempdir,os.extsep.join([self.p.filelist[cf.basename],self.p.datetime.replace(':',''),'align'])])
                falign = '-'
                args.extend([self.p.cfoutput[kx]['tempname'],self.p.cfoutput[ky]['tempname'],falign])
                # save buffer to temp files for champollion input
                if self.p.writer.run(self.p.cfoutput): return -2

                # clear buffers for champollion re-alignment
                self.p.cfoutput[ktgt]['tempbuff'] = []
                self.p.cfoutput[ksrc]['tempbuff'] = []

                #run champollion script
                alignments = self.__filter(args)
                if not alignments:
                        logger.warning('%s\t%s',*['config','no alignments found'])
                        return -2

                # parse report results
                xbuff = self.p.cfextract[kx]['tempbuff']
                ybuff = self.p.cfextract[ky]['tempbuff']
                for alignment in alignments:
                        xlines, ylines = [line.split(',') for line in alignment]
                        # report is 1-based; 'omitted' arrives as 0 and becomes -1
                        xlines = [int(line)-1 for line in xlines]
                        ylines = [int(line)-1 for line in ylines]
                        # move matching line numbers from self.p.cfextract to self.p.cfoutput
                        if xlines[0] == -1:
                                # X side omitted: unmatched Y text goes to the Y 'lm' buffer
                                yline = [ybuff[yitem].strip() for yitem in ylines if yitem < len(ybuff)]
                                self.p.cfoutput[kylm]['tempbuff'].append(' '.join(yline))
                        elif ylines[0] == -1:
                                # Y side omitted: unmatched X text goes to the X 'lm' buffer
                                xline = [xbuff[xitem].strip() for xitem in xlines if xitem < len(xbuff)]
                                self.p.cfoutput[kxlm]['tempbuff'].append(' '.join(xline))
                        else:
                                # aligned pair: merged lines go to both output buffers
                                xline = [xbuff[xitem].strip() for xitem in xlines]
                                yline = [ybuff[yitem].strip() for yitem in ylines]
                                self.p.cfoutput[kx]['tempbuff'].append(' '.join(xline))
                                self.p.cfoutput[ky]['tempbuff'].append(' '.join(yline))

        def flush(self,ktgt):
                '''nothing to flush'''
                return

        def close(self):
                '''nothing to close'''
                return

        def __filter(self,args):
                '''Run champollion and return its alignments.

                args -- argument list for subprocess.Popen; the last item is the
                        alignment output path, or '-' to read the report from
                        the child's stdout

                Returns a list with one entry per non-blank report line, each
                split on '<=>' after replacing 'omitted' with '0'. Returns []
                when the child exits with a non-zero status or the alignment
                file does not exist. Raises RuntimeError when the child cannot
                be started.
                '''
                try:
                        subproc = subprocess.Popen(args,**self.kw)
                except Exception as e:
                        raise RuntimeError("[%s] failed to start - %s"%(__name__,str(e),))
                # communicate() closes stdin, drains stdout and stderr together
                # (no pipe can fill up and stall the child) and waits for the
                # exit status, so returncode is set afterwards
                txtout,errout = subproc.communicate()
                falign = args[-1]
                if subproc.returncode:  # champollion returned error
                        if falign != '-' and os.path.exists(falign): os.unlink(falign)
                        return []
                if falign == '-':
                        lines = txtout.splitlines() if txtout else []
                elif os.path.exists(falign):
                        # open, read & close, delete alignment file
                        with open(falign,'r') as report:
                                lines = report.read().strip().splitlines()
                        os.unlink(falign)
                else:
                        return []
                # blank lines carry no alignment and would break the x/y unpacking in run()
                return [line.replace('omitted','0').strip().split('<=>') for line in lines if line.strip()]
 
240
 
 
241
def usage():
        '''Return the command prompt help text.

        The text names the running script (base name of sys.argv[0]) and shows
        an import line built from that name without its extension.
        '''
        script = os.path.basename(sys.argv[0])
        module = os.path.splitext(script)[0]
        return "\n%s\n\tUsage:\n\tfrom %s import filter\n"%(script,module)
 
247
 
 
248
licensetxt=u'''CorpusFiltergraph™ v4.0
 
249
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.
 
250
 
 
251
This program is free software: you can redistribute it and/or modify
 
252
it under the terms of the GNU Lesser General Public License as published by
 
253
the Free Software Foundation, either version 3 of the License, or
 
254
(at your option) any later version.
 
255
 
 
256
This program is distributed in the hope that it will be useful,
 
257
but WITHOUT ANY WARRANTY; without even the implied warranty of
 
258
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
259
GNU Lesser General Public License for more details.
 
260
 
 
261
You should have received a copy of the GNU Lesser General Public License
 
262
along with this program.  If not, see http://www.gnu.org/licenses/.
 
263
 
 
264
For more information, please contact Precision Translation Tools Co., Ltd.
 
265
at: http://www.precisiontranslationtools.com'''
 
266
 
 
267
if __name__ == "__main__":
        # executed directly: print the usage text (utf8 encoded) and a newline
        import os
        import sys
        helptext = usage().encode('utf8')
        sys.stdout.write(helptext+'\n')