4
#===============================================================================
6
#===============================================================================
9
#4.0.264 - updated to fully cover 'lm' keys
17
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
19
class aligner(object):
22
# -d dictf : use dictf as the translation dictionary
23
# -s xstop : use words in file xstop as X stop words
24
# -c n : number of Y chars for each X char
25
# -n : disallow 1-3, 3-1, 1-4, 4-1 alignments
26
# -f : faster performance, lower accuracy
27
# -a : align all and merge omission
28
# -l : use on-disk database to reduce memory usage
37
'args6': 'dictionary',
42
'executable': '4,perl',
51
'universal_newlines': False,
55
'script': '2,bin%schampollion'%(os.sep),
60
'no-omissions': False,
67
script = '2,bin%schampollion'%(os.sep)
72
def open(self,parent,cfg):
# Prepare the filter for use: resolve the champollion script and optional
# interpreter paths, build the keyword arguments later handed to
# subprocess.Popen (self.kw), and seed the option table (self.args).
# NOTE(review): several original lines (branch headers such as if/else) are
# not visible in this view; comments below only describe the visible lines.
73
# work on copies so the caller's cfg dictionaries are not mutated
args = dict(cfg['args'])
74
self.kw = dict(cfg['kw'])
76
# resolve the script location through the parent's decodepath helper
# (presumably returns a falsy value when the path cannot be resolved - confirm)
self.script = self.p.decodepath('%s,%s'%(__name__.split(',')[1],cfg['script']),True)
78
# record and log the unresolved script; the guarding condition is not shown here
self.errors.append([__name__,'missing',cfg['script']])
79
logger.error('%s\t%s',*self.errors[-1][1:])
81
# the subprocess runs from, and gets CTK pointed at, the directory two levels
# above the script file
self.kw['cwd'] = os.path.dirname(os.path.dirname(self.script))
82
self.kw['env'] = dict(os.environ,**{'CTK': os.path.dirname(os.path.dirname(self.script))})
85
# convert args dict to args list
86
# values only, stringified, ordered by their sorted keys
args = [str(args[k]) for k in sorted(args.keys())]
87
# an explicit interpreter is optional; when configured it is resolved the
# same way as the script
if 'executable' in cfg['kw'] and cfg['kw']['executable']:
88
self.kw['executable'] = self.p.decodepath('%s,%s'%(__name__.split(',')[1],cfg['kw']['executable']),True)
89
if not self.kw['executable']:
90
self.errors.append([__name__,'missing',cfg['kw']['executable']])
91
logger.error('%s\t%s',*self.errors[-1][1:])
92
# interpreter options (or an empty string) become the first list entry
args.insert(0,cfg['exeoptions'] if cfg['exeoptions'] else '')
94
# presumably the fallback branch when no interpreter is configured - confirm
self.kw['executable'] = None
95
# the configuration spells pipes as the string 'PIPE'; map it to subprocess.PIPE
self.kw['stdin'] = subprocess.PIPE if cfg['kw']['stdin'] == 'PIPE' else cfg['kw']['stdin']
96
self.kw['stdout'] = subprocess.PIPE if cfg['kw']['stdout'] == 'PIPE' else cfg['kw']['stdout']
97
self.kw['stderr'] = subprocess.PIPE if cfg['kw']['stderr'] == 'PIPE' else cfg['kw']['stderr']
98
# any spelling containing 'utf8' once lowercased and stripped of '-' becomes 'utf8'
self.encoding = 'utf8' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']
100
self.invertx2y = cfg['invertx2y']
101
# directory holding the dictionary and stop-word resources
self.resources = self.p.decodepath(','.join(__name__.split(',')[:3]+['3']+[__name__.split(',')[-1]]))
102
if not os.path.exists(self.resources):
103
self.errors.append([__name__,'missing',self.resources])
104
# NOTE(review): Logger.warn is a deprecated alias of Logger.warning
logger.warn('%s\t%s',*self.errors[-1][1:])
106
# read the flat list pairwise as (option, value); an empty option name is
# stored under the key 0
# NOTE(review): assumes the list has an even number of entries, otherwise
# args[i+1] raises IndexError - confirm against the configured args
for i in range(0,len(args),2):
107
self.args[0 if not args[i] else args[i]] = args[i+1]
109
# value-less switches are stored with None; the conditions guarding '-l' and
# '-f' are not visible in this view
self.args['-l'] = None
110
if cfg['simple-only']:
111
self.args['-n'] = None
113
self.args['-f'] = None
114
if cfg['no-omissions']:
115
self.args['-a'] = None
119
if not sum([len(line.strip()) for line in self.p.cfoutput[ktgt]['tempbuff']]): return
121
ksrc[cf.rdrlang] = ktgt[cf.srclang]
124
ktgtlm[cf.kind] = ktgtlm[cf.kind].replace(u'tm',u'lm',1)
125
ktgtlm = tuple(ktgtlm)
127
ksrclm[cf.kind] = ksrclm[cf.kind].replace(u'tm',u'lm',1)
128
ksrclm = tuple(ksrclm)
130
# test if the special-split.py module is used properly in the graph
131
if not ksrc in self.p.cfextract or not ktgt in self.p.cfextract:
132
logger.error('%s\t%s',*['missing','\"[%s]\" \"filterX=0,special-split\" not before \"filterY=2,%s'%(ktgt[cf.srclang],__name__)])
135
# add target lang extensions to dictionary names
136
dargs = dict(self.args)
138
# set keys and indexes for x2y or y2x
140
kx,kxlm = ktgt,ktgtlm
141
ky,kylm = ksrc,ksrclm
142
xlang,ylang = cf.rdrlang,cf.srclang
143
dargs['-c'] = str(1-float(dargs['-c']))
145
kx,kxlm = ksrc,ksrclm
146
ky,kylm = ktgt,ktgtlm
147
xlang,ylang = cf.srclang,cf.rdrlang
149
if os.path.exists(os.sep.join([self.resources,os.extsep.join([dargs['-d'],'-'.join([ktgt[xlang],ktgt[ylang]])])])):
150
dargs['-d'] = os.sep.join([self.resources,os.extsep.join([dargs['-d'],'-'.join([ktgt[xlang],ktgt[ylang]])])])
153
dargs['-c'] = '0.500'
154
if os.sep.join([self.resources,os.extsep.join([dargs['-s'],ktgt[xlang]])]):
155
dargs['-s'] = os.sep.join([self.resources,os.extsep.join([dargs['-s'],ktgt[xlang]])])
159
for k in sorted(dargs.keys()):
161
args.extend([k if k else '', dargs[k]])
163
args.extend([k if k else '',])
165
# add x,y inputs and output paths (note: '-' sends output to STDOUT)
166
# falign = os.sep.join([self.p.tempdir,os.extsep.join([self.p.filelist[cf.basename],self.p.datetime.replace(':',''),'align'])])
168
args.extend([self.p.cfoutput[kx]['tempname'],self.p.cfoutput[ky]['tempname'],falign])
169
# save buffer to temp files for champollion input
170
if self.p.writer.run(self.p.cfoutput): return -2
172
# clear buffers for champollion re-alignment
173
self.p.cfoutput[ktgt]['tempbuff'] = []
174
self.p.cfoutput[ksrc]['tempbuff'] = []
176
#run champollion script
177
alignments = self.__filter(args)
179
logger.warn('%s\t%s',*['config','no alignments found'])
182
# parse report results
183
for alignment in alignments:
184
xlines, ylines = [line.split(',') for line in alignment]
185
xlines = [int(line)-1 for line in xlines]
186
ylines = [int(line)-1 for line in ylines]
187
# move matching line numbers from self.p.cfextract to self.p.cfoutput
191
if yitem >= len(self.p.cfextract[ky]['tempbuff']): continue
192
yline.append(self.p.cfextract[ky]['tempbuff'][yitem].strip())
193
self.p.cfoutput[kylm]['tempbuff'].append(' '.join(yline))
194
elif ylines[0] == -1:
197
if xitem >= len(self.p.cfextract[kx]['tempbuff']): continue
198
xline.append(self.p.cfextract[kx]['tempbuff'][xitem].strip())
199
self.p.cfoutput[kxlm]['tempbuff'].append(' '.join(xline))
203
xline.append(self.p.cfextract[kx]['tempbuff'][xitem].strip())
206
yline.append(self.p.cfextract[ky]['tempbuff'][yitem].strip())
207
self.p.cfoutput[kx]['tempbuff'].append(' '.join(xline))
208
self.p.cfoutput[ky]['tempbuff'].append(' '.join(yline))
210
def flush(self,ktgt):
216
def __filter(self,args):
# Run the external aligner with the prepared argument list and parse its
# report into a list of alignments, each split on '<=>' into its two sides.
# args: full command line; the last entry is the alignment output path.
# NOTE(review): the try/except header, the wait/poll call and the return
# statement are on lines not visible in this view.
217
'''run champollion'''
219
# self.kw carries the Popen keyword arguments assembled in open()
subproc = subprocess.Popen(args,**self.kw)
221
# presumably inside an except clause that binds `e` - confirm
raise RuntimeError("[%s] failed to start - %s"%(__name__,str(e),))
222
# read stderr line by line until readline returns the '' sentinel
# NOTE(review): with byte pipes on Python 3 readline returns b'' and the
# sentinel would never match - confirm the target interpreter
for errline in iter(subproc.stderr.readline,''):
223
# sys.stderr.write('%s\n'%(errline.rstrip()))
225
# NOTE(review): returncode is only populated after wait()/poll()/communicate();
# that call is presumably on a line not shown here
if subproc.returncode: # champollion returned error
227
# discard any partial output file
if os.path.exists(args[-1]): os.unlink(args[-1])
230
txtout = subproc.stdout.readlines()
232
if not os.path.exists(args[-1]):
235
# open, read & close, delete alignment file
236
# NOTE(review): the file object is never closed explicitly and no delete is
# visible on this line - confirm against the missing lines
txtout = open(args[-1],'r').read().strip().splitlines()
238
# 'omitted' becomes '0' so each side is a comma-separated list of line numbers
alignments = [alignment.replace('omitted','0').strip().split('<=>') for alignment in txtout]
242
'''Command prompt help.'''
243
return "\n%s\n\tUsage:\n\tfrom %s import filter\n"%(
244
os.path.basename(sys.argv[0]),
245
os.path.splitext(os.path.basename(sys.argv[0]))[0]
248
licensetxt=u'''CorpusFiltergraph™ v4.0
249
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.
251
This program is free software: you can redistribute it and/or modify
252
it under the terms of the GNU Lesser General Public License as published by
253
the Free Software Foundation, either version 3 of the License, or
254
(at your option) any later version.
256
This program is distributed in the hope that it will be useful,
257
but WITHOUT ANY WARRANTY; without even the implied warranty of
258
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
259
GNU Lesser General Public License for more details.
261
You should have received a copy of the GNU Lesser General Public License
262
along with this program. If not, see http://www.gnu.org/licenses/.
264
For more information, please contact Precision Translation Tools Co., Ltd.
265
at: http://www.precisiontranslationtools.com'''
267
# When executed as a script, print the usage text to stdout.
if __name__ == "__main__":
270
# NOTE(review): bytes + str concatenation only works on Python 2 - confirm
# the target interpreter
sys.stdout.write(usage().encode('utf8')+'\n')