4
#===============================================================================
5
# Author: Walapa Muangjeen
6
#===============================================================================
9
#4.0.264 - version update
16
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
131
def open(self,parent,cfg):
132
self.encoding = 'utf8' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']
133
self.inputfile = cfg['inputfile'].replace('%(rootfolder)s',parent.rootfolder) if cfg['inputfile'] else self.inputfile
134
self.outputfile = cfg['outputfile'].replace('%(rootfolder)s',parent.rootfolder) if cfg['outputfile'] else self.outputfile
135
if (self.inputfile and not self.outputfile) or (not self.inputfile and self.outputfile):
136
self.errors.append([__name__,'invalid','[%s] inputfile=%s without outputfile= value'%(__name__,cfg['inputfile'])])
137
logger.warn('%s\t%s',*self.errors[-1][1:])
141
skipclose = not self.inputfile
142
if self.inputfile: return
144
self.p.cfoutput[k]['tempbuff'] = [self.__filter(line) for line in self.p.cfoutput[k]['tempbuff']]
156
if not os.path.exists(self.inputfile):
157
self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.inputfile)])
158
logger.error('%s\t%s',*self.errors[-1][1:])
163
os.makedirs(os.path.dirname(self.outputfile))
165
if not e.errno == 17:
166
logger.exception('%s\t%s, %s, %s',*['failed',e.errno,e.strerror,e.filename,])
169
# open input and output files
170
out = self.outputfile
171
if out == self.inputfile:
173
fd,out = tempfile.mkstemp(suffix='.tmp', prefix='~', dir=self.p.tempdir)
176
o = codecs.open(out,'w',self.encoding)
177
i = codecs.open(self.inputfile,'r',self.encoding)
179
raise RuntimeError('Failed to open [%s] input/output files'%(__name__))
181
sys.stderr.write('[%s] %s\n Please wait'%(__name__,self.outputfile))
184
# loop writes output line-by-line
188
o.write('%s\n'%(u''.join([ch if ch in self.exception else unicodedata.normalize(self.form,ch) for ch in '\b'.join(line.rstrip('\r\n')).split('\b')])))
190
if not cnt%5000: sys.stderr.write('.')
193
o.write('%s\n'%(self.__filter(line.rstrip('\r\n'))))
195
if not cnt%5000: sys.stderr.write('.')
197
sys.stderr.write('\n')
198
# close input and output files
202
if not out == self.outputfile:
204
shutil.move(out,self.outputfile)
206
except KeyboardInterrupt:
208
raise KeyboardInterrupt()
210
if not os.path.exists(self.outputfile):
211
self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.outputfile)])
212
logger.error('%s\t%s',*self.errors[-1][1:])
214
def __filter(self,text):
215
'''Normalize JA English characters to western UTF-8 range'''
216
for i, j in self.replace.items():
217
text = text.replace(i, j)
221
'''Command prompt help.'''
222
return "\n%s\n\tUsage:\n\tfrom %s import filter\n"%(
223
os.path.basename(sys.argv[0]),
224
os.path.splitext(os.path.basename(sys.argv[0]))[0]
227
licensetxt=u'''CorpusFiltergraph™ v4.0
228
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.
230
This program is free software: you can redistribute it and/or modify
231
it under the terms of the GNU Lesser General Public License as published by
232
the Free Software Foundation, either version 3 of the License, or
233
(at your option) any later version.
235
This program is distributed in the hope that it will be useful,
236
but WITHOUT ANY WARRANTY; without even the implied warranty of
237
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
238
GNU Lesser General Public License for more details.
240
You should have received a copy of the GNU Lesser General Public License
241
along with this program. If not, see http://www.gnu.org/licenses/.
243
For more information, please contact Precision Translation Tools Co., Ltd.
244
at: http://www.precisiontranslationtools.com'''
246
if __name__ == "__main__":
249
sys.stdout.write(usage().encode('utf8')+'\n')