4
#===============================================================================
5
# Author: Walapa Muangjeen
6
#===============================================================================
9
#4.0.264 - version update
18
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
25
'irrelevant': string.punctuation+string.whitespace+string.digits,
32
irrelevant = string.punctuation+string.whitespace+string.digits
40
def open(self,parent,cfg):
41
self.encoding = 'utf8' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']
42
self.inputfile = cfg['inputfile'].replace('%(rootfolder)s',parent.rootfolder) if cfg['inputfile'] else self.inputfile
43
self.outputfile = cfg['outputfile'].replace('%(rootfolder)s',parent.rootfolder) if cfg['outputfile'] else self.outputfile
44
if (self.inputfile and not self.outputfile) or (not self.inputfile and self.outputfile):
45
self.errors.append([__name__,'invalid','[%s] inputfile=%s without outputfile= value'%(__name__,cfg['inputfile'])])
46
logger.warn('%s\t%s',*self.errors[-1][1:])
48
self.deleteline = cfg['deleteline']
49
self.irrelevant = string.punctuation+string.whitespace+string.digits+''.join([ char for char in cf.dedupelist(cfg['irrelevant']) if not char in string.punctuation+string.whitespace+string.digits ])
50
self.regex = re.compile('[%s]'%re.escape(self.irrelevant))
53
skipclose = not self.inputfile
54
if self.inputfile: return
57
self.p.cfoutput[k]['tempbuff'] = [line for line in self.p.cfoutput[k]['tempbuff'] if len(self.regex.sub('', line))]
59
self.p.cfoutput[k]['tempbuff'] = [self.__filter(line) for line in self.p.cfoutput[k]['tempbuff']]
61
def __filter(self,line):
62
if not self.regex.sub('', line): line = ''
73
if not os.path.exists(self.inputfile):
74
self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.inputfile)])
75
logger.error('%s\t%s',*self.errors[-1][1:])
80
os.makedirs(os.path.dirname(self.outputfile))
83
logger.exception('%s\t%s, %s, %s',*['failed',e.errno,e.strerror,e.filename,])
86
# open input and output files
88
if out == self.inputfile:
90
fd,out = tempfile.mkstemp(suffix='.tmp', prefix='~', dir=self.p.tempdir)
93
o = codecs.open(out,'w',self.encoding)
94
i = codecs.open(self.inputfile,'r',self.encoding)
96
raise RuntimeError('Failed to open [%s] input/output files'%(__name__))
98
sys.stderr.write('[%s] %s\n Please wait'%(__name__,self.outputfile))
101
# loop writes output line-by-line
103
o.write('%s\n'%(self.__filter(line.rstrip('\r\n'))))
105
if not cnt%5000: sys.stderr.write('.')
106
sys.stderr.write('\n')
107
# close input and output files
111
if not out == self.outputfile:
113
shutil.move(out,self.outputfile)
115
except KeyboardInterrupt:
117
raise KeyboardInterrupt()
119
if not os.path.exists(self.outputfile):
120
self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.outputfile)])
121
logger.error('%s\t%s',*self.errors[-1][1:])
124
'''Command prompt help.'''
125
return "\n%s\n\tUsage:\n\tfrom %s import filter\n"%(
126
os.path.basename(sys.argv[0]),
127
os.path.splitext(os.path.basename(sys.argv[0]))[0]
130
licensetxt=u'''CorpusFiltergraph™ v4.0
131
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.
133
This program is free software: you can redistribute it and/or modify
134
it under the terms of the GNU Lesser General Public License as published by
135
the Free Software Foundation, either version 3 of the License, or
136
(at your option) any later version.
138
This program is distributed in the hope that it will be useful,
139
but WITHOUT ANY WARRANTY; without even the implied warranty of
140
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
141
GNU Lesser General Public License for more details.
143
You should have received a copy of the GNU Lesser General Public License
144
along with this program. If not, see http://www.gnu.org/licenses/.
146
For more information, please contact Precision Translation Tools Co., Ltd.
147
at: http://www.precisiontranslationtools.com'''
149
if __name__ == "__main__":
152
sys.stdout.write(usage().encode('utf8')+'\n')