# ~jtv/corpusfiltergraph/cross-python

self.irrelevant = string.punctuation+string.whitespace+string.digits+''.join([ char for char in cf.dedupelist(cfg['irrelevant']) if not char in string.punctuation+string.whitespace+string.digits ])

self.regex = re.compile('[%s]'%re.escape(self.irrelevant))

def run(self, k):
    '''Filter the in-memory buffer stored under key *k*.

    Sets the module-level ``skipclose`` flag that ``close()`` consults: it is
    true when no input file is configured. When an input file is configured
    the buffer is left untouched and the method returns immediately.

    Otherwise every entry of ``self.p.cfoutput[k]['tempbuff']`` that is empty
    once all ``self.regex`` matches are stripped is either removed
    (``self.deleteline`` true) or replaced by an empty string.
    '''
    global skipclose

    skipclose = not self.inputfile
    if self.inputfile:
        return

    slot = self.p.cfoutput[k]
    if not self.deleteline:
        # keep the line count stable: blank the noise-only entries
        slot['tempbuff'] = [self.__filter(text) for text in slot['tempbuff']]
        return

    # drop the noise-only entries altogether
    slot['tempbuff'] = [
        text for text in slot['tempbuff'] if self.regex.sub('', text)
    ]

def __filter(self, line):
    '''Return *line* unchanged, or '' when it holds nothing but ``self.regex`` matches.'''
    leftover = self.regex.sub('', line)
    return line if leftover else ''

def flush(self, k):
    '''Do nothing; *k* is accepted and ignored.'''
    return None

def close(self):
    '''Filter ``self.inputfile`` line by line into ``self.outputfile``.

    Returns immediately when the module-level ``skipclose`` flag is set.
    Otherwise every input line has its line ending stripped, is passed
    through ``self.__filter`` and is written followed by a single newline.
    When input and output are the same path, the result is first written to
    a temporary file in ``self.p.tempdir`` and then moved over the original.

    A missing input file, or an output file that does not exist afterwards,
    is appended to ``self.errors`` and logged instead of being raised.

    Raises:
        OSError: the output folder could not be created.
        RuntimeError: the input or output file could not be opened.
        KeyboardInterrupt: re-raised after the partial output is removed.
    '''
    if skipclose:
        return

    import codecs
    import errno
    import shutil
    import tempfile

    if not os.path.exists(self.inputfile):
        self.errors.append([__name__, 'missing', '[%s] %s' % (__name__, self.inputfile)])
        logger.error('%s\t%s', *self.errors[-1][1:])
        return

    # make output folder; a bare file name has no folder part to create
    folder = os.path.dirname(self.outputfile)
    if folder:
        try:
            os.makedirs(folder)
        except OSError as e:
            # an already existing folder is fine, anything else is fatal
            if e.errno != errno.EEXIST:
                logger.exception('%s\t%s, %s, %s', 'failed', e.errno, e.strerror, e.filename)
                # bare raise keeps the original errno and traceback
                raise

    # filtering in place goes through a temporary file
    out = self.outputfile
    in_place = out == self.inputfile
    if in_place:
        fd, out = tempfile.mkstemp(suffix='.tmp', prefix='~', dir=self.p.tempdir)
        os.close(fd)

    # open input and output files; the input goes first so that an
    # unreadable input never truncates an existing output file
    src = dst = None
    try:
        src = codecs.open(self.inputfile, 'r', self.encoding)
        dst = codecs.open(out, 'w', self.encoding)
    except Exception:
        if src is not None:
            src.close()
        if in_place and os.path.exists(out):
            os.unlink(out)
        raise RuntimeError('Failed to open [%s] input/output files' % (__name__))

    sys.stderr.write('[%s] %s\n Please wait' % (__name__, self.outputfile))
    cnt = 0
    try:
        try:
            # loop writes output line-by-line
            for line in src:
                dst.write('%s\n' % (self.__filter(line.rstrip('\r\n'))))
                cnt += 1
                # progress dot every 5000 lines
                if not cnt % 5000:
                    sys.stderr.write('.')
            sys.stderr.write('\n')
        finally:
            # close input and output files, even when filtering fails
            src.close()
            dst.close()

        if in_place:
            shutil.move(out, self.outputfile)
    except KeyboardInterrupt:
        # discard the partial result; it may already be gone after the move
        if os.path.exists(out):
            os.unlink(out)
        raise
    except Exception:
        # never leave the temporary file behind
        if in_place and os.path.exists(out):
            os.unlink(out)
        raise

    if not os.path.exists(self.outputfile):
        self.errors.append([__name__, 'missing', '[%s] %s' % (__name__, self.outputfile)])
        logger.error('%s\t%s', *self.errors[-1][1:])

122

123

def usage():
    '''Command prompt help.

    Returns a usage message naming the running script (the base name of
    ``sys.argv[0]``) and the same name without its extension as the module
    to import ``filter`` from.
    '''
    script = os.path.basename(sys.argv[0])
    module = os.path.splitext(script)[0]
    return "\n%s\n\tUsage:\n\tfrom %s import filter\n" % (script, module)

129

130

# License notice (GNU LGPL, version 3 or later) kept as a unicode string.
# NOTE(review): the two blank lines after the title are reproduced as found;
# confirm that no copyright line was lost there.
licensetxt=u'''CorpusFiltergraph™ v4.0


This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see http://www.gnu.org/licenses/.

For more information, please contact Precision Translation Tools Co., Ltd.
at: http://www.precisiontranslationtools.com'''

148

149

if __name__ == "__main__":
    # Running the module directly only prints the usage help.
    import os
    import sys
    # usage() already returns a native string. Encoding it first forces an
    # implicit ASCII decode on Python 2 (fails for non-ASCII script names)
    # and yields bytes + str on Python 3, so it is written as is.
    sys.stdout.write(usage() + '\n')

# Older »