~jtv/corpusfiltergraph/cross-python

« back to all changes in this revision

Viewing changes to trunk/lib/corpusfg/plugins/ja/convert-full2ansi.py

  • Committer: tahoar
  • Date: 2012-05-02 15:46:23 UTC
  • Revision ID: svn-v4:bc069b21-dff4-4e29-a776-06a4e04bad4e::266
new layout. need to update code to use the new layout

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#! /usr/bin/env python
 
2
# -*- coding: utf8 -*-
 
3
 
 
4
#===============================================================================
 
5
# Author: Walapa Muangjeen
 
6
#===============================================================================
 
7
 
 
8
#version:
 
9
#4.0.264 - version update
 
10
 
 
11
import os
 
12
import sys
 
13
import common as cf
 
14
import logging
 
15
 
 
16
# Logger name is "<script basename>.manager.filtergraph.<this module name>".
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
# Module-level flag read by filter.close(); filter.run() rebinds it to
# "not self.inputfile", so close() is a no-op unless an input file is set.
skipclose = True
 
18
 
 
19
class filter(object):
        '''Replace fullwidth ASCII variants with their plain ASCII characters.

        Two modes, selected by the configuration passed to open():
          * buffer mode (no inputfile): run(k) rewrites every line of
            self.p.cfoutput[k]['tempbuff'] in place;
          * file mode (inputfile and outputfile set): close() converts
            inputfile line by line into outputfile.
        '''

        cfg = {
                'encoding': 'utf8',
                'inputfile': '',
                'outputfile': '',
                'version': '4.0.264',
                }
        encoding = 'utf8'
        inputfile = ''
        outputfile = ''
        # Mapping of source character -> replacement, applied by __filter().
        # Keys are written as \u escapes so that they survive tools that fold
        # fullwidth forms to ASCII (fullwidth form == ASCII code point + 0xFEE0).
        replace = {
#               u'″': u'"',
#               u'−': u'-',
                u'\u3000': u' ',        # ideographic space
                u'\uff01': u'!',
                u'\uff03': u'#',
                u'\uff04': u'$',
                u'\uff05': u'%',
                u'\uff06': u'&',
                u'\uff08': u'(',
                u'\uff09': u')',
                u'\uff0a': u'*',
                u'\uff0b': u'+',
                u'\uff0e': u'.',
                u'\uff0f': u'/',
                u'\uff10': u'0',
                u'\uff11': u'1',
                u'\uff12': u'2',
                u'\uff13': u'3',
                u'\uff14': u'4',
                u'\uff15': u'5',
                u'\uff16': u'6',
                u'\uff17': u'7',
                u'\uff18': u'8',
                u'\uff19': u'9',
                u'\uff1b': u';',
                u'\uff1d': u'=',
                u'\uff1e': u'>',
                u'\uff1f': u'?',
                u'\uff20': u'@',
                u'\uff21': u'A',
                u'\uff22': u'B',
                u'\uff23': u'C',
                u'\uff24': u'D',
                u'\uff25': u'E',
                u'\uff26': u'F',
                u'\uff27': u'G',
                u'\uff28': u'H',
                u'\uff29': u'I',
                u'\uff2a': u'J',
                u'\uff2b': u'K',
                u'\uff2c': u'L',
                u'\uff2d': u'M',
                u'\uff2e': u'N',
                u'\uff2f': u'O',
                u'\uff30': u'P',
                u'\uff31': u'Q',
                u'\uff32': u'R',
                u'\uff33': u'S',
                u'\uff34': u'T',
                u'\uff35': u'U',
                u'\uff36': u'V',
                u'\uff37': u'W',
                u'\uff38': u'X',
                u'\uff39': u'Y',
                u'\uff3a': u'Z',
                u'\uff3b': u'[',        # was mapped to u'u' (typo)
                u'\uff3c': u'\\',
                u'\uff3d': u']',
                u'\uff3e': u'^',
                u'\uff3f': u'_',
                u'\uff41': u'a',
                u'\uff42': u'b',
                u'\uff43': u'c',
                u'\uff44': u'd',
                u'\uff45': u'e',
                u'\uff46': u'f',
                u'\uff47': u'g',
                u'\uff48': u'h',
                u'\uff49': u'i',
                u'\uff4a': u'j',
                u'\uff4b': u'k',
                u'\uff4c': u'l',
                u'\uff4d': u'm',
                u'\uff4e': u'n',
                u'\uff4f': u'o',
                u'\uff50': u'p',
                u'\uff51': u'q',
                u'\uff52': u'r',
                u'\uff53': u's',
                u'\uff54': u't',
                u'\uff55': u'u',
                u'\uff56': u'v',
                u'\uff57': u'w',
                u'\uff58': u'x',
                u'\uff59': u'y',
                u'\uff5a': u'z',
                u'\uff5b': u'{',
                u'\uff5c': u'|',
                u'\uff5d': u'}',
#               u'・': u'∙',
#               u'゚': u'°',
#               u' ̄': u'¯',
                # NOTE(review): the two corner-bracket entries reached this
                # revision as identity pairs; the intended width variant of
                # key/value cannot be recovered from here -- confirm upstream.
                u'\u300c': u'\u300c',
                u'\u300d': u'\u300d',
                # NOTE(review): assumed fullwidth yen sign -> Latin-1 yen sign,
                # by analogy with the rest of the table -- confirm upstream.
                u'\uffe5': u'\xa5',
                }
        isopen = False
        # Placeholder; run() and close() read self.p.cfoutput / self.p.tempdir.
        # NOTE(review): nothing in this class assigns it -- presumably the
        # calling framework does; confirm.
        p = object
        # Class-level list: appended to (never rebound), so it is shared by
        # every instance of this class.
        errors = []

        def open(self,parent,cfg):
                '''Read the plugin configuration.

                parent -- object exposing .rootfolder, substituted for the
                          literal '%(rootfolder)s' in the configured paths
                cfg    -- dict with 'encoding', 'inputfile' and 'outputfile'

                Logs a warning and records an entry in self.errors when only
                one of inputfile/outputfile is configured.
                '''
                # any spelling of utf-8 ('UTF-8', 'utf_8' is NOT covered) that
                # contains 'utf8' once dashes are removed collapses to 'utf8'
                self.encoding = 'utf8' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']
                self.inputfile = cfg['inputfile'].replace('%(rootfolder)s',parent.rootfolder) if cfg['inputfile'] else self.inputfile
                self.outputfile = cfg['outputfile'].replace('%(rootfolder)s',parent.rootfolder) if cfg['outputfile'] else self.outputfile
                # exactly one of the two paths set -> invalid configuration
                if bool(self.inputfile) != bool(self.outputfile):
                        self.errors.append([__name__,'invalid','[%s] inputfile=%s and outputfile=%s must both be set or both be empty'%(__name__,self.inputfile,self.outputfile)])
                        logger.warning('%s\t%s',*self.errors[-1][1:])

        def run(self,k):
                '''Filter buffer k in place (buffer mode only).

                Also rebinds the module-level skipclose flag: close() does
                real work only when an inputfile is configured.
                '''
                global skipclose
                skipclose = not self.inputfile
                if self.inputfile: return

                self.p.cfoutput[k]['tempbuff'] = [self.__filter(line) for line in self.p.cfoutput[k]['tempbuff']]

        def flush(self,k):
                '''Nothing to flush for this plugin.'''
                return

        def close(self):
                '''Convert inputfile into outputfile (file mode only).

                Does nothing while the module-level skipclose flag is true.
                Records an entry in self.errors and logs an error when the
                input file, or afterwards the output file, does not exist.

                Raises OSError when the output folder cannot be created and
                RuntimeError when either file cannot be opened.
                '''
                if skipclose: return

                import os
                import sys
                import codecs
                import errno

                if not os.path.exists(self.inputfile):
                        self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.inputfile)])
                        logger.error('%s\t%s',*self.errors[-1][1:])
                        return

                # make output folder (a bare file name has no folder to create)
                folder = os.path.dirname(self.outputfile)
                if folder:
                        try:
                                os.makedirs(folder)
                        except OSError as e:
                                # an already existing folder is fine
                                if e.errno != errno.EEXIST:
                                        logger.exception('%s\t%s, %s, %s','failed',e.errno,e.strerror,e.filename)
                                        raise

                # open input and output files; converting a file onto itself
                # goes through a temporary file that is moved into place at the end
                out = self.outputfile
                if out == self.inputfile:
                        import tempfile
                        fd,out = tempfile.mkstemp(suffix='.tmp', prefix='~', dir=self.p.tempdir)
                        os.close(fd)
                o = None
                try:
                        o = codecs.open(out,'w',self.encoding)
                        i = codecs.open(self.inputfile,'r',self.encoding)
                except Exception:
                        # do not leak the output handle when the input fails to open
                        if o is not None: o.close()
                        raise RuntimeError('Failed to open [%s] input/output files'%(__name__))

                sys.stderr.write('[%s] %s\n   Please wait'%(__name__,self.outputfile))
                cnt = 0
                try:
                        try:
                                # loop writes output line-by-line, one progress dot per 5000 lines
                                for line in i:
                                        o.write(u'%s\n'%(self.__filter(line.rstrip('\r\n'))))
                                        cnt += 1
                                        if not cnt%5000: sys.stderr.write('.')
                                sys.stderr.write('\n')
                        finally:
                                # close input and output files, even on failure
                                i.close()
                                o.close()

                        if not out == self.outputfile:
                                import shutil
                                shutil.move(out,self.outputfile)

                except KeyboardInterrupt:
                        # drop the partial result before propagating the interrupt
                        if os.path.exists(out):
                                os.unlink(out)
                        raise

                if not os.path.exists(self.outputfile):
                        self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.outputfile)])
                        logger.error('%s\t%s',*self.errors[-1][1:])

        def __filter(self,text):
                '''Normalize JA English characters to the western ASCII range.

                Returns text with every key of self.replace substituted by
                its mapped value.
                '''
                for i, j in self.replace.items():
                        text = text.replace(i, j)
                return text
 
219
 
 
220
def usage():
        '''Command prompt help.

        Returns a short usage text built from the running script's file
        name (sys.argv[0]): the file name itself, then an import hint
        that uses the name without its extension.
        '''
        script = os.path.basename(sys.argv[0])
        return "\n%s\n\tUsage:\n\tfrom %s import filter\n"%(
        script,
        os.path.splitext(script)[0]
        )
 
226
 
 
227
# License notice for this module, kept as a unicode string.
licensetxt=u'''CorpusFiltergraph™ v4.0
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.

For more information, please contact Precision Translation Tools Co., Ltd.
at: http://www.precisiontranslationtools.com'''
 
245
 
 
246
if __name__ == "__main__":
        import os
        import sys
        # usage() already returns a native str; writing it directly avoids the
        # bytes+str concatenation that .encode('utf8') causes on Python 3 and
        # the implicit ASCII decode it triggers on Python 2.
        sys.stdout.write(usage()+'\n')