~jtv/corpusfiltergraph/cross-python

« back to all changes in this revision

Viewing changes to trunk/lib/corpusfg/plugins/unicode-normalize.py

  • Committer: tahoar
  • Date: 2012-05-02 15:46:23 UTC
  • Revision ID: svn-v4:bc069b21-dff4-4e29-a776-06a4e04bad4e::266
new layout. need to update code to use the new layout

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#! /usr/bin/env python
 
2
# -*- coding: utf8 -*-
 
3
 
 
4
#===============================================================================
 
5
# Author: Walapa Muangjeen
 
6
#===============================================================================
 
7
 
 
8
#version:
 
9
#4.0.264 - version update
 
10
 
 
11
import os
 
12
import sys
 
13
import unicodedata
 
14
import common as cf
 
15
import logging
 
16
 
 
17
# Logger named "<script>.manager.filtergraph.<module>", where <script> is the
# file name of the running script (sys.argv[0]) without directory or extension.
logger = logging.getLogger(
        '%s.manager.filtergraph.%s'%(os.path.splitext(os.path.basename(sys.argv[0]))[0],__name__)
        )
# Module-level flag: filter.run() rewrites it and filter.close() returns
# immediately while it is true.
skipclose = True
 
19
 
 
20
class filter(object):
        '''Filter that applies Unicode normalization (unicodedata.normalize).

        Two modes, selected by the "inputfile" setting:

        * no inputfile: run(k) normalizes the lines held in
          self.p.cfoutput[k]['tempbuff'] in place;
        * inputfile set: run(k) does nothing and close() converts the whole
          input file into the output file, line by line.

        Characters listed in "exceptions" are copied through unchanged.
        '''

        # default configuration values
        cfg = {
                'exceptions': '',
                'form': 'NFKC',
                'encoding': 'utf8',
                'inputfile': '',
                'outputfile': '',
                'version': '4.0.264',
                }
        exceptions = ''         # characters that must not be normalized
        form = 'NFKC'           # one of NFC, NFD, NFKC, NFKD
        encoding = 'utf8'       # codec used for inputfile/outputfile
        inputfile = ''
        outputfile = ''
        isopen = False          # not referenced by any method of this class
        p = object              # placeholder until a parent object is attached
        errors = []             # class-level fallback; instances get their own list

        def __init__(self):
                # A fresh list per instance, so that errors recorded by one
                # filter do not show up in every other instance.
                self.errors = []

        def open(self,parent,cfg):
                '''Validate cfg and copy its settings onto the instance.

                parent -- object providing rootfolder (and, for the other
                          methods, cfoutput and tempdir); it is stored as
                          self.p when no parent has been attached yet.
                cfg    -- dict with the keys 'exceptions', 'form', 'encoding',
                          'inputfile' and 'outputfile'.

                Problems are appended to self.errors and logged as warnings;
                nothing is raised for an invalid setting.
                '''
                # The original never stored parent; only fill the placeholder
                # so a parent attached by other code is left untouched.
                if self.p is object and parent is not None:
                        self.p = parent

                if cfg['form'] not in ('NFC','NFD','NFKC','NFKD'):
                        self.errors.append([__name__,'invalid',"%s is invalid. \"form=\" must be 'NFC', 'NFD', 'NFKC', or 'NFKD'"%(cfg['form'])])
                        logger.warning('%s\t%s',*self.errors[-1][1:])
                self.exceptions = cfg['exceptions']
                self.form = cfg['form']

                # any spelling of UTF-8 ("UTF-8", "utf_8"-less variants with
                # dashes, ...) collapses to the codec name 'utf8'
                self.encoding = 'utf8' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']
                self.inputfile = cfg['inputfile'].replace('%(rootfolder)s',self.p.rootfolder) if cfg['inputfile'] else self.inputfile
                self.outputfile = cfg['outputfile'].replace('%(rootfolder)s',self.p.rootfolder) if cfg['outputfile'] else self.outputfile

                # file mode needs both names; report which one is missing
                if self.inputfile and not self.outputfile:
                        self.errors.append([__name__,'invalid','[%s] inputfile=%s without outputfile= value'%(__name__,cfg['inputfile'])])
                        logger.warning('%s\t%s',*self.errors[-1][1:])
                elif self.outputfile and not self.inputfile:
                        self.errors.append([__name__,'invalid','[%s] outputfile=%s without inputfile= value'%(__name__,cfg['outputfile'])])
                        logger.warning('%s\t%s',*self.errors[-1][1:])

        def _normalize(self,line):
                '''Return line normalized to self.form, keeping exception characters as they are.'''
                text_type = type(u'')   # unicode on Python 2, str on Python 3
                if not isinstance(line,text_type):
                        line = text_type(line)
                if not self.exceptions:
                        return unicodedata.normalize(self.form,line)
                # With exceptions the line is normalized one character at a
                # time, as the original did.
                # NOTE(review): per-character normalization cannot compose
                # sequences (NFC/NFKC) across characters -- confirm intended.
                form = self.form
                keep = self.exceptions
                return u''.join([ch if ch in keep else unicodedata.normalize(form,ch) for ch in line])

        def run(self,k):
                '''Normalize the buffered lines of output k (buffer mode only).

                Also updates the module-level skipclose flag: it becomes false
                when an inputfile is configured, so that close() does the
                file conversion.
                '''
                global skipclose
                skipclose = not self.inputfile
                if self.inputfile: return

                buff = self.p.cfoutput[k]
                buff['tempbuff'] = [self._normalize(line) for line in buff['tempbuff']]

        def flush(self,k):
                '''Nothing to flush for this filter.'''
                return

        def close(self):
                '''Convert inputfile to outputfile (file mode only).

                Returns immediately when the module-level skipclose flag is
                set. A missing input or output file is recorded in
                self.errors and logged. When inputfile and outputfile are the
                same path, the result is written to a temporary file in
                self.p.tempdir and then moved over the original.

                Raises OSError when the output folder cannot be created and
                RuntimeError when the files cannot be opened.
                '''
                if skipclose: return

                import codecs
                import errno

                if not os.path.exists(self.inputfile):
                        self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.inputfile)])
                        logger.error('%s\t%s',*self.errors[-1][1:])
                        return

                # make output folder (skipped when outputfile has no folder
                # part, because os.makedirs('') raises)
                outdir = os.path.dirname(self.outputfile)
                if outdir:
                        try:
                                os.makedirs(outdir)
                        except OSError as e:
                                if e.errno != errno.EEXIST:
                                        logger.exception('%s\t%s, %s, %s',*['failed',e.errno,e.strerror,e.filename,])
                                        raise

                # open input and output files
                out = self.outputfile
                if out == self.inputfile:
                        import tempfile
                        fd,out = tempfile.mkstemp(suffix='.tmp', prefix='~', dir=self.p.tempdir)
                        os.close(fd)
                o = i = None
                try:
                        o = codecs.open(out,'w',self.encoding)
                        i = codecs.open(self.inputfile,'r',self.encoding)
                except Exception:
                        # do not leak the handle that did open
                        if o is not None: o.close()
                        raise RuntimeError('Failed to open [%s] input/output files'%(__name__))

                sys.stderr.write('[%s] %s\n   Please wait'%(__name__,self.outputfile))
                cnt = 0
                try:
                        try:
                                # loop writes output line-by-line; each line
                                # read still carries its terminator, so strip
                                # it before adding exactly one '\n'
                                for cnt,line in enumerate(i,1):
                                        o.write(u'%s\n'%(self._normalize(line.rstrip(u'\r\n'))))
                                        if not cnt%5000: sys.stderr.write('.')
                                sys.stderr.write('\n')
                        finally:
                                # close input and output files, also on error
                                i.close()
                                o.close()

                        if not out == self.outputfile:
                                import shutil
                                shutil.move(out,self.outputfile)

                except KeyboardInterrupt:
                        # drop the partial result, then let the interrupt through
                        os.unlink(out)
                        raise

                if not os.path.exists(self.outputfile):
                        self.errors.append([__name__,'missing','[%s] %s'%(__name__,self.outputfile)])
                        logger.error('%s\t%s',*self.errors[-1][1:])
 
126
 
 
127
def usage():
        '''Return the command prompt help text.

        The text names the running script (sys.argv[0] without its folder)
        and shows the import line, using the script name without extension.
        '''
        script = os.path.basename(sys.argv[0])
        module = os.path.splitext(script)[0]
        return "\n%s\n\tUsage:\n\tfrom %s import filter\n"%(script,module)
 
133
 
 
134
# License notice text (LGPL v3 or later) kept as a unicode string.
licensetxt=u'''CorpusFiltergraph™ v4.0
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.

For more information, please contact Precision Translation Tools Co., Ltd.
at: http://www.precisiontranslationtools.com'''
 
152
 
 
153
if __name__ == "__main__":
        # Run as a script: print the usage text.
        import os
        import sys
        # usage() already returns a native string. Calling .encode('utf8') on
        # it implicitly ASCII-decodes a byte string on Python 2 (failing for a
        # non-ASCII script path) and yields bytes on Python 3, where
        # bytes + str raises TypeError. Write the text as it is.
        sys.stdout.write(usage()+'\n')