2
2
# -*- coding: utf-8 -*-
3
# wlc.py, wordlist creator, extracts words from txt, transcode and put in sqlitedb
4
# v0.1 Alex Stanev 2011
3
# wlc.py, wordlist creator
4
# extracts words from txt/html files, transcode and put in sqlitedb
5
# see README for more info
5
6
# The source code is distributed under GPLv3 license
12
12
from optparse import OptionParser
14
14
print 'Wordlist creator v0.1 by Alex Stanev'
19
19
parser = OptionParser(usage=usage)
20
20
parser.add_option('-l', '--lang', dest='lang', type='string',
21
default='en', help='Input files charsets [default: %default]')
21
default='en', help='Input files charset [default: %default]')
22
22
parser.add_option('-s', '--strip-html', dest='strip_html', type='string',
23
default='off', help='Strip html tags [default: %default]')
23
default='off', help='Strip html tags [default: %default]')
24
24
parser.add_option('-d', '--database', dest='db', type='string',
25
default='wlc.db', help='Output sqlite3 db [default: %default]')
25
default='wlc.db', help='Output sqlite3 db [default: %default]')
26
26
parser.add_option('-g', '--glob', dest='pattern', type='string',
27
default='txt', help='File pattern to [default: %default]')
27
default='txt', help='File pattern to [default: %default]')
28
28
parser.add_option('-t', '--translate', dest='translate', type='string',
29
default='off', help='Translate cyr chars to en [default: %default]')
29
default='off', help='Transcode cyr char to en [default: %default]')
30
30
parser.add_option('-p', '--path', dest='path', type='string',
31
help='Path to directory where texts live')
31
help='Path to directory where texts live')
32
32
(options, args) = parser.parse_args()
33
33
if not options.path:
34
34
parser.error('no path specified')
92
92
# do translation, return True for next table
93
def trans(input, tab):
93
def trans(instr, tab):
96
96
for diphthong, value in tab[0].items():
97
97
first = first.replace(diphthong, value)
137
137
for word in words:
140
for tab in [cyr, cyr6]:
141
(tword, nexttab) = trans(word, tab)
142
c.execute('INSERT OR IGNORE INTO wl VALUES (?,?,0)', (tword, None))
140
for ttab in [cyr, cyr6]:
141
(tword, nexttab) = trans(word, ttab)
142
c.execute('INSERT OR IGNORE INTO wl VALUES (?, NULL, 0)', (tword,))
143
143
c.execute('UPDATE wl SET freq=freq+1 WHERE word=?', (tword,))
147
c.execute('INSERT OR IGNORE INTO wl VALUES (?,?,0)', (word, None))
147
c.execute('INSERT OR IGNORE INTO wl VALUES (?, NULL, 0)', (word,))
148
148
c.execute('UPDATE wl SET freq=freq+1 WHERE word=?', (word,))
149
149
print('Updating word lengths...')
150
150
c.execute('UPDATE wl SET ccount=length(word) WHERE ccount IS NULL')