~cosmos-door/+junk/libkkc-data

« back to all changes in this revision

Viewing changes to tools/sortlm.py

  • Committer: Mitsuya Shibata
  • Date: 2013-07-06 16:06:31 UTC
  • Revision ID: mty.shibata@gmail.com-20130706160631-rpwsfk1k5fvznehm
Initial commit of Debian packaging.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/python
 
2
 
 
3
# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
 
4
# Copyright (C) 2011-2013 Red Hat, Inc.
 
5
 
 
6
# This program is free software: you can redistribute it and/or modify
 
7
# it under the terms of the GNU General Public License as published by
 
8
# the Free Software Foundation, either version 3 of the License, or
 
9
# (at your option) any later version.
 
10
 
 
11
# This program is distributed in the hope that it will be useful,
 
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
14
# GNU General Public License for more details.
 
15
 
 
16
# You should have received a copy of the GNU General Public License
 
17
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
18
 
 
19
import struct
 
20
import marisa
 
21
import re
 
22
 
 
23
# Highest n-gram order handled (unigrams, bigrams, trigrams).
NGRAM = 3

# One ARPA n-gram line: "<log10-cost> <token(s)> [<backoff>]".
# Group 1: cost, group 2: the token field (tab-free, non-greedy),
# group 3 (optional): backoff weight.  Raw string so the \t escapes
# are passed through to the regex engine verbatim.
NGRAM_LINE_REGEX = r'^([-0-9.]+)[ \t]+([^\t]+?)(?:[ \t]+([-0-9.]+))?$'
 
25
 
 
26
class SortedGenerator(object):
    """Convert an ARPA-format N-gram language model into sorted binary files.

    read() scans the ARPA file twice: first to build marisa tries over the
    vocabulary tokens and their input (reading) halves, then to collect every
    n-gram's (cost, backoff) pair keyed by tuples of vocab-trie key ids.
    write() emits:

        <prefix>.1gram.index, <prefix>.input   -- the two saved tries
        <prefix>.1gram/.2gram/.3gram           -- fixed-width sorted records

    Ported to run on Python 3 (print function, dict.items, key-based sort)
    while remaining valid Python 2; observable behavior is unchanged.
    """

    def __init__(self, infile, output_prefix):
        """infile: readable file object holding the ARPA model;
        output_prefix: path prefix for every file written by write()."""
        self.__infile = infile
        self.__output_prefix = output_prefix
        self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)

        # One {key-id tuple: (cost, backoff)} mapping per n-gram order.
        self.__ngram_entries = [{} for _ in range(NGRAM)]

        self.__vocab_keyset = marisa.Keyset()
        self.__input_keyset = marisa.Keyset()

        self.__vocab_trie = marisa.Trie()
        self.__input_trie = marisa.Trie()

        self.__min_cost = 0.0

    def read(self):
        """Read the ARPA file: build the tries, then load all n-grams."""
        print("reading N-grams")
        self.__read_tries()
        self.__read_ngrams()
        print("min cost = %lf" % self.__min_cost)

    def __read_tries(self):
        # Skip the header up to the start of the unigram section.
        while True:
            line = self.__infile.readline()
            if line == "":
                break
            if line.startswith("\\1-grams"):
                break

        # Every unigram token goes into the vocab trie; tokens shaped
        # "<reading>/<surface>" additionally contribute their reading to
        # the input trie.  Sentinels (<s>, </s>, <UNK>) are vocab-only.
        while True:
            line = self.__infile.readline()
            if line == "":
                break
            line = line.strip()
            if line == "":
                # A blank line terminates the unigram section.
                break
            match = self.__ngram_line_regex.match(line)
            if not match:
                continue
            strv = match.groups()
            self.__vocab_keyset.push_back(strv[1])
            if strv[1] not in ("<s>", "</s>", "<UNK>"):
                if "/" not in strv[1]:
                    continue
                # Strict two-way unpack, as in the original: a token with
                # more than one "/" raises ValueError (malformed input).
                reading, _surface = strv[1].split("/")
                self.__input_keyset.push_back(reading)

        self.__vocab_trie.build(self.__vocab_keyset)
        self.__input_trie.build(self.__input_keyset)

    def __read_ngrams(self):
        # Second pass: translate every n-gram into a tuple of vocab-trie
        # key ids and remember its (cost, backoff) pair.
        self.__infile.seek(0)
        for n in range(1, NGRAM + 1):
            # Find the "\<n>-grams:" section header.
            while True:
                line = self.__infile.readline()
                if line == "":
                    break
                if line.startswith("\\%s-grams:" % n):
                    break

            while True:
                line = self.__infile.readline()
                if line == "":
                    break
                line = line.strip()
                if line == "":
                    break
                match = self.__ngram_line_regex.match(line)
                if not match:
                    continue
                strv = match.groups()
                ids = []
                for word in strv[1].split(" "):
                    agent = marisa.Agent()
                    agent.set_query(word)
                    if not self.__vocab_trie.lookup(agent):
                        # NOTE(review): a word missing from the trie is
                        # skipped, silently shortening the id tuple --
                        # behavior preserved from the original.
                        continue
                    ids.append(agent.key_id())
                cost = float(strv[0])
                # -99 is the conventional ARPA "no probability" marker;
                # it must not drag the minimum cost down.
                if cost != -99 and cost < self.__min_cost:
                    self.__min_cost = cost
                backoff = float(strv[2]) if strv[2] else 0.0
                self.__ngram_entries[n - 1][tuple(ids)] = (cost, backoff)

    def write(self):
        """Write the tries and the binary n-gram files."""
        # The quantization range is pinned to -8.0 rather than the
        # minimum observed by read(), so output files use a fixed scale.
        self.__min_cost = -8.0
        self.__write_tries()
        self.__write_ngrams()

    def __write_tries(self):
        # Persist both tries next to the binary n-gram files.
        self.__vocab_trie.save(self.__output_prefix + ".1gram.index")
        self.__input_trie.save(self.__output_prefix + ".input")

    def __write_ngrams(self):
        def quantize(cost, min_cost):
            # Map cost in [min_cost, 0] linearly onto [0, 65535], clamped.
            return max(0, min(65535, int(cost * 65535 / min_cost)))

        def header_key(item):
            # Sort records by their packed binary header only.  Headers are
            # unique (one offset per id), so this matches the original
            # cmp-based sort exactly.
            return item[0]

        print("writing 1-gram file")
        unigram_offsets = {}
        offset = 0
        # One record per unigram in key-id order:
        # (cost, backoff, reserved) as three uint16s.
        with open("%s.1gram" % self.__output_prefix, "wb") as unigram_file:
            for ids, value in sorted(self.__ngram_entries[0].items()):
                unigram_offsets[ids[0]] = offset
                unigram_file.write(struct.pack(
                    "=HHH",
                    quantize(value[0], self.__min_cost),
                    quantize(value[1], self.__min_cost),
                    0))  # reserved
                offset += 1

        print("writing 2-gram file")
        bigram_offsets = {}
        offset = 0
        # Header packs (last word id, parent unigram offset) as two uint32s;
        # the payload is (cost, backoff) as two uint16s.
        entries = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids)
                   for ids in self.__ngram_entries[1]]
        with open("%s.2gram" % self.__output_prefix, "wb") as bigram_file:
            for header, ids in sorted(entries, key=header_key):
                value = self.__ngram_entries[1][ids]
                bigram_offsets[ids] = offset
                bigram_file.write(header + struct.pack(
                    "=HH",
                    quantize(value[0], self.__min_cost),
                    quantize(value[1], self.__min_cost)))
                offset += 1

        if len(self.__ngram_entries[2]) > 0:
            print("writing 3-gram file")
            # Header packs (last word id, parent bigram offset); the
            # payload is the quantized cost alone (no trigram backoff).
            entries = [(struct.pack("=LL", ids[2],
                                    bigram_offsets[(ids[0], ids[1])]), ids)
                       for ids in self.__ngram_entries[2]]
            with open("%s.3gram" % self.__output_prefix, "wb") as trigram_file:
                for header, ids in sorted(entries, key=header_key):
                    value = self.__ngram_entries[2][ids]
                    trigram_file.write(header + struct.pack(
                        "=H",
                        quantize(value[0], self.__min_cost)))
 
173
 
 
174
if __name__ == '__main__':
    import sys
    import argparse

    # Command-line driver: read the ARPA model from INFILE (or stdin)
    # and write the binary files under OUTPUT_PREFIX.
    arg_parser = argparse.ArgumentParser(description='sortlm')
    arg_parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                            default=sys.stdin,
                            help='language model file')
    arg_parser.add_argument('output_prefix', metavar='OUTPUT_PREFIX', type=str,
                            help='output file prefix')
    cli_args = arg_parser.parse_args()

    sorter = SortedGenerator(cli_args.infile, cli_args.output_prefix)
    sorter.read()
    sorter.write()