6
6
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
7
7
* file except in compliance with the License. You can obtain a copy of the CDDL at
8
8
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
9
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10
10
* specific language governing permissions and limitations under the License. When
11
11
* distributing the software, include this License Header Notice in each file and
12
12
* include the full text of the License in the License file as well as the
13
13
* following notice:
15
15
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
17
17
* For Covered Software in this distribution, this License shall be governed by the
19
19
* Any litigation relating to this License shall be subject to the jurisdiction of
20
20
* the Federal Courts of the Northern District of California and the state courts
21
21
* of the State of California, with venue lying in Santa Clara County, California.
25
25
* If you wish your version of this file to be governed by only the CDDL or only
26
26
* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27
27
* include this software in this distribution under the [CDDL or LGPL Version 2.1]
62
62
#include "arpa_conv.h"
65
void ShowUsage(const char* progname)
66
ShowUsage(const char* progname)
67
68
printf("Usage:\n");
68
69
printf(" %s arpa_slm dict_file threaded_slm\n", progname);
70
71
printf("Description:\n");
71
printf(" %s converts the ARPA representation of SLM to the binary format of threaded SLM. \n", progname);
73
" %s converts the ARPA representation of SLM to the binary format of threaded SLM. \n",
78
81
* bow_eff, bow_values [out]
81
void build_map(const CArpaSlm& slm, EffRealMap &pr_eff, FreqMap& pr_values, EffRealMap &bow_eff, FreqMap& bow_values)
85
build_map(const CArpaSlm& slm,
83
91
bool usingLogPr = slm.usingLogPr();
85
93
printf("\nfirst pass..."); fflush(stdout);
87
for (unsigned lvl=0; lvl < slm.getN(); ++lvl) {
95
for (unsigned lvl = 0; lvl < slm.getN(); ++lvl) {
88
96
typedef CArpaSlm::TNodeLevel TNodeLevel;
89
97
const TNodeLevel& level = slm.getLevel(lvl);
90
for (TNodeLevel::const_iterator node = level.begin(); node != level.end(); ++node) {
98
for (TNodeLevel::const_iterator node = level.begin();
91
101
float real_pr, eff_pr;
92
102
real_pr = node->pr;
93
103
eff_pr = EffectivePr(real_pr);
125
137
// Following pr value should not be grouped, or as milestone values.
126
138
static const float msprs[] = {
127
139
0.9, 0.8, 0.7, 0.6,
128
1.0/2, 1.0/4, 1.0/8, 1.0/16, 1.0/32, 1.0/64, 1.0/128,
129
1.0/256, 1.0/512, 1.0/1024, 1.0/2048, 1.0/4096, 1.0/8192,
130
1.0/16384, 1.0/32768, 1.0/65536
140
1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128,
141
1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192,
142
1.0 / 16384, 1.0 / 32768, 1.0 / 65536
133
for (unsigned i=0, sz=sizeof(msprs)/sizeof(float); i < sz; ++i) {
134
float real_pr = (usingLogPr)?(-log(msprs[i])):(msprs[i]);
145
for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
146
float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
135
147
float eff_pr = EffectivePr(real_pr);
136
148
assert(usingLogPr || (real_pr > 0.0 && real_pr < 1.0));
137
149
assert(!usingLogPr || real_pr > 0.0);
151
163
0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
154
for (unsigned i=0; i < sizeof(msbows)/sizeof(msbows[0]); ++i) {
155
float real_bow = (usingLogPr)?(-log(msbows[i])):(msbows[i]);
166
for (unsigned i = 0; i < sizeof(msbows) / sizeof(msbows[0]); ++i) {
167
float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
156
168
float eff_bow = EffectiveBow(real_bow);
157
169
if (bow_eff.find(eff_bow) == bow_eff.end()) {
158
170
bow_eff[eff_bow] = real_bow;
167
179
* group values into a smaller set of their approximations
169
181
* bow_eff [in], bow_values [in], bow_map [out], bow_table [out]
170
182
* pr_eff [in], pr_values [in], pr_map [out], pr_table [out]
173
void group_values(bool usingLogPr,
174
EffRealMap& pr_eff, FreqMap& pr_values, CompressedTable& pr_table, RealIndexMap& pr_map,
175
EffRealMap& bow_eff, FreqMap& bow_values, CompressedTable& bow_table, RealIndexMap& bow_map)
186
group_values(bool usingLogPr,
189
CompressedTable& pr_table,
190
RealIndexMap& pr_map,
193
CompressedTable& bow_table,
194
RealIndexMap& bow_map)
177
196
printf("\nCompressing pr values..."); fflush(stdout);
178
197
CValueCompressor vc;
194
213
printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
197
TLexicon read_lexicon(const char* filename)
217
read_lexicon(const char* filename)
199
219
printf("Loading lexicon..."); fflush(stdout);
200
static char word[1024*10];
220
static char word[1024 * 10];
201
221
FILE* f_lex = fopen(filename, "r");
202
222
TLexicon lexicon;
203
223
while (fgets(word, sizeof(word), f_lex)) {
238
258
// levels[N] [in]
239
259
// lastLevel [in]
241
void write_out(const char* filename, const CArpaSlm& slm,
242
CompressedTable& pr_table, CompressedTable& bow_table,
243
const TNodeLevels& levels, const CThreadSlm::TLeaf* lastLevel)
262
write_out(const char* filename, const CArpaSlm& slm,
263
CompressedTable& pr_table, CompressedTable& bow_table,
264
const TNodeLevels& levels, const CThreadSlm::TLeaf* lastLevel)
245
266
printf("\nWriting out..."); fflush(stdout);
247
268
FILE* fp = fopen(filename, "wb");
248
269
const int N = slm.getN();
249
270
fwrite(&N, sizeof(int), 1, fp);
250
271
const unsigned usingLogPr = slm.usingLogPr();
251
272
fwrite(&usingLogPr, sizeof(unsigned), 1, fp);
253
274
for (int lvl = 0; lvl <= N; ++lvl) {
254
unsigned len = slm.getLevelSize(lvl)+1;
275
unsigned len = slm.getLevelSize(lvl) + 1;
255
276
fwrite(&len, sizeof(unsigned), 1, fp);
258
for (int i = 0, sz = pr_table.size(); i < (1 << CThreadSlm::BITS_PR); ++i) {
279
for (int i = 0, sz = pr_table.size(); i < (1 << CThreadSlm::BITS_PR);
260
282
fwrite(&pr_table[i], sizeof(float), 1, fp);
263
285
fwrite(&dummy, sizeof(float), 1, fp);
267
for (int i = 0, sz = bow_table.size(); i < (1 << CThreadSlm::BITS_BOW); ++i) {
289
for (int i = 0, sz = bow_table.size(); i < (1 << CThreadSlm::BITS_BOW);
269
292
fwrite(&bow_table[i], sizeof(float), 1, fp);
272
295
fwrite(&dummy, sizeof(float), 1, fp);
276
for (int lvl=0; lvl < N; ++lvl) {
277
fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(lvl)+1, fp);
299
for (int lvl = 0; lvl < N; ++lvl) {
300
fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(
280
fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N)+1, fp);
304
fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N) + 1, fp);
284
308
printf("done!\n"); fflush(stdout);
288
void cleanup(CompressedTable& pr_table, CompressedTable& bow_table,
289
TNodeLevels& levels, CThreadSlm::TLeaf* lastLevel)
313
cleanup(CompressedTable& pr_table, CompressedTable& bow_table,
314
TNodeLevels& levels, CThreadSlm::TLeaf* lastLevel)
291
for (unsigned lvl=0; lvl < levels.size(); ++lvl)
316
for (unsigned lvl = 0; lvl < levels.size(); ++lvl)
292
317
delete[] levels[lvl];
293
318
delete[] lastLevel;
294
319
bow_table.clear();
295
320
pr_table.clear();
298
int main(int argc, char* argv[])
324
main(int argc, char* argv[])
302
327
ShowUsage(argv[0]);
303
328
const char* arpa_path = argv[1];
304
329
const char* lexicon_path = argv[2];
305
330
const char* threaded_path = argv[3];
308
333
TLexicon lexicon = read_lexicon(lexicon_path);
309
334
slm.load(arpa_path, lexicon);
311
336
if (!slm.good()) {
312
std::cerr << "Failed to load language model from " << arpa_path << "." << std::endl;
337
std::cerr << "Failed to load language model from " << arpa_path <<
317
EffRealMap pr_eff, bow_eff; // effval --> val
343
EffRealMap pr_eff, bow_eff; // effval --> val
318
344
FreqMap pr_values, bow_values; // effval --> freq
319
345
build_map(slm, pr_eff, pr_values, bow_eff, bow_values);
321
RealIndexMap pr_map, bow_map; // result: val --> int
322
CompressedTable pr_table, bow_table; // result: val vector
347
RealIndexMap pr_map, bow_map; // result: val --> int
348
CompressedTable pr_table, bow_table; // result: val vector
323
349
group_values(slm.usingLogPr(),
324
350
pr_eff, pr_values, pr_table, pr_map,
325
351
bow_eff, bow_values, bow_table, bow_map);
330
356
CThreadSlm::TLeaf* lastLevel;
331
357
compress(slm, pr_table, pr_map, bow_table, bow_map,
332
358
levels, lastLevel);
336
362
write_out(threaded_path, slm, pr_table, bow_table, levels, lastLevel);
338
364
cleanup(pr_table, bow_table, levels, lastLevel);