2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
* specific language governing permissions and limitations under the License. When
13
* distributing the software, include this License Header Notice in each file and
14
* include the full text of the License in the License file as well as the
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
* For Covered Software in this distribution, this License shall be governed by the
20
* laws of the State of California (excluding conflict-of-law provisions).
21
* Any litigation relating to this License shall be subject to the jurisdiction of
22
* the Federal Courts of the Northern District of California and the state courts
23
* of the State of California, with venue lying in Santa Clara County, California.
27
* If you wish your version of this file to be governed by only the CDDL or only
28
* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29
* include this software in this distribution under the [CDDL or LGPL Version 2.1]
30
* license." If you don't indicate a single choice of license, a recipient has the
31
* option to distribute your version of this file under either the CDDL or the LGPL
32
* Version 2.1, or to extend the choice of license to its licensees as provided
33
* above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34
* Version 2 license, then the option applies only if the new code is made subject
35
* to such option by the copyright holder.
54
#include "../sim_slm.h"
57
#include "ValueCompress.h"
59
class CSIMSlmWithIteration : public CSIMSlm{
61
struct TLevelIterator {
62
std::vector<int> m_history;
67
getLevelSize(int lvl) { return sz[lvl]; }
73
getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history);
76
beginLevelIteration(int lvl, TLevelIterator& it);
79
next(TLevelIterator& it);
82
isEnd(TLevelIterator& it);
85
getNodePtr(TLevelIterator& it);
88
findState(int n, TSIMWordId*hw);
91
findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon);
96
adjustIterator(TLevelIterator& it);
100
CSIMSlmWithIteration::findState(int n, TSIMWordId*hw)
102
if (n == 0) return 0;
105
while (n > N) { --n; ++hw; }
107
void* pstate = ((TNode*)level[0]);
108
for (int lvl=0; lvl < n && pstate != NULL; ++lvl) {
109
int h = ((TNode*)pstate)->child;
110
int t = (((TNode*)pstate)+1)->child;
112
TLeaf* p = (TLeaf*)level[lvl+1];
113
pstate = (void*)binary_find_id(p+h, p+t, hw[lvl]);
114
m = (pstate != NULL)?(((TLeaf*)pstate) - p):(-1);
116
TNode* p = (TNode*)level[lvl+1];
117
pstate = (void*)binary_find_id(p+h, p+t, hw[lvl]);
118
m = (pstate != NULL)?(((TNode*)pstate) - p):(-1);
125
CSIMSlmWithIteration::findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon)
129
int idx = findState(n, hw);
130
if (idx >= 0 && ((TNode*)(level[n]))[idx].child < ((TNode*)(level[n]))[idx+1].child) {
131
bol = n; bon = idx; return;
139
CSIMSlmWithIteration::getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history)
142
for (int i=1, tmp_sz=it.m_history.size(); i < tmp_sz; ++i) {
143
int idx = it.m_history[i];
145
history.push_back(((TLeaf*)(level[i]))[idx].id);
147
history.push_back(((TNode*)(level[i]))[idx].id);
152
CSIMSlmWithIteration::beginLevelIteration(int lvl, TLevelIterator& it)
154
it.m_history.clear();
155
for (int i=0, tmp_sz=lvl; i <= tmp_sz; ++i)
156
it.m_history.push_back(0);
161
CSIMSlmWithIteration::next(TLevelIterator& it)
163
++(it.m_history.back());
168
CSIMSlmWithIteration::isEnd(TLevelIterator& it)
170
return ((it.m_history.back()+1 >= sz[it.m_history.size()-1]));
174
CSIMSlmWithIteration::adjustIterator(TLevelIterator& it)
176
int ch = it.m_history.back();
177
for (int i= it.m_history.size()-2; i >= 0; --i) {
179
int& parent = it.m_history[i];
180
TNode* pn = (TNode*)(level[i]);
181
while (parent < len && pn[parent+1].child <= ch)
188
CSIMSlmWithIteration::getNodePtr(TLevelIterator& it)
190
int lvl = it.m_history.size()-1;
191
int idx = it.m_history.back();
193
return (((TLeaf*)(level[lvl]))+idx);
195
return (((TNode*)(level[lvl]))+idx);
203
printf(" slmthread primitive_slm threaded_slm\n");
204
printf("\nDescription:\n");
205
printf(" slmthread add back-off-state for each slm node in the primitive_slm. ");
206
printf("Also it compresses 32-bit float into 16 bit representation.\n\n");
211
CThreadSlm::TNode* levels[16];
212
CThreadSlm::TLeaf* lastLevel;
214
int main(int argc, char* argv[])
217
unsigned int bol, bon;
218
CSIMSlmWithIteration slm;
219
std::vector<TSIMWordId> history;
220
float real_pr, eff_pr, real_bow, eff_bow;
222
std::map<float, float> pr_eff, bow_eff; // effval --> val
223
std::map<float, int> pr_values, bow_values; // effval --> freq
224
std::map<float, int> pr_map, bow_map; // result: val --> int
225
std::vector<float> pr_table, bow_table; // result: val vector
226
std::vector<float>::iterator itt, itte;
231
printf("Loading original slm..."); fflush(stdout);
232
if (slm.Load(argv[1]) == false)
235
bool usingLogPr = slm.isUseLogPr();
237
#define EffectivePr(a) (float((usingLogPr)?((a)/log(2.0)):(-log2((a)))))
238
#define OriginalPr(b) (float((usingLogPr)?((b)*log(2.0)):(exp2(-(b)))))
239
#define EffectiveBow(a) (float((usingLogPr)?(exp(-(a))):((a))))
240
#define OriginalBow(b) (float((usingLogPr)?(-log((b))):((b))))
242
printf("\nfirst pass..."); fflush(stdout);
243
for (int lvl=0; lvl <= slm.getN(); ++lvl) {
244
CSIMSlmWithIteration::TLevelIterator it;
245
slm.beginLevelIteration(lvl, it);
246
for (; !slm.isEnd(it); slm.next(it)) {
247
CSIMSlm::TLeaf* pl = slm.getNodePtr(it);
249
eff_pr = EffectivePr(real_pr);
250
if (pr_eff.find(eff_pr) == pr_eff.end()) {
251
pr_eff[eff_pr] = real_pr;
252
} else { // precision error cause non 1:1 mapping
253
pr_eff[eff_pr] = OriginalPr(eff_pr);
255
++(pr_values[eff_pr]);
256
if (lvl < slm.getN()) {
257
real_bow = ((CSIMSlm::TNode*)pl)->bow;
258
eff_bow = EffectiveBow(real_bow);
259
if (bow_eff.find(eff_bow) == bow_eff.end()) {
260
bow_eff[eff_bow] = real_bow;
261
} else { // two values map to same distance value due to precision error
262
bow_eff[eff_bow] = OriginalBow(eff_bow);
264
++(bow_values[eff_bow]);
269
// Following pr value should not be grouped, or as milestone values.
270
static float msprs[] = {
272
1.0/2, 1.0/4, 1.0/8, 1.0/16, 1.0/32, 1.0/64, 1.0/128,
273
1.0/256, 1.0/512, 1.0/1024, 1.0/2048, 1.0/4096, 1.0/8192,
274
1.0/16384, 1.0/32768, 1.0/65536
277
for (unsigned i=0, sz=sizeof(msprs)/sizeof(float); i < sz; ++i) {
278
float real_pr = (usingLogPr)?(-log(msprs[i])):(msprs[i]);
279
float eff_pr = EffectivePr(real_pr);
280
if (pr_eff.find(eff_pr) == pr_eff.end()) {
281
pr_eff[eff_pr] = real_pr;
282
} else { // precision error cause non 1:1 mapping
283
pr_eff[eff_pr] = OriginalPr(eff_pr);
285
pr_values[eff_pr] = 0;
288
// Following bow value should not be grouped, or as milestone values.
289
static float msbows[] = {
290
1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
291
0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
292
0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
295
for (unsigned i=0, sz=sizeof(msbows)/sizeof(float); i < sz; ++i) {
296
float real_bow = (usingLogPr)?(-log(msbows[i])):(msbows[i]);
297
float eff_bow = EffectiveBow(real_bow);
298
if (bow_eff.find(eff_bow) == bow_eff.end()) {
299
bow_eff[eff_bow] = real_bow;
300
} else { // two values map to same distance value due to precision error
301
bow_eff[eff_bow] = OriginalBow(eff_bow);
303
bow_values[eff_bow] = 0;
306
printf("\nCompressing pr values..."); fflush(stdout);
307
vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));
309
itte = pr_table.end();
310
for (itt = pr_table.begin(); itt != itte; ++itt) {
311
*itt = OriginalPr(*itt);
312
assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));
313
assert(!usingLogPr || *itt > 0.0);
315
printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());
317
printf("\nCompressing bow values..."); fflush(stdout);
318
vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));
320
itte = bow_table.end();
321
for (itt = bow_table.begin(); itt != itte; ++itt)
322
*itt = OriginalBow(*itt);
323
printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
326
printf("\nThreading the new model..."); fflush(stdout);
327
for (int lvl=0; lvl < slm.getN(); ++lvl) {
328
levels[lvl] = new CThreadSlm::TNode[slm.getLevelSize(lvl)];
330
CSIMSlmWithIteration::TLevelIterator it;
331
slm.beginLevelIteration(lvl, it);
332
for (; !slm.isEnd(it); slm.next(it)) {
333
slm.getIdString(it, history);
334
if (history.size() == 0) {
335
slm.findBackOffState(lvl, NULL, bol, bon);
337
slm.findBackOffState(lvl, &history[0], bol, bon);
340
CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);
341
CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];
343
std::map<float, int>::iterator prit = pr_map.find(pn->pr);
344
if (prit == pr_map.end()) { // This would be cause by precision error
345
double val = EffectivePr(pn->pr);
346
val = OriginalPr(val);
347
prit = pr_map.find(val);
348
assert(prit != pr_map.end());
350
int idx_pr = prit->second;
357
std::map<float, int>::iterator bowit = bow_map.find(pn->bow);
358
if (bowit == bow_map.end()) { // precision error
359
double val = EffectiveBow(pn->bow);
360
val = OriginalBow(val);
361
bowit = bow_map.find(val);
362
assert(bowit != bow_map.end());
364
int idx_bow = bowit->second;
367
nn.set_ch(pn->child);
369
assert(usingLogPr || (pr_table[idx_pr] > 0.0 && pr_table[idx_pr] < 1.0));
370
assert(!usingLogPr || pr_table[idx_pr] > 0.0);
372
CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);
373
CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];
374
nn.set_ch(pn->child);
378
lastLevel = new CThreadSlm::TLeaf [slm.getLevelSize(slm.getN())];
379
CSIMSlmWithIteration::TLevelIterator it;
380
slm.beginLevelIteration(slm.getN(), it);
381
for (int lvl=slm.getN(); !slm.isEnd(it); slm.next(it)) {
382
CSIMSlm::TLeaf* pn = slm.getNodePtr(it);
383
slm.getIdString(it, history);
384
slm.findBackOffState(lvl, &history[0], bol, bon);
386
CThreadSlm::TLeaf& nn = lastLevel[it.m_history.back()];
388
std::map<float, int>::iterator prit = pr_map.find(pn->pr);
389
if (prit == pr_map.end()) { // This would be cause by precision error
390
double val = EffectivePr(pn->pr);
391
val = OriginalPr(val);
392
prit = pr_map.find(val);
393
assert(prit != pr_map.end());
395
int idx_pr = prit->second;
404
printf("\nWriting out..."); fflush(stdout);
407
fp = fopen(argv[2], "wb");
409
fwrite(&N, sizeof(int), 1, fp);
410
unsigned ulp = slm.isUseLogPr();
411
fwrite(&ulp, sizeof(unsigned), 1, fp);
413
for (int lvl = 0; lvl <= N; ++lvl) {
414
int len = slm.getLevelSize(lvl);
415
fwrite(&len, sizeof(int), 1, fp);
417
fwrite(&pr_table[0], sizeof(float), pr_table.size(), fp);
418
for (int i = pr_table.size(), sz=(1 << CThreadSlm::BITS_PR); i < sz; ++i)
419
fwrite(&dummy, sizeof(float), 1, fp);
421
fwrite(&bow_table[0], sizeof(float), bow_table.size(), fp);
422
for (int i = bow_table.size(), sz=(1 << CThreadSlm::BITS_BOW); i < sz; ++i)
423
fwrite(&dummy, sizeof(float), 1, fp);
425
for (int lvl=0; lvl < N; ++lvl)
426
fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(lvl), fp);
427
fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N), fp);
430
printf("done!\n"); fflush(stdout);
433
for (int lvl=0; lvl < N; ++lvl)
434
delete []levels[lvl];