~ubuntu-branches/ubuntu/trusty/sunpinyin/trusty-proposed

« back to all changes in this revision

Viewing changes to src/slm/thread/slmthread.cpp

Committer: Bazaar Package Importer
Author(s): Zhengpeng Hou
Date: 2010-09-06 12:23:46 UTC
Revision ID: james.westby@ubuntu.com-20100906122346-yamofztk2j5p85fs

Tags: upstream-2.0.2

Import upstream version 2.0.2

files added:

AUTHORS

COPYING

ChangeLog

LGPL.LICENSE

NEWS

OPENSOLARIS.LICENSE

README

SConstruct

TODO

build

build/Makefile

build/SConscript

data

data/Makefile

data/cnpunc.png

data/cnpunc.svg

data/eng.png

data/eng.svg

data/enpunc.png

data/enpunc.svg

data/fullwidth.png

data/fullwidth.svg

data/halfwidth.png

data/halfwidth.svg

data/han.png

data/han.svg

data/setup.svg

data/sunpinyin_logo.xpm

doc/AuxRequire.txt

doc/README

doc/genpyt.pod

doc/getWordFreq.pod

doc/idngram_merge.pod

doc/ids2ngram.pod

doc/mmseg.pod

doc/slmbuild.pod

doc/slminfo.pod

doc/slmprune.pod

doc/slmseg.pod

doc/slmthread.pod

doc/tslmendian.pod

doc/tslminfo.pod

doc/tslmpack.pod

python

python/imdict.py

python/importer

python/importer/import_fcitx_userdict.py

python/importer/import_fit_userdict.py

python/importer/import_google_userdict.py

python/importer/import_qim_userdict.py

python/importer/import_qq_userdict.py

python/importer/import_sogou_celldict.py

python/importer/import_sogou_userdict.py

python/importer/import_ziguang_userdict.py

python/importer/importer.py

python/importer/pinyin_data.py

python/mmseg.py

python/pinyin_data.py

python/pyslm.pyx

python/pytrie.pyx

python/setup.py

python/test.py

python/trie.py

python/utils.py

raw/Makefile

src/config

src/config/beos.h

src/config/linux.h

src/config/mac.h

src/config/solaris.h

src/ime-core

src/ime-core/ic_history.cpp

src/ime-core/ic_history.h

src/ime-core/imi_context.cpp

src/ime-core/imi_context.h

src/ime-core/imi_data.cpp

src/ime-core/imi_data.h

src/ime-core/imi_defines.h

src/ime-core/imi_funcobjs.cpp

src/ime-core/imi_funcobjs.h

src/ime-core/imi_keys.h

src/ime-core/imi_option_event.cpp

src/ime-core/imi_option_event.h

src/ime-core/imi_option_keys.h

src/ime-core/imi_options.cpp

src/ime-core/imi_options.h

src/ime-core/imi_uiobjects.cpp

src/ime-core/imi_uiobjects.h

src/ime-core/imi_view.cpp

src/ime-core/imi_view.h

src/ime-core/imi_view_classic.cpp

src/ime-core/imi_view_classic.h

src/ime-core/imi_winHandler.cpp

src/ime-core/imi_winHandler.h

src/ime-core/lattice_states.cpp

src/ime-core/lattice_states.h

src/ime-core/userdict.cpp

src/ime-core/userdict.h

src/ime-core/utils.h

src/lexicon

src/lexicon/genPYT.cpp

src/lexicon/pytrie.cpp

src/lexicon/pytrie.h

src/lexicon/pytrie_gen.cpp

src/lexicon/pytrie_gen.h

src/lexicon/trie_writer.cpp

src/lexicon/trie_writer.h

src/pinyin

src/pinyin/datrie.h

src/pinyin/datrie_impl.h

src/pinyin/pinyin_data.cpp

src/pinyin/pinyin_data.h

src/pinyin/pinyin_seg.cpp

src/pinyin/pinyin_seg.h

src/pinyin/quanpin_trie.h

src/pinyin/segmentor.h

src/pinyin/shuangpin_data.cpp

src/pinyin/shuangpin_data.h

src/pinyin/shuangpin_seg.cpp

src/pinyin/shuangpin_seg.h

src/pinyin/syllable.h

src/portability.cpp

src/portability.h

src/slm

src/slm/getWordFreq

src/slm/getWordFreq/getWordFreq.cpp

src/slm/ids2ngram

src/slm/ids2ngram/idngram.h

src/slm/ids2ngram/idngram_merge.cpp

src/slm/ids2ngram/idngram_merge.h

src/slm/ids2ngram/ids2ngram.cpp

src/slm/mmseg

src/slm/mmseg/mmseg.cpp

src/slm/sim_dict.cpp

src/slm/sim_dict.h

src/slm/sim_fmerge.h

src/slm/sim_sen.cpp

src/slm/sim_sen.h

src/slm/sim_slm.cpp

src/slm/sim_slm.h

src/slm/sim_slmbuilder.cpp

src/slm/sim_slmbuilder.h

src/slm/slm.cpp

src/slm/slm.h

src/slm/slmbuild

src/slm/slmbuild/slmbuild.cpp

src/slm/slminfo

src/slm/slminfo/slminfo.cpp

src/slm/slmprune

src/slm/slmprune/slmprune.cpp

src/slm/slmseg

src/slm/slmseg/slmseg.cpp

src/slm/thread

src/slm/thread/ValueCompress.cpp

src/slm/thread/ValueCompress.h

src/slm/thread/slmthread.cpp

src/slm/thread/test_vc.cpp

src/slm/tools

src/slm/tools/clean_rmrb.cpp

src/slm/tools/dumpdict.cpp

src/slm/tslmendian

src/slm/tslmendian/slm_endian.cpp

src/slm/tslmendian/slm_file.cpp

src/slm/tslmendian/slm_file.h

src/slm/tslmendian/writer.cpp

src/slm/tslmendian/writer.h

src/slm/tslminfo

src/slm/tslminfo/tslminfo.cpp

src/slm/tslmpack

src/slm/tslmpack/arpa_conv.cpp

src/slm/tslmpack/arpa_conv.h

src/slm/tslmpack/arpa_slm.cpp

src/slm/tslmpack/arpa_slm.h

src/slm/tslmpack/common.h

src/slm/tslmpack/slmpack.cpp

src/sunpinyin.h

swap

swap/README

Show diffs side-by-side

added added

removed removed

src/slm/thread/slmthread.cpp

* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.

* The contents of this file are subject to the terms of either the GNU Lesser

* General Public License Version 2.1 only ("LGPL") or the Common Development and

* Distribution License ("CDDL")(collectively, the "License"). You may not use this

* file except in compliance with the License. You can obtain a copy of the CDDL at

* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at

* http://www.opensource.org/licenses/lgpl-license.php. See the License for the

* specific language governing permissions and limitations under the License. When

* distributing the software, include this License Header Notice in each file and

* include the full text of the License in the License file as well as the

* following notice:

* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE

* (CDDL)

* For Covered Software in this distribution, this License shall be governed by the

* laws of the State of California (excluding conflict-of-law provisions).

* Any litigation relating to this License shall be subject to the jurisdiction of

* the Federal Courts of the Northern District of California and the state courts

* of the State of California, with venue lying in Santa Clara County, California.

* Contributor(s):

* If you wish your version of this file to be governed by only the CDDL or only

* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to

* include this software in this distribution under the [CDDL or LGPL Version 2.1]

* license." If you don't indicate a single choice of license, a recipient has the

* option to distribute your version of this file under either the CDDL or the LGPL

* Version 2.1, or to extend the choice of license to its licensees as provided

* above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL

* Version 2 license, then the option applies only if the new code is made subject

* to such option by the copyright holder.

#ifdef HAVE_CONFIG_H

#include "config.h"

#endif

#ifdef HAVE_ASSERT_H

#include <assert.h>

#endif

#include <stdio.h>

#include <unistd.h>

#include <stdlib.h>

#include <vector>

#include <map>

#include <math.h>

#include "../sim_slm.h"

#include "../slm.h"

#include "ValueCompress.h"

class CSIMSlmWithIteration : public CSIMSlm{

public:

struct TLevelIterator {

std::vector<int> m_history;

};

public:

int

getLevelSize(int lvl) { return sz[lvl]; }

int

getN() { return N; }

void

getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history);

void

beginLevelIteration(int lvl, TLevelIterator& it);

void

next(TLevelIterator& it);

bool

isEnd(TLevelIterator& it);

TLeaf*

getNodePtr(TLevelIterator& it);

int

findState(int n, TSIMWordId*hw);

void

findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon);

protected:

void

adjustIterator(TLevelIterator& it);

};

int

100

CSIMSlmWithIteration::findState(int n, TSIMWordId*hw)

101

{

102

if (n == 0) return 0;

103

104

int m = -1;

105

while (n > N) { --n; ++hw; }

106

107

void* pstate = ((TNode*)level[0]);

108

for (int lvl=0; lvl < n && pstate != NULL; ++lvl) {

109

int h = ((TNode*)pstate)->child;

110

int t = (((TNode*)pstate)+1)->child;

111

if (lvl == N-1) {

112

TLeaf* p = (TLeaf*)level[lvl+1];

113

pstate = (void*)binary_find_id(p+h, p+t, hw[lvl]);

114

m = (pstate != NULL)?(((TLeaf*)pstate) - p):(-1);

115

} else {

116

TNode* p = (TNode*)level[lvl+1];

117

pstate = (void*)binary_find_id(p+h, p+t, hw[lvl]);

118

m = (pstate != NULL)?(((TNode*)pstate) - p):(-1);

119

}

120

}

121

return m;

122

}

123

124

void

125

CSIMSlmWithIteration::findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon)

126

{

127

while (n > 1) {

128

--n; ++hw;

129

int idx = findState(n, hw);

130

if (idx >= 0 && ((TNode*)(level[n]))[idx].child < ((TNode*)(level[n]))[idx+1].child) {

131

bol = n; bon = idx; return;

132

}

133

}

134

bol = bon = 0;

135

return;

136

}

137

138

void

139

CSIMSlmWithIteration::getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history)

140

{

141

history.clear();

142

for (int i=1, tmp_sz=it.m_history.size(); i < tmp_sz; ++i) {

143

int idx = it.m_history[i];

144

if (i == N)

145

history.push_back(((TLeaf*)(level[i]))[idx].id);

146

else

147

history.push_back(((TNode*)(level[i]))[idx].id);

148

}

149

}

150

151

void

152

CSIMSlmWithIteration::beginLevelIteration(int lvl, TLevelIterator& it)

153

{

154

it.m_history.clear();

155

for (int i=0, tmp_sz=lvl; i <= tmp_sz; ++i)

156

it.m_history.push_back(0);

157

adjustIterator(it);

158

}

159

160

void

161

CSIMSlmWithIteration::next(TLevelIterator& it)

162

{

163

++(it.m_history.back());

164

adjustIterator(it);

165

}

166

167

bool

168

CSIMSlmWithIteration::isEnd(TLevelIterator& it)

169

{

170

return ((it.m_history.back()+1 >= sz[it.m_history.size()-1]));

171

}

172

173

void

174

CSIMSlmWithIteration::adjustIterator(TLevelIterator& it)

175

{

176

int ch = it.m_history.back();

177

for (int i= it.m_history.size()-2; i >= 0; --i) {

178

int len = sz[i];

179

int& parent = it.m_history[i];

180

TNode* pn = (TNode*)(level[i]);

181

while (parent < len && pn[parent+1].child <= ch)

182

++parent;

183

ch = parent;

184

}

185

}

186

187

CSIMSlm::TLeaf*

188

CSIMSlmWithIteration::getNodePtr(TLevelIterator& it)

189

{

190

int lvl = it.m_history.size()-1;

191

int idx = it.m_history.back();

192

if (lvl == N)

193

return (((TLeaf*)(level[lvl]))+idx);

194

else

195

return (((TNode*)(level[lvl]))+idx);

196

}

197

198

199

200

void ShowUsage()

201

{

202

printf("Usage:\n");

203

printf(" slmthread primitive_slm threaded_slm\n");

204

printf("\nDescription:\n");

205

printf(" slmthread add back-off-state for each slm node in the primitive_slm. ");

206

printf("Also it compresses 32-bit float into 16 bit representation.\n\n");

207

exit(100);

208

}

209

210

FILE* fp = NULL;

211

CThreadSlm::TNode* levels[16];

212

CThreadSlm::TLeaf* lastLevel;

213

214

int main(int argc, char* argv[])

215

{

216

CValueCompressor vc;

217

unsigned int bol, bon;

218

CSIMSlmWithIteration slm;

219

std::vector<TSIMWordId> history;

220

float real_pr, eff_pr, real_bow, eff_bow;

221

222

std::map<float, float> pr_eff, bow_eff; // effval --> val

223

std::map<float, int> pr_values, bow_values; // effval --> freq

224

std::map<float, int> pr_map, bow_map; // result: val --> int

225

std::vector<float> pr_table, bow_table; // result: val vector

226

std::vector<float>::iterator itt, itte;

227

228

if (argc != 3)

229

ShowUsage();

230

231

printf("Loading original slm..."); fflush(stdout);

232

if (slm.Load(argv[1]) == false)

233

ShowUsage();

234

235

bool usingLogPr = slm.isUseLogPr();

236

237

#define EffectivePr(a) (float((usingLogPr)?((a)/log(2.0)):(-log2((a)))))

238

#define OriginalPr(b) (float((usingLogPr)?((b)*log(2.0)):(exp2(-(b)))))

239

#define EffectiveBow(a) (float((usingLogPr)?(exp(-(a))):((a))))

240

#define OriginalBow(b) (float((usingLogPr)?(-log((b))):((b))))

241

242

printf("\nfirst pass..."); fflush(stdout);

243

for (int lvl=0; lvl <= slm.getN(); ++lvl) {

244

CSIMSlmWithIteration::TLevelIterator it;

245

slm.beginLevelIteration(lvl, it);

246

for (; !slm.isEnd(it); slm.next(it)) {

247

CSIMSlm::TLeaf* pl = slm.getNodePtr(it);

248

real_pr = pl->pr;

249

eff_pr = EffectivePr(real_pr);

250

if (pr_eff.find(eff_pr) == pr_eff.end()) {

251

pr_eff[eff_pr] = real_pr;

252

} else { // precision error cause non 1:1 mapping

253

pr_eff[eff_pr] = OriginalPr(eff_pr);

254

}

255

++(pr_values[eff_pr]);

256

if (lvl < slm.getN()) {

257

real_bow = ((CSIMSlm::TNode*)pl)->bow;

258

eff_bow = EffectiveBow(real_bow);

259

if (bow_eff.find(eff_bow) == bow_eff.end()) {

260

bow_eff[eff_bow] = real_bow;

261

} else { // two values map to same distance value due to precision error

262

bow_eff[eff_bow] = OriginalBow(eff_bow);

263

}

264

++(bow_values[eff_bow]);

265

}

266

}

267

}

268

269

// Following pr value should not be grouped, or as milestone values.

270

static float msprs[] = {

271

0.9, 0.8, 0.7, 0.6,

272

1.0/2, 1.0/4, 1.0/8, 1.0/16, 1.0/32, 1.0/64, 1.0/128,

273

1.0/256, 1.0/512, 1.0/1024, 1.0/2048, 1.0/4096, 1.0/8192,

274

1.0/16384, 1.0/32768, 1.0/65536

275

};

276

277

for (unsigned i=0, sz=sizeof(msprs)/sizeof(float); i < sz; ++i) {

278

float real_pr = (usingLogPr)?(-log(msprs[i])):(msprs[i]);

279

float eff_pr = EffectivePr(real_pr);

280

if (pr_eff.find(eff_pr) == pr_eff.end()) {

281

pr_eff[eff_pr] = real_pr;

282

} else { // precision error cause non 1:1 mapping

283

pr_eff[eff_pr] = OriginalPr(eff_pr);

284

}

285

pr_values[eff_pr] = 0;

286

}

287

288

// Following bow value should not be grouped, or as milestone values.

289

static float msbows[] = {

290

1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,

291

0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,

292

0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001

293

};

294

295

for (unsigned i=0, sz=sizeof(msbows)/sizeof(float); i < sz; ++i) {

296

float real_bow = (usingLogPr)?(-log(msbows[i])):(msbows[i]);

297

float eff_bow = EffectiveBow(real_bow);

298

if (bow_eff.find(eff_bow) == bow_eff.end()) {

299

bow_eff[eff_bow] = real_bow;

300

} else { // two values map to same distance value due to precision error

301

bow_eff[eff_bow] = OriginalBow(eff_bow);

302

}

303

bow_values[eff_bow] = 0;

304

}

305

306

printf("\nCompressing pr values..."); fflush(stdout);

307

vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));

308

pr_values.clear();

309

itte = pr_table.end();

310

for (itt = pr_table.begin(); itt != itte; ++itt) {

311

*itt = OriginalPr(*itt);

312

assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));

313

assert(!usingLogPr || *itt > 0.0);

314

}

315

printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());

316

317

printf("\nCompressing bow values..."); fflush(stdout);

318

vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));

319

bow_values.clear();

320

itte = bow_table.end();

321

for (itt = bow_table.begin(); itt != itte; ++itt)

322

*itt = OriginalBow(*itt);

323

printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());

324

325

326

printf("\nThreading the new model..."); fflush(stdout);

327

for (int lvl=0; lvl < slm.getN(); ++lvl) {

328

levels[lvl] = new CThreadSlm::TNode[slm.getLevelSize(lvl)];

329

330

CSIMSlmWithIteration::TLevelIterator it;

331

slm.beginLevelIteration(lvl, it);

332

for (; !slm.isEnd(it); slm.next(it)) {

333

slm.getIdString(it, history);

334

if (history.size() == 0) {

335

slm.findBackOffState(lvl, NULL, bol, bon);

336

} else {

337

slm.findBackOffState(lvl, &history[0], bol, bon);

338

}

339

340

CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);

341

CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];

342

343

std::map<float, int>::iterator prit = pr_map.find(pn->pr);

344

if (prit == pr_map.end()) { // This would be cause by precision error

345

double val = EffectivePr(pn->pr);

346

val = OriginalPr(val);

347

prit = pr_map.find(val);

348

assert(prit != pr_map.end());

349

}

350

int idx_pr = prit->second;

351

nn.set_pr(idx_pr);

352

353

nn.set_wid(pn->id);

354

nn.set_bon(bon);

355

nn.set_bol(bol);

356

357

std::map<float, int>::iterator bowit = bow_map.find(pn->bow);

358

if (bowit == bow_map.end()) { // precision error

359

double val = EffectiveBow(pn->bow);

360

val = OriginalBow(val);

361

bowit = bow_map.find(val);

362

assert(bowit != bow_map.end());

363

}

364

int idx_bow = bowit->second;

365

nn.set_bow(idx_bow);

366

367

nn.set_ch(pn->child);

368

369

assert(usingLogPr || (pr_table[idx_pr] > 0.0 && pr_table[idx_pr] < 1.0));

370

assert(!usingLogPr || pr_table[idx_pr] > 0.0);

371

}

372

CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);

373

CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];

374

nn.set_ch(pn->child);

375

};

376

377

378

lastLevel = new CThreadSlm::TLeaf [slm.getLevelSize(slm.getN())];

379

CSIMSlmWithIteration::TLevelIterator it;

380

slm.beginLevelIteration(slm.getN(), it);

381

for (int lvl=slm.getN(); !slm.isEnd(it); slm.next(it)) {

382

CSIMSlm::TLeaf* pn = slm.getNodePtr(it);

383

slm.getIdString(it, history);

384

slm.findBackOffState(lvl, &history[0], bol, bon);

385

386

CThreadSlm::TLeaf& nn = lastLevel[it.m_history.back()];

387

388

std::map<float, int>::iterator prit = pr_map.find(pn->pr);

389

if (prit == pr_map.end()) { // This would be cause by precision error

390

double val = EffectivePr(pn->pr);

391

val = OriginalPr(val);

392

prit = pr_map.find(val);

393

assert(prit != pr_map.end());

394

}

395

int idx_pr = prit->second;

396

nn.set_pr(idx_pr);

397

398

nn.set_wid(pn->id);

399

nn.set_bon(bon);

400

nn.set_bol(bol);

401

}

402

403

404

printf("\nWriting out..."); fflush(stdout);

405

406

float dummy = 0.0;

407

fp = fopen(argv[2], "wb");

408

int N = slm.getN();

409

fwrite(&N, sizeof(int), 1, fp);

410

unsigned ulp = slm.isUseLogPr();

411

fwrite(&ulp, sizeof(unsigned), 1, fp);

412

413

for (int lvl = 0; lvl <= N; ++lvl) {

414

int len = slm.getLevelSize(lvl);

415

fwrite(&len, sizeof(int), 1, fp);

416

}

417

fwrite(&pr_table[0], sizeof(float), pr_table.size(), fp);

418

for (int i = pr_table.size(), sz=(1 << CThreadSlm::BITS_PR); i < sz; ++i)

419

fwrite(&dummy, sizeof(float), 1, fp);

420

421

fwrite(&bow_table[0], sizeof(float), bow_table.size(), fp);

422

for (int i = bow_table.size(), sz=(1 << CThreadSlm::BITS_BOW); i < sz; ++i)

423

fwrite(&dummy, sizeof(float), 1, fp);

424

425

for (int lvl=0; lvl < N; ++lvl)

426

fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(lvl), fp);

427

fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N), fp);

428

fclose(fp);

429

430

printf("done!\n"); fflush(stdout);

431

432

delete [] lastLevel;

433

for (int lvl=0; lvl < N; ++lvl)

434

delete []levels[lvl];

435

436

bow_values.clear();

437

bow_map.clear();

438

bow_table.clear();

439

440

pr_values.clear();

441

pr_map.clear();

442

pr_table.clear();

443

444

slm.Free();

445

446

447

return 0;

448

}

Older »