/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Author: marmuta <marmvta@gmail.com>
*/
#ifndef LM_DYNAMIC_KN_H
#define LM_DYNAMIC_KN_H

#include "lm_dynamic.h"

//------------------------------------------------------------------------
// BeforeLastNodeKN - second to last node of the ngram trie, bigram for order 3
//------------------------------------------------------------------------
template <class TBASE>
class BeforeLastNodeKNBase : public TBASE
{
    public:
        BeforeLastNodeKNBase(WordId wid = (WordId)-1)
        : TBASE(wid), N1pxr(0)
        {}

        int get_N1pxr() {return N1pxr;}

    public:
        uint32_t N1pxr;   // number of word types wid-n+1 that precede
                          // wid-n+2..wid in the training data
};

//------------------------------------------------------------------------
// TrieNodeKN - node for all lower levels of the ngram trie, unigrams for order 3
//------------------------------------------------------------------------
template <class TBASE>
class TrieNodeKNBase : public TBASE
{
    public:
        TrieNodeKNBase(WordId wid = (WordId)-1)
        : TBASE(wid), N1pxr(0), N1pxrx(0)
        {}

        int get_N1pxrx() {return N1pxrx;}

    public:
        // Naming scheme:
        // N1p: number of word types with count>=1 (1p = "one plus")
        // x:   free running variable over all word types wi
        // r:   remainder, the remaining part of the full ngram
        uint32_t N1pxr;   // number of word types wi-n+1 that precede
                          // wi-n+2..wi in the training data
        uint32_t N1pxrx;  // number of permutations around the center part
};
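
// For illustration (not part of the model itself): with order 3 and the
// training lines "we saw whales" and "we saw gulls", the unigram node for
// "saw" ends up with N1pxr = 1 (only the word type "we" ever precedes
// "saw") and N1pxrx = 2 (two distinct trigrams, "we saw whales" and
// "we saw gulls", have "saw" as their center part).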

//------------------------------------------------------------------------
// NGramTrieKN - root node of the ngram trie
//------------------------------------------------------------------------
template <class TNODE, class TBEFORELASTNODE, class TLASTNODE>
class NGramTrieKN : public NGramTrie<TNODE, TBEFORELASTNODE, TLASTNODE>
{
    public:
        typedef NGramTrie<TNODE, TBEFORELASTNODE, TLASTNODE> Base;

        NGramTrieKN(WordId wid = (WordId)-1)
        : Base(wid)
        {}

        int increment_node_count(BaseNode* node, const WordId* wids, int n,
                                 int increment);
        int get_N1pxr(BaseNode* node, int level);
        int get_N1pxrx(BaseNode* node, int level);

        void get_probs_kneser_ney_i(const std::vector<WordId>& history,
                                    const std::vector<WordId>& words,
                                    std::vector<double>& vp,
                                    int num_word_types,
                                    const std::vector<double>& Ds);
};

// Add increment to node->count and incrementally update the Kneser-Ney counts
template <class TNODE, class TBEFORELASTNODE, class TLASTNODE>
int NGramTrieKN<TNODE, TBEFORELASTNODE, TLASTNODE>::
    increment_node_count(BaseNode* node, const WordId* wids, int n,
                         int increment)
{
    // Kneser-Ney counts change only the first time each ngram is seen.
    if (increment && node->count == 0)
    {
        // get/add node for ngram (wids) excluding the predecessor
        // ex: ngram = ["We", "saw"] -> wxr = ["saw"] with predecessor "We"
        // Predecessors exist for unigrams and above; the predecessors of
        // unigrams are all unigrams. In that case use the root to store N1pxr.
        std::vector<WordId> wxr(wids+1, wids+n);
        BaseNode* nd = this->add_node(wxr);
        if (!nd)
            return -1;
        ((TBEFORELASTNODE*)nd)->N1pxr++; // count number of word types wid-n+1
                                         // that precede wid-n+2..wid in the
                                         // training data

        // get/add node for ngram (wids) excluding predecessor and successor
        // ex: ngram = ["We", "saw", "whales"] -> wxrx = ["saw"]
        //     with predecessor "We" and successor "whales"
        // Predecessors and successors exist for bigrams and above. wxrx is
        // an empty vector for bigrams; in that case use the root to store
        // N1pxrx.
        if (n >= 2)
        {
            std::vector<WordId> wxrx(wids+1, wids+n-1);
            BaseNode* nd = this->add_node(wxrx);
            if (!nd)
                return -1;
            ((TNODE*)nd)->N1pxrx++; // count number of permutations around
                                    // the center part wid-n+2..wid-1
        }
    }

    return Base::increment_node_count(node, wids, n, increment);
}
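
// For illustration: counting ["We", "saw", "whales"] for the first time,
// besides incrementing the trigram's own count in the base class, bumps
// N1pxr of node ["saw", "whales"] ("We" is a new predecessor) and N1pxrx
// of node ["saw"] (a new permutation "We" .. "whales" around it).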

template <class TNODE, class TBEFORELASTNODE, class TLASTNODE>
int NGramTrieKN<TNODE, TBEFORELASTNODE, TLASTNODE>::
    get_N1pxr(BaseNode* node, int level)
{
    if (level == this->order)
        return 0;  // last nodes don't store N1pxr
    if (level == this->order - 1)
        return static_cast<TBEFORELASTNODE*>(node)->N1pxr;
    return static_cast<TNODE*>(node)->N1pxr;
}

template <class TNODE, class TBEFORELASTNODE, class TLASTNODE>
int NGramTrieKN<TNODE, TBEFORELASTNODE, TLASTNODE>::
    get_N1pxrx(BaseNode* node, int level)
{
    if (level == this->order)
        return 0;  // last nodes don't store N1pxrx
    if (level == this->order - 1)
        return 0;  // before-last nodes don't store N1pxrx either
    return static_cast<TNODE*>(node)->get_N1pxrx();
}
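
// The probability computation below is interpolated Kneser-Ney, evaluated
// iteratively from order 0 (uniform) up to order n:
//
//   p(w | h) = max(c - D, 0) / s  +  (D * N1prx(h) / s) * p(w | h')
//
// where h' drops the first word of h and D = Ds[j] is the per-order
// discount. At the highest order c is the raw ngram count and s the total
// count of the history; at lower orders c = N1pxr (continuation count)
// and s = N1pxrx.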

// Kneser-Ney smoothed probabilities
template <class TNODE, class TBEFORELASTNODE, class TLASTNODE>
void NGramTrieKN<TNODE, TBEFORELASTNODE, TLASTNODE>::
    get_probs_kneser_ney_i(const std::vector<WordId>& history,
                           const std::vector<WordId>& words,
                           std::vector<double>& vp,
                           int num_word_types,
                           const std::vector<double>& Ds)
{
    // only a fixed history size is allowed; don't remove unknown words
    // from the history, mark them with UNKNOWN_WORD_ID instead.
    ASSERT((int)history.size() == this->order-1);

    int i, j;
    int n = history.size() + 1;
    int size = words.size();        // number of candidate words
    std::vector<int32_t> vc(size);  // vector of counts, reused for orders 1..n

    // order 0: uniform distribution over all word types
    fill(vp.begin(), vp.end(), 1.0/num_word_types);

    // orders 1..n
    for(j=0; j<n; j++)
    {
        std::vector<WordId> h(history.begin()+(n-j-1), history.end()); // tmp history
        BaseNode* hnode = this->get_node(h);
        if (!hnode)
            break;

        int N1prx = this->get_N1prx(hnode, j); // number of word types following the history
        if (!N1prx) // break early, don't reset probabilities to 0
            break;  // for unknown histories

        // orders 1..n-1
        if (j < n-1)
        {
            // Exclude children without predecessors from the count of
            // successors. This corrects normalization errors for the case
            // that the language model wasn't trained from a single
            // continuous stream of tokens, i.e. some tokens don't have
            // successors. This happens by default with the predefined
            // control words <unk>, <s>, ..., but can also happen when
            // incrementally adding text fragments to a language model.
            int num_children = this->get_num_children(hnode, j);
            for(i=0; i<num_children; i++)
            {
                // children here may be of type TrieNode or BeforeLastNode;
                // play safe and cast to the latter.
                TBEFORELASTNODE* child = static_cast<TBEFORELASTNODE*>
                                         (this->get_child_at(hnode, j, i));

                if (child->get_N1pxr() == 0) // no predecessors?
                    N1prx--; // exclude it from the count of successors
            }
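
            // For instance, the sentence-start marker <s> is usually never
            // preceded by another token, so a child node for <s> is
            // excluded from N1prx here.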

            // number of permutations around history h
            int N1pxrx = get_N1pxrx(hnode, j);
            if (N1pxrx)
            {
                // get the number of word types seen to precede history h
                if (h.size() == 0) // empty history?
                {
                    // We're at the root, which has many children: all
                    // unigrams, to be exact. So the number of child nodes
                    // is >= the number of candidate words.
                    // Luckily a child's word_id can be looked up directly
                    // in the unigrams because those are always sorted by
                    // word_id as well. -> take that shortcut for the root.
                    for(i=0; i<size; i++)
                    {
                        //printf("%d %d %d %d %d\n", size, j, i, words[i], (int)ngrams.children.size());
                        TNODE* node = static_cast<TNODE*>(this->children[words[i]]);
                        vc[i] = node->N1pxr;
                    }
                }
                else
                {
                    // We're at some level > 0, where there are very likely
                    // far fewer child nodes than candidate words; everything
                    // from bigrams up has in all likelihood only a few
                    // children. -> Turn the algorithm around and search for
                    // the child nodes among the candidate words.
                    fill(vc.begin(), vc.end(), 0);
                    int num_children = this->get_num_children(hnode, j);
                    for(i=0; i<num_children; i++)
                    {
                        // children here may be of type TrieNode or
                        // BeforeLastNode; play safe and cast to the latter.
                        TBEFORELASTNODE* child = static_cast<TBEFORELASTNODE*>
                                             (this->get_child_at(hnode, j, i));

                        // the candidate words have to be sorted by word_id
                        int index = binsearch(words, child->word_id);
                        if (index >= 0)
                            vc[index] = child->N1pxr;
                    }
                }

                double D = Ds[j];
                double l1 = D / float(N1pxrx) * N1prx; // normalization factor
                for(i=0; i<size; i++)
                {
                    double a = vc[i] - D;
                    if (a < 0)
                        a = 0;
                    vp[i] = a / N1pxrx + l1 * vp[i];
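                    // Interpolation step for the lower orders:
                    // vp[i] = max(N1pxr - D, 0) / N1pxrx
                    //         + (D * N1prx / N1pxrx) * vp[i],
                    // i.e. the discounted continuation probability plus the
                    // weighted estimate of the next lower order.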
                }
            }
        }
        // order n
        else
        {
            // total number of occurrences of the history
            int cs = this->sum_child_counts(hnode, j);
            if (cs)
            {
                // get the ngram counts
                fill(vc.begin(), vc.end(), 0);
                int num_children = this->get_num_children(hnode, j);
                for(i=0; i<num_children; i++)
                {
                    BaseNode* child = this->get_child_at(hnode, j, i);
                    int index = binsearch(words, child->word_id); // candidate words have to be sorted by word_id
                    if (index >= 0)
                        vc[index] = child->get_count();
                }

                double D = Ds[j];
                double l1 = D / float(cs) * N1prx; // normalization factor
                for(i=0; i<size; i++)
                {
                    double a = vc[i] - D;
                    if (a < 0)
                        a = 0;
                    vp[i] = a / float(cs) + l1 * vp[i];
                }
            }
        }
    }
}

//------------------------------------------------------------------------
// DynamicModelKN - dynamically updatable language model with Kneser-Ney support
//------------------------------------------------------------------------
template <class TNGRAMS>
class _DynamicModelKN : public _DynamicModel<TNGRAMS>
{
    public:
        typedef _DynamicModel<TNGRAMS> Base;

        static const Smoothing DEFAULT_SMOOTHING = KNESER_NEY_I;

        _DynamicModelKN()
        {
            this->smoothing = DEFAULT_SMOOTHING;
        }

        virtual std::vector<Smoothing> get_smoothings()
        {
            std::vector<Smoothing> smoothings = Base::get_smoothings();
            smoothings.push_back(KNESER_NEY_I);
            return smoothings;
        }

        virtual void get_node_values(BaseNode* node, int level,
                                     std::vector<int>& values)
        {
            Base::get_node_values(node, level, values);
            values.push_back(this->ngrams.get_N1pxrx(node, level));
            values.push_back(this->ngrams.get_N1pxr(node, level));
        }

        virtual void get_probs(const std::vector<WordId>& history,
                               const std::vector<WordId>& words,
                               std::vector<double>& probabilities);

        virtual int increment_node_count(BaseNode* node, const WordId* wids,
                                         int n, int increment)
        {return this->ngrams.increment_node_count(node, wids, n, increment);}
};

typedef _DynamicModelKN<NGramTrieKN<TrieNode<TrieNodeKNBase<BaseNode> >,
                                    BeforeLastNode<BeforeLastNodeKNBase<BaseNode>,
                                                   LastNode<BaseNode> >,
                                    LastNode<BaseNode> > > DynamicModelKN;
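
// Minimal usage sketch (illustrative only; in practice the word ids come
// from the dictionary handling in lm_dynamic.h, and the candidate ids must
// be sorted ascending for the binsearch above):
//
//   DynamicModelKN model;
//   std::vector<WordId> history;      // e.g. the ids of ["we", "saw"]
//   std::vector<WordId> words;        // candidate word ids, sorted by id
//   std::vector<double> probabilities(words.size());
//   model.get_probs(history, words, probabilities);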

// Calculate a vector of probabilities for the ngrams formed
// from history + word[i], for all i.
// input:  a constant history and a vector of candidate words
// output: a vector of probabilities, one value per candidate word
template <class TNGRAMS>
void _DynamicModelKN<TNGRAMS>::get_probs(const std::vector<WordId>& history,
                                         const std::vector<WordId>& words,
                                         std::vector<double>& probabilities)
{
    // pad/cut the history so that it is always of length order-1
    int n = std::min((int)history.size(), this->order-1);
    std::vector<WordId> h(this->order-1, UNKNOWN_WORD_ID);
    copy_backward(history.end()-n, history.end(), h.end());
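    // Example: with order 3, history ["we", "saw", "whales"] is cut to
    // h = ["saw", "whales"], while history ["whales"] is padded to
    // h = [UNKNOWN_WORD_ID, "whales"].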

    switch(this->smoothing)
    {
        case KNESER_NEY_I:
            this->ngrams.get_probs_kneser_ney_i(h, words, probabilities,
                                                this->get_num_word_types(),
                                                this->Ds);
            break;

        default:
            Base::get_probs(history, words, probabilities);
            break;
    }
}

#endif