2
# Copyright © 2012, marmuta
4
# This file is part of Onboard.
6
# Onboard is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 3 of the License, or
9
# (at your option) any later version.
11
# Onboard is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
16
# You should have received a copy of the GNU General Public License
17
# along with this program. If not, see <http://www.gnu.org/licenses/>.
19
# Important directories:
21
# ./models/raw - Input directory containing large raw models with only
22
# minimal processing (utf-8 encoded). Due to their large
23
# size, often in the GB range, raw models aren't
24
# included with Onboard's source package.
26
# ./models - Output directory for final filtered system language models
31
# Each chunk below ends with a space so that successive += appends keep the
# locale ids separated in the space-delimited LANGUAGES list.
LANGUAGES+="en_US en_GB en_AU en_CA "
32
LANGUAGES+="de_DE de_CH de_AT "
36
LANGUAGES+="pt_PT pt_BR "
42
# All Dutch models are currently identical, but the spell checkers
43
# might have differences. Keep them for now so we can easily
44
# select all flavours in the language menu.
45
LANGUAGES+="nl_NL nl_BE nl_AW nl_AN nl_SR "
53
# Directory the raw input models are read from (used to build MODEL_IN).
# NOTE(review): the header comment names ./models/raw as the input
# directory, while this points at ../training/raw_models - confirm which
# one is current.
RAWMODELDIR="../training/raw_models"
56
# Training tool; not referenced in the visible part of this script.
TRAIN_CMD=Onboard/pypredict/tools/train
57
# Filter tool invoked further down with MODEL_IN and MODEL_OUT.
FILTER_CMD=Onboard/pypredict/tools/filter
67
Usage: $(basename "$0") [-v] [languages...]
68
Script to create language models.
70
-v More verbose output
77
# process command line arguments
78
while getopts "v" opt; do
89
# drop the options consumed by getopts, leaving only positional arguments
shift "$((OPTIND - 1))"
92
# get languages from positional parameters
97
if [ "${lang}" == "${lang_id}" ]; then
99
# expand language to full lang_country id
100
for l in ${LANGUAGES}; do
102
if [ "${lang_id}" == "${lid}" ]; then
103
langs="${langs} ${l}"
107
# append language as is
108
langs="${langs} ${lang}"
115
# make sure the directories exist
116
# -p also creates missing parents; quoting keeps paths with spaces intact.
[ -d "$MODELDIR" ] || mkdir -p -- "$MODELDIR"
118
# filter language models
119
for lang in $LANGUAGES; do
121
MODEL_IN="$RAWMODELDIR/$lang_id.$MODELEXT.sorted-pruned"
122
MODEL_OUT="$MODELDIR/$lang.$MODELEXT"
124
echo "Building '$MODEL_OUT'..."
127
REGEX_DROP_UNIGRAM="\'s$|^\S$"
128
REGEX_DROP_NGRAM="^\w{1}\ | \s\S{1,3}(?:\s|$)"
135
REGEX_DROP_UNIGRAM="\'s$|^[^IaA]$"
136
NAME_EXCEPTIONS=Union,Kingdom,Nations,New,Barack,Obama,Bush,South,West,North,East,Southern,Western,Northern,Eastern,Mean,Standard,Coast,Time,Grand,Central,Station,Large,Cloud,Collider,Psychology,Geographic \
142
MAX_LC_UC_RATIO=300.0
147
MAX_LC_UC_RATIO=150.0
173
PRUNE_FREQ=199,199,-1
210
echo "Unsupported language $lang, Aborting" >&2
215
if [ ! "${NAME_EXCEPTIONS}" == "" ]; then
216
REMAINING_OPTIONS="${REMAINING_OPTIONS} -x ${NAME_EXCEPTIONS}"
219
$FILTER_CMD -p "${PRUNE_FREQ}" \
220
-r "${REGEX_DROP_UNIGRAM}" \
221
-n "${REGEX_DROP_NGRAM}" \
225
-i ${MAX_LC_UC_RATIO} \
226
${REMAINING_OPTIONS} \
227
--save-sorted "$MODEL_IN" "$MODEL_OUT"