# Relative paths to the slm, lexicon and pinyin source directories.
# (None of these three variables is referenced in the visible part of this file.)
SLM_SRC_DIR = ../src/slm
LEXICON_SRC_DIR = ../src/lexicon
PINYIN_SRC_DIR = ../src/pinyin
# Dictionary and corpus inputs. CORPUS_DIR, SWAP_DIR and LMTARGET are not
# defined in the visible part of this file, so the assignments stay recursive
# (`=`) and are resolved at the point of use.
DICTFILE = ${CORPUS_DIR}/dict.utf8
# CORPUSFILE is the path the tools actually read; commands in this file
# re-create it with `ln -s` pointing at TEST_CORPUSFILE or REAL_CORPUSFILE.
CORPUSFILE = ${CORPUS_DIR}/corpus.utf8
TEST_CORPUSFILE = ${CORPUS_DIR}/test.utf8
REAL_CORPUSFILE = ${CORPUS_DIR}/BIGCORPUS
# IDS_FILE receives the redirected stdout of mmseg/slmseg and is the input
# of ids2ngram; SWAP_FILE is handed to ids2ngram through its -s option.
IDS_FILE = ${SWAP_DIR}/${LMTARGET}.ids
SWAP_FILE = ${SWAP_DIR}/swap
#FILE NAMES for BIGRAM model
# Data flow, as used by the commands in this file:
#   ids2ngram -> IDNGRAM_FILE -> slmbuild -> RAW_LM_FILE
#             -> slmprune -> SLM_FILE -> slmthread -> TSLM_FILE
# SLM_INFO_FILE / TSLM_INFO_FILE receive the stdout of slminfo / tslminfo.
# Only TSLM_FILE is placed under RESULT_DIR; everything else lives in SWAP_DIR.
IDNGRAM_FILE = ${SWAP_DIR}/${LMTARGET}.id2gram
RAW_LM_FILE = ${SWAP_DIR}/${LMTARGET}.2gram
SLM_FILE = ${SWAP_DIR}/${LMTARGET}.2gm
SLM_INFO_FILE = ${SWAP_DIR}/${LMTARGET}.2gm.arpa
TSLM_FILE = ${RESULT_DIR}/${LMTARGET}.t2g
TSLM_INFO_FILE = ${SWAP_DIR}/${LMTARGET}.t2g.arpa
#FILE NAMES for TRIGRAM model
# Data flow, as used by the commands in this file:
#   ids2ngram -> IDNGRAM_FILE3 -> slmbuild -> RAW_LM_FILE3
#             -> slmprune -> SLM_FILE3 -> slmthread -> TSLM_FILE3
# SLM_INFO_FILE3 / TSLM_INFO_FILE3 receive the stdout of slminfo / tslminfo.
# Only TSLM_FILE3 is placed under RESULT_DIR; everything else lives in SWAP_DIR.
IDNGRAM_FILE3 = ${SWAP_DIR}/${LMTARGET}.id3gram
RAW_LM_FILE3 = ${SWAP_DIR}/${LMTARGET}.3gram
SLM_FILE3 = ${SWAP_DIR}/${LMTARGET}.3gm
SLM_INFO_FILE3 = ${SWAP_DIR}/${LMTARGET}.3gm.arpa
TSLM_FILE3 = ${RESULT_DIR}/${LMTARGET}.t3g
TSLM_INFO_FILE3 = ${SWAP_DIR}/${LMTARGET}.t3g.arpa
# TSLM_REPACKED_FILE3 is the last argument of tslmpack (presumably its output);
# TSLM_UNPACKED_FILE3 receives the stdout of tslminfo run on that repacked file.
TSLM_REPACKED_FILE3 = ${SWAP_DIR}/${LMTARGET}.t3g.repacked
TSLM_UNPACKED_FILE3 = ${SWAP_DIR}/${LMTARGET}.t3g.arpa.unpacked
#Lexicon FILE names (raw resource and others)
# genpyt is invoked with -i PINYIN_TEXTFILE, -o PYTRIE_FILE and -l PYTRIE_PRINTOUT.
# PINYIN_TEXTFILE expands to the same path as DICTFILE (${CORPUS_DIR}/dict.utf8).
# PINYIN_NMP_TEXTFILE is not referenced in the visible part of this file.
PINYIN_TEXTFILE = ${CORPUS_DIR}/dict.utf8
PINYIN_NMP_TEXTFILE = ${SWAP_DIR}/dict_nmp.utf8
PYTRIE_FILE = ${RESULT_DIR}/pydict_sc.bin
PYTRIE_PRINTOUT = ${SWAP_DIR}/pydict_sc.log.utf8
# NOTE(review): the rule headers that own the commands below are not visible
# here, and the commands lack the leading TAB make requires for recipe lines;
# the bare numbers also look like extraction residue. Confirm against the
# original file before relying on this section.
#
# First pair: if ${CORPUSFILE} exists, unlink it, then re-create it as a
# symbolic link to the test corpus.
# NOTE(review): `[ -e ]` follows symlinks, so a dangling ${CORPUSFILE} link is
# not removed and the following `ln -s` would fail; consider also testing -L.
43
if [ -e ${CORPUSFILE} ]; then unlink ${CORPUSFILE}; fi
44
ln -s ${TEST_CORPUSFILE} ${CORPUSFILE}
# Second pair: same steps, but the link points at ${REAL_CORPUSFILE}.
47
if [ -e ${CORPUSFILE} ]; then unlink ${CORPUSFILE}; fi
48
ln -s ${REAL_CORPUSFILE} ${CORPUSFILE}
# Commands that produce ${IDS_FILE} from ${CORPUSFILE} and ${DICTFILE}.
# NOTE(review): their rule headers are not visible here; presumably these are
# the recipes of the `ids`, `slmids3` and `slmids` targets named by the
# bigram/trigram rules — confirm against the original file.
#
# Run mmseg over the corpus and redirect its stdout into ${IDS_FILE}.
51
./mmseg -d ${DICTFILE} -f bin -s 10 -a 9 ${CORPUSFILE} >${IDS_FILE}
# Run slmseg with the threaded trigram model passed via -m; stdout again
# overwrites ${IDS_FILE}.
54
./slmseg -d ${DICTFILE} -f bin -s 10 -m ${TSLM_FILE3} ${CORPUSFILE} >${IDS_FILE}
# Keep a copy of the trigram model as ${TSLM_FILE3}.normal (slmthread writes
# ${TSLM_FILE3} again later in the trigram pipeline).
55
cp ${TSLM_FILE3} ${TSLM_FILE3}.normal
# Same two steps using the threaded bigram model ${TSLM_FILE}.
58
./slmseg -d ${DICTFILE} -f bin -s 10 -m ${TSLM_FILE} ${CORPUSFILE} >${IDS_FILE}
59
cp ${TSLM_FILE} ${TSLM_FILE}.normal
# Bigram pipeline. `bs_bigram` and `bigram` are aggregate targets with the same
# m2_* steps; they differ only in the first prerequisite (`slmids` vs `ids`).
# NOTE(review): the headers of the individual m2_* rules are not visible here;
# the commands below presumably belong to them in the listed order — confirm.
61
#second round bootstrap bigram
62
bs_bigram : slmids m2_idngram m2_slm m2_prune m2_thread m2_tslminfo
64
#This is the command to make a bigram model
65
bigram : ids m2_idngram m2_slm m2_prune m2_thread m2_tslminfo
# ${IDS_FILE} -> ${IDNGRAM_FILE}; ${SWAP_FILE} is passed via -s.
68
./ids2ngram -n 2 -s ${SWAP_FILE} -o ${IDNGRAM_FILE} -p 5000000 ${IDS_FILE}
# ${IDNGRAM_FILE} -> ${RAW_LM_FILE}.
72
./slmbuild -n 2 -o ${RAW_LM_FILE} -w 120000 -c 0,2 -d ABS,0.005 -d ABS,0.6 -b 10 -e 9 ${IDNGRAM_FILE}
# ${RAW_LM_FILE} -> ${SLM_FILE}.
75
./slmprune ${RAW_LM_FILE} ${SLM_FILE} R 40000 100000
# ${SLM_FILE} -> ${TSLM_FILE} (the only bigram artifact under RESULT_DIR).
78
./slmthread ${SLM_FILE} ${TSLM_FILE}
# Dump tslminfo output for ${TSLM_FILE} into ${TSLM_INFO_FILE}.
# NOTE(review): unlike the trigram counterpart, this call has no -p flag —
# confirm whether that difference is intentional.
81
./tslminfo -v -l ${DICTFILE} ${TSLM_FILE} >${TSLM_INFO_FILE}
83
#Use this to generate bigram non-threaded lm arpa information if needed
85
./slminfo -p -v -l ${DICTFILE} ${SLM_FILE} >${SLM_INFO_FILE}
# Trigram pipeline. `bs_trigram` and `trigram` are aggregate targets with the
# same m3_* steps; they differ only in the first prerequisite (`slmids3` vs `ids`).
# NOTE(review): the headers of the individual m3_* rules (and of the tslmpack /
# slminfo commands) are not visible here — confirm against the original file.
87
#second round bootstrap to make trigram model
88
bs_trigram : slmids3 m3_idngram m3_slm m3_prune m3_thread m3_tslminfo
90
#This is the command to make a trigram model
91
trigram : ids m3_idngram m3_slm m3_prune m3_thread m3_tslminfo
# ${IDS_FILE} -> ${IDNGRAM_FILE3}; ${SWAP_FILE} is passed via -s.
94
./ids2ngram -n 3 -s ${SWAP_FILE} -o ${IDNGRAM_FILE3} -p 5000000 ${IDS_FILE}
# ${IDNGRAM_FILE3} -> ${RAW_LM_FILE3}.
98
./slmbuild -n 3 -o ${RAW_LM_FILE3} -w 120000 -c 0,2,2 -d ABS,0.0005 -d ABS -d ABS,0.6 -b 10 -e 9 ${IDNGRAM_FILE3}
# ${RAW_LM_FILE3} -> ${SLM_FILE3}.
101
./slmprune ${RAW_LM_FILE3} ${SLM_FILE3} R 100000 1250000 1000000
# ${SLM_FILE3} -> ${TSLM_FILE3} (the only trigram artifact under RESULT_DIR).
104
./slmthread ${SLM_FILE3} ${TSLM_FILE3}
# Dump tslminfo output for ${TSLM_FILE3} into ${TSLM_INFO_FILE3}.
107
./tslminfo -p -v -l ${DICTFILE} ${TSLM_FILE3} >${TSLM_INFO_FILE3}
# Feed that dump and the dictionary to tslmpack; the last argument is
# ${TSLM_REPACKED_FILE3} (presumably the output file).
110
./tslmpack ${TSLM_INFO_FILE3} ${DICTFILE} ${TSLM_REPACKED_FILE3}
# Dump the repacked model with tslminfo into ${TSLM_UNPACKED_FILE3}.
113
./tslminfo -p -v -l ${DICTFILE} ${TSLM_REPACKED_FILE3} >${TSLM_UNPACKED_FILE3}
115
#Use this to generate trigram non-threaded lm arpa information if needed
117
./slminfo -p -v -l ${DICTFILE} ${SLM_FILE3} >${SLM_INFO_FILE3}
119
#clean all intermediate files produced while building the model
# NOTE(review): the rule headers for the commands below are not visible here
# and the commands lack the recipe TAB — confirm against the original file.
#
# Cleanup: removes only the id-ngram and raw LM files of the bigram and
# trigram pipelines; ${IDS_FILE}, the pruned and the threaded models are kept.
123
rm -f ${IDNGRAM_FILE} ${RAW_LM_FILE}
124
rm -f ${IDNGRAM_FILE3} ${RAW_LM_FILE3}
# Lexicon build: run genpyt on ${PINYIN_TEXTFILE}, writing ${PYTRIE_FILE} and
# the log ${PYTRIE_PRINTOUT}. The two invocations differ only in the model
# passed via -s (trigram ${TSLM_FILE3} vs bigram ${TSLM_FILE}); both write the
# same output paths, so whichever runs last wins.
127
./genpyt -i ${PINYIN_TEXTFILE} -o ${PYTRIE_FILE} -l ${PYTRIE_PRINTOUT} -s ${TSLM_FILE3}
130
./genpyt -i ${PINYIN_TEXTFILE} -o ${PYTRIE_FILE} -l ${PYTRIE_PRINTOUT} -s ${TSLM_FILE}