2
###############################################################################
3
# This software is being provided to you, the LICENSEE, by the Linguistic #
4
# Data Consortium (LDC) and the University of Pennsylvania (UPENN) under the #
5
# following license. By obtaining, using and/or copying this software, you #
6
# agree that you have read, understood, and will comply with these terms and #
9
# Permission to use, copy, modify and distribute, including the right to #
10
# grant others the right to distribute at any tier, this software and its #
11
# documentation for any purpose and without fee or royalty is hereby granted, #
12
# provided that you agree to comply with the following copyright notice and #
13
# statements, including the disclaimer, and that the same appear on ALL #
14
# copies of the software and documentation, including modifications that you #
15
# make for internal use or for distribution: #
17
# Copyright 1999 by the University of Pennsylvania. All rights reserved. #
19
# THIS SOFTWARE IS PROVIDED "AS IS"; LDC AND UPENN MAKE NO REPRESENTATIONS OR #
20
# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
21
# LDC AND UPENN MAKE NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR #
22
# FITNESS FOR ANY PARTICULAR PURPOSE. #
23
###############################################################################
24
# mansegment.perl Version 1.1
25
# Run as: mansegment.perl [dictfile] < infile > outfile
26
# A Chinese segmenter for both GB and BIG5 as long as the corresponding
27
# word frequency dictionary is used.
29
# Written by Zhibiao Wu at LDC on April 12 1999
30
# Modified by Xiaoyi Ma at LDC, March, 2003
33
# - regenerated database to be compatible with perl5
35
# Algorithm: Dynamic programming to find the path which has the highest
36
# product of word probabilities; the next word is selected from the longest
39
# dictfile is a two column text file, first column is the frequency,
40
# second column is the word. The program will change the file into a dbm
41
# file in the first run. So be sure to remove the dbm file if you have a
42
# newer version of the text file.
43
##############################################################################
45
binmode(STDIN, ":utf8");
46
binmode(STDOUT, ":utf8");
47
binmode(STDERR, ":utf8");
48
# Step size, in characters, used by the substr() calls that walk the text.
# It is 1 because STDIN and the dictionary file are read through :utf8
# layers, so substr() counts decoded characters rather than bytes.
$wd = 1; # width of a character
49
$| = 1; # disable Perl output buffering
55
$DICTPATH = $1 if ( $0 =~ /(.+)\/[^\/]+/ );
63
$dictfile = "$DICTPATH/Mandarin.fre.utf8";
66
#$dict_db = $dictfile.".db";
71
# Read the frequency dictionary into an associative array.
75
# read in Mandarin files.
82
$lineLen = length($thisLine);
84
while($index<$lineLen){
85
$c = substr($thisLine, $index, $wd);
88
$code = unpack("U", $c);
92
} elsif ($code >= hex('3000') && $code <= hex('9FFF')) {
94
$newline = $newline . " " . $c;
96
$newline = $newline . $c;
101
$newline = $newline . " " . $c;
111
@segment = split(/\s+/,$_);
123
return if ($sentence eq "");
125
if ($sentence =~ /^[\x00-\xFF]+$/) {
130
print STDERR "Input: $sentence\n" if $trace;
138
$len = length($sentence);
140
# Take out the top most path in the stack and extend that path
141
# into several new paths, and put those paths into the stack.
142
while (($top != -1) &&
143
(!(($position{$top} == $len) && ($next{$top} == -1)))) {
145
#print STDERR "$. $result{$top}\n";
147
# find the first open path
150
while (($current != -1 ) && ($position{$current} == $len)) {
152
$current = $next{$current};
156
if ($current == $top) {
159
$next{$father} = $next{$current};
162
if ($current == -1) {
163
# no open path, finished, take the first path
166
$firstword = substr($sentence, $position{$current}, $wd);
168
$i = $freq{"m,$firstword"};
169
if ($i > $len - $position{$current}) {
170
$i = $len - $position{$current};
177
$word = substr($sentence, $position{$current}, $i);
179
# If you want to add algorithmic segments you can do it like so:
180
#$digit0 = "○|零|一|二|三|四|五|六|七|八|九";
181
#$digit1 = "一|二|三|四|五|六|七|八|九";
183
#if ($word =~ /^((($digit1)千)?(($digit1)百)?(($digit1)十)?($digit1)?|十($digit1))$/) {
188
$freq{$word} = 1; # single character always counts as a word
203
print STDERR "Error: $. $sentence\n";
207
foreach $k (sort {$a <=> $b} (keys %result)) {
208
print STDERR "$k $result{$k}\n";
212
$result{$top} =~ s/^ *//g;
222
$value{$nextid} = $value{$current} * $freq{$word} / $freq{total};
223
$result{$nextid} = $result{$current} . " " . $word;
224
$position{$nextid} = $position{$current} + $i;
226
# check to see whether there is duplicated path
227
# if there is a duplicate path, remove the path with the smaller value
231
while ($index != -1) {
232
if ($position{$index} == $position{$nextid}) {
233
if ($value{$index} >= $value{$nextid}) {
236
if ($top == $index) {
237
$next{$nextid} = $next{$index};
241
$next{$father} = $next{$index};
247
$index = $next{$index};
253
# insert the new path into the list
254
if ($needInsert == 1) {
256
while (($index != -1) && ($value{$index} > $value{$nextid})) {
258
$index = $next{$index};
260
if ($top == $index) {
261
$next{$nextid} = $top;
264
$next{$father} = $nextid;
265
$next{$nextid} = $index;
274
# Open the frequency dictionary for reading as UTF-8 text.  The low-precedence
# `or` is required here: with `||` the die() was attached to the "$dictfile"
# string (a true value for any ordinary path), so a failed open() was never
# reported.  $! is appended so the real cause (missing file, permissions, ...)
# is visible.
open(F, "<:utf8", $dictfile) or die "Dictonary file $dictfile not found: $!";
279
$freq{$_[1]} = $_[0];
280
$header = substr($_[1],0,$wd);
281
if ($freq{"m,$header"}) {
282
if ($freq{"m,$header"} < length($_[1])) {
283
$freq{"m,$header"} = length($_[1]);
286
$freq{"m,$header"} = length($_[1]);
288
$freq{total} += $_[0];