2
# Licensed to the Apache Software Foundation (ASF) under one or more
3
# contributor license agreements. See the NOTICE file distributed with
4
# this work for additional information regarding copyright ownership.
5
# The ASF licenses this file to You under the Apache License, Version 2.0
6
# (the "License"); you may not use this file except in compliance with
7
# the License. You may obtain a copy of the License at
9
# http://www.apache.org/licenses/LICENSE-2.0
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
18
# Tokenizes Khmer text, emitting each orthographic syllable as one token.
20
# The definition of a Khmer orthographic syllable is taken from the Unicode Standard.
22
# Character classes from which a Khmer orthographic syllable is assembled.
# Each class is tagged with a one-letter code (B, R, C, S, V, Z, O).
#
# B = base character (consonant, independent vowel, etc)
23
$KhmerBase = [\u1780-\u17B3];
25
# R = robat (U+17CC KHMER SIGN ROBAT)
$KhmerRobat = [\u17CC];
26
# C = consonant shifter
27
$KhmerShifter = [\u17C9\u17CA];
28
# S = subscript consonant or independent vowel sign
#     (U+17D2 KHMER SIGN COENG followed by one base character)
29
$KhmerSub = ([\u17D2] $KhmerBase);
30
# V = dependent vowel sign
31
$KhmerVowel = [\u17B4-\u17C5];
32
# Z = zero-width joiner or non-joiner
#     (U+200C ZERO WIDTH NON-JOINER, U+200D ZERO WIDTH JOINER)
33
$KhmerZWC = [\u200C\u200D];
35
# O = any other sign
#     The ranges deliberately skip U+17C9, U+17CA and U+17CC, which are
#     already covered by $KhmerShifter and $KhmerRobat, and U+17D2, which
#     is the subscript marker used in $KhmerSub.
$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
37
# Characters whose Line_Break property is Word_Joiner. They are used below
# to glue adjacent syllables together into a single token.
$WordJoin = [:Line_Break=Word_Joiner:];
39
# One orthographic syllable, in order:
#   1. a base character                                   (required)
#   2. a robat or a consonant shifter                     (optional)
#   3. subscripts, each optionally followed by a robat    (zero or more)
#   4. a dependent vowel, optionally preceded by ZWJ/ZWNJ (optional)
#   5. one other sign                                     (optional)
#   6. a trailing subscript                               (optional)
$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
41
# One syllable, optionally extended by further syllables; each additional
# syllable must be introduced by exactly one $WordJoin character.
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
44
# default numerical definitions
#
# Character classes selected by their Unicode Word_Break property value.
# These are the building blocks of the numeric rule at the end of the file.
46
$Extend = [\p{Word_Break = Extend}];
47
$Format = [\p{Word_Break = Format}];
48
$MidNumLet = [\p{Word_Break = MidNumLet}];
49
$MidNum = [\p{Word_Break = MidNum}];
50
$Numeric = [\p{Word_Break = Numeric}];
51
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
52
# "Ex" variants: the class itself followed by any number of Extend or Format
# characters, so that those trailing characters stay attached to the
# character they follow instead of causing a break.
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
53
$MidNumEx = $MidNum ($Extend | $Format)*;
54
$NumericEx = $Numeric ($Extend | $Format)*;
55
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
58
# Khmer rule: a (possibly word-joiner-connected) run of syllables forms one
# token. {200} is the rule status tag reported for boundaries produced by
# this rule; in ICU's word-break status ranges 200 is UBRK_WORD_LETTER.
$KhmerJoinedSyllableEx {200};
60
# default numeric rules
#
# A number: one Numeric character (optionally followed by an ExtendNumLet
# character), then any number of further such groups, each optionally
# preceded by a single MidNum or MidNumLet separator. {100} is the rule
# status tag; in ICU's word-break status ranges 100 is UBRK_WORD_NUMBER.
61
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};