3
# Licensed to the Apache Software Foundation (ASF) under one or more
4
# contributor license agreements. See the NOTICE file distributed with
5
# this work for additional information regarding copyright ownership.
6
# The ASF licenses this file to You under the Apache License, Version 2.0
7
# (the "License"); you may not use this file except in compliance with
8
# the License. You may obtain a copy of the License at
10
# http://www.apache.org/licenses/LICENSE-2.0
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
24
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
27
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+\.\d+/) {
28
print STDERR "Usage: $script_name -v <version>\n";
29
print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
33
# Locations of the Unicode Character Database files for the requested
# version.  The word break property and test files live in the "auxiliary"
# subdirectory.
my $url_prefix          = 'http://www.unicode.org/Public/' . $version . '/ucd';
my $scripts_url         = $url_prefix . '/Scripts.txt';
my $line_break_url      = $url_prefix . '/LineBreak.txt';
my $word_break_url      = $url_prefix . '/auxiliary/WordBreakProperty.txt';
my $word_break_test_url = $url_prefix . '/auxiliary/WordBreakTest.txt';

# The generated class and file names embed the version with every '.'
# replaced by '_' (e.g. 5.2.0 becomes 5_2_0).
(my $underscore_version = $version) =~ tr/./_/;
my $class_name      = 'WordBreakTestUnicode_' . $underscore_version;
my $output_filename = $class_name . '.java';
42
my $header =<<"__HEADER__";
43
package org.apache.lucene.analysis;
46
* Licensed to the Apache Software Foundation (ASF) under one or more
47
* contributor license agreements. See the NOTICE file distributed with
48
* this work for additional information regarding copyright ownership.
49
* The ASF licenses this file to You under the Apache License, Version 2.0
50
* (the "License"); you may not use this file except in compliance with
51
* the License. You may obtain a copy of the License at
53
* http://www.apache.org/licenses/LICENSE-2.0
55
* Unless required by applicable law or agreed to in writing, software
56
* distributed under the License is distributed on an "AS IS" BASIS,
57
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
58
* See the License for the specific language governing permissions and
59
* limitations under the License.
62
import org.junit.Ignore;
65
* This class was automatically generated by ${script_name}
66
* from: ${url_prefix}/auxiliary/WordBreakTest.txt
68
* WordBreakTest.txt indicates the points in the provided character sequences
69
* at which conforming implementations must and must not break words. This
70
* class tests for expected token extraction from each of the test sequences
71
* in WordBreakTest.txt, where the expected tokens are those character
72
* sequences bounded by word breaks and containing at least one character
73
* from one of the following character sets:
75
* \\p{Script = Han} (From $scripts_url)
76
* \\p{Script = Hiragana}
77
* \\p{LineBreak = Complex_Context} (From $line_break_url)
78
* \\p{WordBreak = ALetter} (From $word_break_url)
79
* \\p{WordBreak = Katakana}
80
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
81
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
84
public class ${class_name} extends BaseTokenStreamTestCase {
86
public void test(Analyzer analyzer) throws Exception {
90
# Seed the set of wanted code points with the full-width Arabic digits
# U+FF10..U+FF19.  A defined slot in the array marks a wanted code point.
# A statement-modifier loop is used because only the side effect is needed;
# map would build a result list that is then thrown away.
$codepoints->[$_] = 1 for (0xFF10 .. 0xFF19);

# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files.
parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
                        {'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
                        {'aletter' => 1, 'katakana' => 1, 'numeric' => 1});

# One element per line of WordBreakTest.txt; accepts LF and CRLF endings.
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
101
# The generated Java file is written alongside this script: $volume and
# $directory come from splitting $0.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

# Three-argument open keeps the mode separate from the file name, and the
# low-precedence 'or' applies die() to the result of open().  A
# high-precedence '||' here would bind to the (always true) file name
# string instead, so a failed open would go unreported.
open(OUT, '>', $output_path)
  or die "Error opening '$output_path' for writing: $!";
105
print STDERR "Writing '$output_path'...";
109
for my $line (@tests) {
110
next if ($line =~ /^\s*\#/);
111
# ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
112
my ($sequence) = $line =~ /^(.*?)\s*\#/;
113
print OUT " // $line\n";
114
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character
115
my $test_string = $sequence;
116
$test_string =~ s/\s*÷\s*/\\u/g;
117
$test_string =~ s/\s*×\s*/\\u/g;
118
$test_string =~ s/\\u000A/\\n/g;
119
$test_string =~ s/\\u000D/\\r/g;
120
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
122
for my $candidate (split /\s*÷\s*/, $sequence) {
124
my $has_wanted_char = 0;
125
while ($candidate =~ /([0-9A-F]+)/gi) {
127
unless ($has_wanted_char) {
128
$has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
131
if ($has_wanted_char) {
132
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
135
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
136
print OUT " new String[] { ";
137
print OUT join(", ", @tokens), " });\n\n";
142
print STDERR "done.\n";
145
# sub parse_Unicode_data_file
147
# Downloads the specified Unicode data file, parses it, and
148
# extracts code points assigned any of the given property values, defining
149
# the corresponding array position in the passed-in target array.
151
# Takes in the following parameters:
153
# - URL of the Unicode data file to download and parse
154
# - Reference to target array
155
# - Reference to hash of property values to get code points for
157
sub parse_Unicode_data_file {
160
my $wanted_property_values = shift;
161
my $content = get_URL_content($url);
162
print STDERR "Parsing '$url'...";
163
my @lines = split /\r?\n/, $content;
165
s/\s*#.*//; # Strip trailing comments
166
s/\s+$//; # Strip trailing space
167
next unless (/\S/); # Skip empty lines
168
my ($start, $end, $property_value);
169
if (/^([0-9A-F]{4,5})\s*;\s*(.+)/i) {
171
$start = $end = hex $1;
172
$property_value = lc $2; # Property value names are case-insensitive
173
} elsif (/^([0-9A-F]{4,5})..([0-9A-F]{4,5})\s*;\s*(.+)/i) {
174
# 0AE6..0AEF ; Gujarati
177
$property_value = lc $3; # Property value names are case-insensitive
181
if (defined($wanted_property_values->{$property_value})) {
182
for my $code_point ($start..$end) {
183
$target->[$code_point] = 1;
187
print STDERR "done.\n";
190
# sub get_URL_content
192
# Retrieves and returns the content of the given URL.
194
sub get_URL_content {
196
print STDERR "Retrieving '$url'...";
197
my $user_agent = LWP::UserAgent->new;
198
my $request = HTTP::Request->new(GET => $url);
199
my $response = $user_agent->request($request);
200
unless ($response->is_success) {
201
print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
204
print STDERR "done.\n";
205
return $response->content;