5
# generates in the current directory:
8
# - unicode-nameslist.h
10
# - unicode-categories.h
12
# - unicode-versions.h
14
# usage: ./gen-guch-unicode-tables.pl UNICODE-VERSION DIRECTORY
15
# where DIRECTORY contains UnicodeData.txt Unihan_Readings.txt.bz2 NamesList.txt Blocks.txt Scripts.txt DerivedAge.txt
17
# NOTE! Some code copied from glib/glib/gen-unicode-tables.pl; keep in sync!
20
# Package globals naming the external helper commands.
use vars ('$UNZIP', '$ICONV');
22
# if these things aren't in your path you can put full paths to them here
26
# Forward declarations: each processor takes a single scalar argument
# (the path of its input file) and is defined further down in this file.
sub process_unicode_data_txt ($);
27
sub process_unihan_zip ($);
28
sub process_nameslist_txt ($);
29
sub process_blocks_txt ($);
30
sub process_scripts_txt ($);
31
sub process_versions_txt ($);
33
$| = 1; # flush stdout buffer
40
Usage: $0 UNICODE-VERSION DIRECTORY
42
DIRECTORY should contain the following Unicode data files:
43
UnicodeData.txt Unihan_Readings.txt.bz2 NamesList.txt Blocks.txt Scripts.txt
45
which can be found at http://www.unicode.org/Public/UNIDATA/
50
# Paths of the input data files; filled in by scanning the data directory.
my ($unicodedata_txt, $unihan_zip, $nameslist_txt, $blocks_txt, $scripts_txt, $versions_txt);
54
opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
55
# Pick each input file by name pattern.  The patterns are unanchored, and
# when several directory entries match the same pattern the one returned
# last by readdir is kept.
for my $f (readdir ($dir))
57
$unicodedata_txt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/);
58
$unihan_zip = "$d/$f" if ($f =~ /Unihan_Readings\.txt\.bz2/);
59
$nameslist_txt = "$d/$f" if ($f =~ /NamesList.*\.txt/);
60
$blocks_txt = "$d/$f" if ($f =~ /Blocks.*\.txt/);
61
$scripts_txt = "$d/$f" if ($f =~ /Scripts.*\.txt/);
62
$versions_txt = "$d/$f" if ($f =~ /DerivedAge.*\.txt/);
65
# Abort before generating anything if a required input file is missing.
defined $unicodedata_txt or die "Did not find $d/UnicodeData.txt";
66
defined $unihan_zip or die "Did not find $d/Unihan_Readings.txt.bz2";
67
defined $nameslist_txt or die "Did not find $d/NamesList.txt";
68
defined $blocks_txt or die "Did not find $d/Blocks.txt";
69
defined $scripts_txt or die "Did not find $d/Scripts.txt";
70
defined $versions_txt or die "Did not find $d/DerivedAge.txt";
72
# Generate one header per input file; the Unihan data is processed last.
process_unicode_data_txt ($unicodedata_txt);
73
process_nameslist_txt ($nameslist_txt);
74
process_blocks_txt ($blocks_txt);
75
process_scripts_txt ($scripts_txt);
76
process_versions_txt ($versions_txt);
77
process_unihan_zip ($unihan_zip);
82
#------------------------#
84
# process_unicode_data_txt (PATH)
#
# Reads the UnicodeData file at PATH twice.  The first pass writes
# unicode-names.h: a table of character-name strings plus an array mapping
# each codepoint to the offset of its name in that table.  The second pass
# writes unicode-categories.h: codepoint ranges paired with a
# G_UNICODE_* general-category constant.  Both files are created in the
# current directory.
#
# NOTE(review): the opens below use the two-argument form and a bare
# "die", so a failure reports neither the file name nor $!.
sub process_unicode_data_txt ($)
86
my ($unicodedata_txt) = @_;
90
open (my $unicodedata, $unicodedata_txt) or die;
91
open (my $out, "> unicode-names.h") or die;
93
print "processing $unicodedata_txt...";
95
print $out "/* unicode-names.h */\n";
96
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
97
print $out "/* Generated by $0 */\n";
98
print $out "/* Generated from UCD version $v */\n\n";
100
print $out "#ifndef UNICODE_NAMES_H\n";
101
print $out "#define UNICODE_NAMES_H\n\n";
103
print $out "#include <glib.h>\n\n";
104
print $out "#include <glib/gi18n-lib.h>\n\n";
109
while (my $line = <$unicodedata>)
112
# Capture the first two ';'-separated fields; a line that lacks them
# aborts the script.
$line =~ /^([^;]+);([^;]+)/ or die;
118
push @unicode_pairs, [$hex, $name];
121
print $out "static const char unicode_names_strings[] = \\\n";
125
# Emit every distinct name once, in sorted order, and record in %names the
# byte offset at which it starts (each entry is the name plus a NUL).
foreach my $name (sort keys %names) {
126
print $out " \"$name\\0\"\n";
127
$names{$name} = $offset;
128
$offset += length($name) + 1;
135
print $out "typedef struct _UnicodeName UnicodeName;\n\n";
137
print $out "static const struct _UnicodeName\n";
139
print $out " gunichar index;\n";
140
print $out " guint32 name_offset;\n";
142
print $out "unicode_names[] =\n";
147
# One { codepoint, name offset } entry per pair collected above.
foreach my $pair (@unicode_pairs) {
154
my ($hex, $name) = @{$pair};
155
my $offset = $names{$name};
156
print $out " {0x$hex, $offset}";
159
print $out "\n};\n\n";
162
static inline const char * unicode_name_get_name(const UnicodeName *entry)
164
guint32 offset = entry->name_offset;
165
return unicode_names_strings + offset;
170
print $out "#endif /* #ifndef UNICODE_NAMES_H */\n";
173
undef @unicode_pairs;
175
close ($unicodedata);
180
# Second pass over the same input, this time for the category table.
open ($unicodedata, $unicodedata_txt) or die;
181
open ($out, "> unicode-categories.h") or die;
183
# Map general category code onto symbolic name.
187
'Lu' => "G_UNICODE_UPPERCASE_LETTER",
188
'Ll' => "G_UNICODE_LOWERCASE_LETTER",
189
'Lt' => "G_UNICODE_TITLECASE_LETTER",
190
'Mn' => "G_UNICODE_NON_SPACING_MARK",
191
'Mc' => "G_UNICODE_COMBINING_MARK",
192
'Me' => "G_UNICODE_ENCLOSING_MARK",
193
'Nd' => "G_UNICODE_DECIMAL_NUMBER",
194
'Nl' => "G_UNICODE_LETTER_NUMBER",
195
'No' => "G_UNICODE_OTHER_NUMBER",
196
'Zs' => "G_UNICODE_SPACE_SEPARATOR",
197
'Zl' => "G_UNICODE_LINE_SEPARATOR",
198
'Zp' => "G_UNICODE_PARAGRAPH_SEPARATOR",
199
'Cc' => "G_UNICODE_CONTROL",
200
'Cf' => "G_UNICODE_FORMAT",
201
'Cs' => "G_UNICODE_SURROGATE",
202
'Co' => "G_UNICODE_PRIVATE_USE",
203
'Cn' => "G_UNICODE_UNASSIGNED",
206
'Lm' => "G_UNICODE_MODIFIER_LETTER",
207
'Lo' => "G_UNICODE_OTHER_LETTER",
208
'Pc' => "G_UNICODE_CONNECT_PUNCTUATION",
209
'Pd' => "G_UNICODE_DASH_PUNCTUATION",
210
'Ps' => "G_UNICODE_OPEN_PUNCTUATION",
211
'Pe' => "G_UNICODE_CLOSE_PUNCTUATION",
212
'Pi' => "G_UNICODE_INITIAL_PUNCTUATION",
213
'Pf' => "G_UNICODE_FINAL_PUNCTUATION",
214
'Po' => "G_UNICODE_OTHER_PUNCTUATION",
215
'Sm' => "G_UNICODE_MATH_SYMBOL",
216
'Sc' => "G_UNICODE_CURRENCY_SYMBOL",
217
'Sk' => "G_UNICODE_MODIFIER_SYMBOL",
218
'So' => "G_UNICODE_OTHER_SYMBOL"
221
# these shouldn't be -1
222
my ($codepoint, $last_codepoint, $start_codepoint) = (-999, -999, -999);
224
my ($category, $last_category) = ("G_FAKE1", "G_FAKE2");
225
my ($started_range, $finished_range) = (undef, undef);
227
print $out "/* unicode-categories.h */\n";
228
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
229
print $out "/* Generated by $0 */\n";
230
print $out "/* Generated from UCD version $v */\n\n";
232
print $out "#ifndef UNICODE_CATEGORIES_H\n";
233
print $out "#define UNICODE_CATEGORIES_H\n\n";
235
print $out "#include <glib.h>\n\n";
237
print $out "typedef struct _UnicodeCategory UnicodeCategory;\n\n";
239
print $out "static const struct _UnicodeCategory\n";
241
print $out " gunichar start;\n";
242
print $out " gunichar end;\n";
243
print $out " GUnicodeType category;\n";
245
print $out "unicode_categories[] =\n";
248
while (my $line = <$unicodedata>)
250
# Field 1 is the codepoint in hex and field 3 the general category code;
# field 2 is presumably the character name tested below -- TODO confirm.
$line =~ /^([0-9A-F]*);([^;]*);([^;]*);/ or die;
251
my $codepoint = hex ($1);
253
my $category = $mappings{$3};
256
or ($category ne $last_category)
257
or (not $started_range and $codepoint != $last_codepoint + 1))
259
# Write out the previous range, except before the first record (the
# sentinel start values are negative).
if ($last_codepoint >= 0) {
260
printf $out (" { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
263
$start_codepoint = $codepoint;
266
# "<..., First>" and "<..., Last>" names bracket a range that the data file
# lists only by its two end points.
if ($name =~ /^<.*First>$/) {
268
$finished_range = undef;
270
elsif ($name =~ /^<.*Last>$/) {
271
$started_range = undef;
274
elsif ($finished_range) {
275
$finished_range = undef;
278
$last_codepoint = $codepoint;
279
$last_category = $category;
281
# NOTE(review): appears to sit after the read loop, flushing the range that
# is still pending when the input ends -- confirm.
printf $out (" { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
285
print $out "#endif /* #ifndef UNICODE_CATEGORIES_H */\n";
291
#------------------------#
293
# XXX should do kFrequency too
294
# process_unihan_zip (PATH)
#
# Pipes the compressed Unihan readings file at PATH through the external
# $UNZIP command and writes unicode-unihan.h in the current directory: one
# record per character holding offsets into a shared string table for the
# kDefinition, kCantonese, kMandarin, kTang, kKorean, kJapaneseKun and
# kJapaneseOn fields, followed by accessor functions.
#
# NOTE(review): the pipe is opened with a single interpolated command
# string, so PATH is passed through the shell unquoted.
sub process_unihan_zip ($)
296
my ($unihan_zip) = @_;
298
open (my $unihan, "$UNZIP -c $unihan_zip |") or die;
299
open (my $out, "> unicode-unihan.h") or die;
301
print "processing $unihan_zip";
303
print $out "/* unicode-unihan.h */\n";
304
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
305
print $out "/* Generated by $0 */\n";
306
print $out "/* Generated from UCD version $v */\n\n";
308
print $out "#ifndef UNICODE_UNIHAN_H\n";
309
print $out "#define UNICODE_UNIHAN_H\n\n";
311
print $out "#include <glib.h>\n\n";
313
print $out "typedef struct _Unihan Unihan;\n\n";
315
print $out "static const struct _Unihan\n";
317
print $out " gunichar index;\n";
318
print $out " gint32 kDefinition;\n";
319
print $out " gint32 kCantonese;\n";
320
print $out " gint32 kMandarin;\n";
321
print $out " gint32 kTang;\n";
322
print $out " gint32 kKorean;\n";
323
print $out " gint32 kJapaneseKun;\n";
324
print $out " gint32 kJapaneseOn;\n";
326
print $out "unihan[] =\n";
333
# String-table offsets collected for the character currently being read;
# undef means the character has no value for that field.
my ($kDefinition, $kCantonese, $kMandarin, $kTang, $kKorean, $kJapaneseKun, $kJapaneseOn);
336
while (my $line = <$unihan>)
339
# Lines that are not of the form "U+XXXX <field> <value>" are skipped.
$line =~ /^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$/ or next;
341
my $new_wc = hex ($1);
345
# Escape backslashes and double quotes so the value can be written inside
# a C string literal.
$value =~ s/\\/\\\\/g;
346
$value =~ s/\"/\\"/g;
350
if (defined $kDefinition or defined $kCantonese or defined $kMandarin
351
or defined $kTang or defined $kKorean or defined $kJapaneseKun
352
or defined $kJapaneseOn)
354
# Fields without a value are written as -1.
printf $out (" { 0x%04X, \%d, \%d, \%d, \%d, \%d, \%d, \%d },\n",
356
(defined($kDefinition) ? $kDefinition : -1),
357
(defined($kCantonese) ? $kCantonese: -1),
358
(defined($kMandarin) ? $kMandarin : -1),
359
(defined($kTang) ? $kTang : -1),
360
(defined($kKorean) ? $kKorean : -1),
361
(defined($kJapaneseKun) ? $kJapaneseKun : -1),
362
(defined($kJapaneseOn) ? $kJapaneseOn : -1));
376
# NOTE(review): "for my $x qw(...)" without parentheses around the qw list
# is a syntax error on perl 5.18 and later; the list needs to be wrapped
# in parentheses.
for my $f qw(kDefinition kCantonese kMandarin
377
kTang kKorean kJapaneseKun kJapaneseOn) {
380
# Append the string to the table and replace $value with the byte offset
# at which it starts (each entry is the string plus a NUL).
push @strings, $value;
381
my $last_offset = $offset;
382
$offset += length($value) + 1;
383
$value = $last_offset;
388
if ($field eq "kDefinition") {
389
$kDefinition = $value;
391
elsif ($field eq "kCantonese") {
392
$kCantonese = $value;
394
elsif ($field eq "kMandarin") {
397
elsif ($field eq "kTang") {
400
elsif ($field eq "kKorean") {
403
elsif ($field eq "kJapaneseKun") {
404
$kJapaneseKun = $value;
406
elsif ($field eq "kJapaneseOn") {
407
$kJapaneseOn = $value;
410
if ($i++ % 32768 == 0) {
417
print $out "static const char unihan_strings[] = \\\n";
419
for my $s (@strings) {
420
print $out " \"$s\\0\"\n";
424
# Declares _get_unihan, which the generated accessors call; the ';' of the
# declaration is emitted at the start of the following output line.
print $out "static const Unihan *_get_unihan (gunichar uc)\n;";
426
# NOTE(review): same unparenthesised qw list as above.
for my $name qw(kDefinition kCantonese kMandarin
427
kTang kKorean kJapaneseKun kJapaneseOn) {
430
static inline const char * unihan_get_$name (const Unihan *uh)
432
gint32 offset = uh->$name;
435
return unihan_strings + offset;
438
G_CONST_RETURN gchar *
439
gucharmap_get_unicode_$name (gunichar uc)
441
const Unihan *uh = _get_unihan (uc);
445
return unihan_get_$name (uh);
451
print $out "#endif /* #ifndef UNICODE_UNIHAN_H */\n";
459
#------------------------#
463
# 0x0027 => { '=' => {
465
# 'values' => [ 'APOSTROPHE-QUOTE', 'APL quote' ]
469
# 'values' => [ 'neutral (vertical) glyph with mixed usage',
470
# '2019 is preferred for apostrophe',
471
# 'preferred characters in English for paired quotation marks are 2018 & 2019'
482
# Arguments: the output filehandle, the per-codepoint hash built by
# process_nameslist_txt, the marker whose values are to be written
# ('=', '*', '#' or ':'), and the C identifier to use for the generated
# string table (IDENT_strings) and index array (IDENT).
my ($out, $nameslist_hash, $token, $variable_name) = @_;
484
print $out "static const char ", $variable_name, "_strings[] = \n";
490
for my $wc (sort {$a <=> $b} keys %{$nameslist_hash})
492
next if not exists $nameslist_hash->{$wc}->{$token};
493
for my $value (@{$nameslist_hash->{$wc}->{$token}->{'values'}}) {
494
# Every (codepoint, value) pair is recorded, but an identical string is
# written to the table only once; later occurrences reuse its offset.
push @names_pairs, [$wc, $value];
495
next if exists $names_offsets{$value};
497
$names_offsets{$value} = $offset;
498
$offset += length($value) + 1;
500
# Escape backslashes and double quotes for the C string literal; the offset
# bookkeeping above uses the unescaped length.
my $printvalue = $value;
501
$printvalue =~ s/\\/\\\\/g;
502
$printvalue =~ s/\"/\\"/g;
504
printf $out (qq/ "\%s\\0"\n/, $printvalue);
510
print $out "static const UnicharStringIndex ", $variable_name, "[] = \n";
512
foreach my $pair (@names_pairs) {
513
my ($wc, $value) = @{$pair};
514
printf $out (qq/ { 0x%04X, %d },\n/, $wc, $names_offsets{$value});
516
print $out " { (gunichar)(-1), 0 } /* end marker */ \n";
520
# process_nameslist_txt (PATH)
#
# Parses the NamesList file at PATH into a hash keyed by codepoint and
# writes unicode-nameslist.h in the current directory.  For each codepoint
# the hash holds, per annotation marker ('=', '*', '#', ':' and 'x'), the
# list of values and the index of the first one.  The header contains one
# table per marker plus a names_list[] array giving, for every codepoint,
# the index of its first entry in each of those tables.
sub process_nameslist_txt ($)
522
my ($nameslist_txt) = @_;
524
open (my $nameslist, $nameslist_txt) or die;
526
print "processing $nameslist_txt...";
528
# One counter per marker; its value when a codepoint first uses the marker
# is stored as that codepoint's 'index'.
my ($equal_i, $ex_i, $star_i, $pound_i, $colon_i) = (0, 0, 0, 0, 0);
532
my $in_multiline_comment = 0;
534
while (my $line = <$nameslist>)
536
if ($in_multiline_comment && $line =~ /^\t/)
543
$in_multiline_comment = 0;
547
$in_multiline_comment = 1;
550
elsif ($line =~ /^@/)
554
elsif ($line =~ /^([0-9A-F]+)/)
558
elsif ($line =~ /^\s+=\s+(.+)$/)
562
if (not defined $nameslist_hash->{$wc}->{'='}->{'index'}) {
563
$nameslist_hash->{$wc}->{'='}->{'index'} = $equal_i;
565
push (@{$nameslist_hash->{$wc}->{'='}->{'values'}}, $value);
569
elsif ($line =~ /^\s+\*\s+(.+)$/)
573
if (not defined $nameslist_hash->{$wc}->{'*'}->{'index'}) {
574
$nameslist_hash->{$wc}->{'*'}->{'index'} = $star_i;
576
push (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}, $value);
580
elsif ($line =~ /^\s+#\s+(.+)$/)
584
if (not defined $nameslist_hash->{$wc}->{'#'}->{'index'}) {
585
$nameslist_hash->{$wc}->{'#'}->{'index'} = $pound_i;
587
push (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}, $value);
591
elsif ($line =~ /^\s+:\s+(.+)$/)
595
if (not defined $nameslist_hash->{$wc}->{':'}->{'index'}) {
596
$nameslist_hash->{$wc}->{':'}->{'index'} = $colon_i;
598
push (@{$nameslist_hash->{$wc}->{':'}->{'values'}}, $value);
602
# Cross references: only a trailing "XXXX)" codepoint (4 to 6 hex digits)
# is kept, and it is stored as a number rather than as text.
elsif ($line =~ /^\s+x\s+.*?([0-9A-F]{4,6})\)$/) # this one is different
604
my $value = hex ($1);
606
if (not defined $nameslist_hash->{$wc}->{'x'}->{'index'}) {
607
$nameslist_hash->{$wc}->{'x'}->{'index'} = $ex_i;
609
push (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}, $value);
617
open (my $out, "> unicode-nameslist.h") or die;
619
print $out "/* unicode-nameslist.h */\n";
620
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
621
print $out "/* Generated by $0 */\n";
622
print $out "/* Generated from UCD version $v */\n\n";
624
print $out "#ifndef UNICODE_NAMESLIST_H\n";
625
print $out "#define UNICODE_NAMESLIST_H\n\n";
627
print $out "#include <glib.h>\n\n";
629
print $out "typedef struct _UnicharStringIndex UnicharStringIndex;\n";
630
print $out "typedef struct _UnicharUnichar UnicharUnichar;\n";
631
print $out "typedef struct _NamesList NamesList;\n\n";
633
print $out "struct _UnicharStringIndex\n";
635
print $out " gunichar index;\n";
636
print $out " guint32 string_index;\n";
637
print $out "}; \n\n";
639
print $out "struct _UnicharUnichar\n";
641
print $out " gunichar index;\n";
642
print $out " gunichar value;\n";
643
print $out "}; \n\n";
645
print $out "struct _NamesList\n";
647
print $out " gunichar index;\n";
648
print $out " gint16 equals_index; /* -1 means */\n";
649
print $out " gint16 stars_index; /* this character */\n";
650
print $out " gint16 exes_index; /* doesn't */\n";
651
print $out " gint16 pounds_index; /* have any */\n";
652
print $out " gint16 colons_index;\n";
655
# The four text-valued markers share one writer; cross references hold
# codepoints instead of strings and are written separately below.
print_names_list($out, $nameslist_hash, '=', "names_list_equals");
656
print_names_list($out, $nameslist_hash, '*', "names_list_stars");
657
print_names_list($out, $nameslist_hash, '#', "names_list_pounds");
658
print_names_list($out, $nameslist_hash, ':', "names_list_colons");
660
print $out "static const UnicharUnichar names_list_exes[] = \n";
662
for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
664
next if not exists $nameslist_hash->{$wc}->{'x'};
665
for my $value (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}) {
666
printf $out (qq/ { 0x%04X, 0x%04X },\n/, $wc, $value);
669
print $out " { (gunichar)(-1), 0 }\n";
672
print $out "static const NamesList names_list[] =\n";
674
for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
676
# -1 marks a codepoint that has no entries for that marker.
my $eq = exists $nameslist_hash->{$wc}->{'='}->{'index'} ? $nameslist_hash->{$wc}->{'='}->{'index'} : -1;
677
my $star = exists $nameslist_hash->{$wc}->{'*'}->{'index'} ? $nameslist_hash->{$wc}->{'*'}->{'index'} : -1;
678
my $ex = exists $nameslist_hash->{$wc}->{'x'}->{'index'} ? $nameslist_hash->{$wc}->{'x'}->{'index'} : -1;
679
my $pound = exists $nameslist_hash->{$wc}->{'#'}->{'index'} ? $nameslist_hash->{$wc}->{'#'}->{'index'} : -1;
680
my $colon = exists $nameslist_hash->{$wc}->{':'}->{'index'} ? $nameslist_hash->{$wc}->{':'}->{'index'} : -1;
682
printf $out (" { 0x%04X, \%d, \%d, \%d, \%d, \%d },\n", $wc, $eq, $star, $ex, $pound, $colon);
686
print $out "#endif /* #ifndef UNICODE_NAMESLIST_H */\n";
693
#------------------------#
695
# process_blocks_txt (PATH)
#
# Reads the Blocks file at PATH and writes unicode-blocks.h in the current
# directory: the block names wrapped in N_() inside an "#if 0" section (for
# string extraction), a NUL-separated table of those names, and a
# unicode_blocks[] array of { start, end, offset of the name in the table }.
sub process_blocks_txt ($)
697
my ($blocks_txt) = @_;
699
# Override block names
700
my %block_overrides =
705
open (my $blocks, $blocks_txt) or die;
706
open (my $out, "> unicode-blocks.h") or die;
708
print "processing $blocks_txt...";
710
print $out "/* unicode-blocks.h */\n";
711
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
712
print $out "/* Generated by $0 */\n";
713
print $out "/* Generated from UCD version $v */\n\n";
715
print $out "#ifndef UNICODE_BLOCKS_H\n";
716
print $out "#define UNICODE_BLOCKS_H\n\n";
718
print $out "#include <glib.h>\n";
719
print $out "#include <glib/gi18n-lib.h>\n\n";
724
while (my $line = <$blocks>)
726
# Only lines of the form "START..END; Name" are used; all others are
# skipped.
$line =~ /^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$/ or next;
728
my ($start,$end,$block) = ($1, $2, $3);
730
if (exists $block_overrides{$block}) {
731
$block = $block_overrides{$block};
734
# Record the block together with the byte offset of its name in the string
# table (each entry is the name plus a NUL).
push @blocks, [$start, $end, $block, $offset];
735
$offset += length($block) + 1;
738
print $out "/* for extraction by intltool */\n";
739
print $out "#if 0\n";
740
foreach my $block (@blocks)
742
my ($start, $end, $name, $offset) = @{$block};
743
print $out qq/ N_("$name"),\n/;
745
print $out "#endif /* 0 */\n\n";
747
print $out "static const char unicode_blocks_strings[] =\n";
748
foreach my $block (@blocks)
750
my ($start, $end, $name, $offset) = @{$block};
751
print $out qq/ "$name\\0"\n/;
755
print $out "typedef struct _UnicodeBlock UnicodeBlock;\n";
757
print $out "static const struct _UnicodeBlock\n";
759
print $out " gunichar start;\n";
760
print $out " gunichar end;\n";
761
print $out " guint16 block_name_index;\n";
763
print $out "unicode_blocks[] =\n";
765
# $start and $end are still the hex strings captured from the input, hence
# the literal "0x" prefix.
foreach my $block (@blocks)
767
my ($start, $end, $name, $offset) = @{$block};
768
print $out qq/ { 0x$start, 0x$end, $offset },\n/;
772
print $out "#endif /* #ifndef UNICODE_BLOCKS_H */\n";
780
#------------------------#
782
# process_scripts_txt (PATH)
#
# Reads the Scripts file at PATH and writes unicode-scripts.h in the
# current directory: the script names wrapped in N_() inside an "#if 0"
# section, a NUL-separated table of the names in sorted order, the offset
# of each name in that table, and a unicode_scripts[] array of
# { start, end, script index } ranges ordered by start codepoint.
sub process_scripts_txt ($)
784
my ($scripts_txt) = @_;
786
# Override script names
787
my %script_overrides =
795
open (my $scripts_file, $scripts_txt) or die;
796
open (my $out, "> unicode-scripts.h") or die;
798
print "processing $scripts_txt...";
800
while (my $line = <$scripts_file>)
802
my ($start, $end, $raw_script);
804
# Two line shapes are recognised: a range "XXXX..YYYY ; Script" and a
# single codepoint "XXXX ; Script".
if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/)
810
elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/)
821
my $script = $raw_script;
823
# Normalise capitalisation: the first character of each \w+ run is
# upper-cased and the rest lower-cased.
$script =~ s/(\w+)/\u\L$1/g;
825
if (exists $script_overrides{$script}) {
826
$script = $script_overrides{$script};
829
$script_hash{$start} = { 'end' => $end, 'script' => $script };
830
$scripts{$script} = 1;
833
close ($scripts_file);
835
# Add Common to make sure this works with UCD <= 4.0.0
836
$scripts{"Common"} = 1;
838
print $out "/* unicode-scripts.h */\n";
839
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
840
print $out "/* Generated by $0 */\n";
841
print $out "/* Generated from UCD version $v */\n\n";
843
print $out "#ifndef UNICODE_SCRIPTS_H\n";
844
print $out "#define UNICODE_SCRIPTS_H\n\n";
846
print $out "#include <glib.h>\n";
847
print $out "#include <glib/gi18n-lib.h>\n\n";
849
print $out "typedef struct _UnicodeScript UnicodeScript;\n\n";
851
print $out "/* for extraction by intltool */\n";
852
print $out "#if 0\n";
854
for my $script (sort keys %scripts)
856
# Replace the placeholder 1 with $i; this value is what unicode_scripts[]
# later prints as script_index.
# NOTE(review): presumably $i counts the scripts in sorted order -- confirm.
$scripts{$script} = $i;
859
print $out qq/ N_("$script"),\n/;
861
print $out "#endif /* 0 */\n\n";
863
print $out "static const gchar unicode_script_list_strings[] =\n";
866
for my $script (sort keys %scripts)
868
printf $out (qq/ "\%s\\0"\n/, $script);
869
$script_offsets{$script} = $offset;
870
$offset += length($script) + 1;
875
print $out "static const guint16 unicode_script_list_offsets[] =\n";
877
for my $script (sort keys %scripts)
879
printf $out (qq/ \%d,\n/, $script_offsets{$script});
883
print $out "static const struct _UnicodeScript\n";
885
print $out " gunichar start;\n";
886
print $out " gunichar end;\n";
887
print $out " guint8 script_index; /* index into unicode_script_list_offsets */\n";
889
print $out "unicode_scripts[] =\n";
891
for my $start (sort { $a <=> $b } keys %script_hash)
893
printf $out (qq/ { 0x%04X, 0x%04X, \%2d },\n/,
894
$start, $script_hash{$start}->{'end'}, $scripts{$script_hash{$start}->{'script'}});
898
print $out "#endif /* #ifndef UNICODE_SCRIPTS_H */\n";
904
#------------------------#
906
# process_versions_txt (PATH)
#
# Reads the DerivedAge file at PATH and writes unicode-versions.h in the
# current directory: a unicode_versions[] array of
# { start, end, GUCHARMAP_UNICODE_VERSION_* } ranges ordered by start
# codepoint, a NUL-separated table of the version strings, and the offset
# of each string in that table.
sub process_versions_txt ($)
908
my ($versions_txt) = @_;
913
open (my $versions, $versions_txt) or die;
914
open (my $out, "> unicode-versions.h") or die;
916
print "processing $versions_txt...";
918
while (my $line = <$versions>)
920
my ($start, $end, $raw_version);
922
# Two line shapes are recognised: a range "XXXX..YYYY ; Version" and a
# single codepoint "XXXX ; Version".
if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/)
928
elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/)
939
my $version = $raw_version;
941
$version =~ s/(\w+)/\u\L$1/g;
943
# %versions keeps the dotted form for the string table; the form with '.'
# replaced by '_' becomes the GUCHARMAP_UNICODE_VERSION_* suffix.
$versions{$version} = 1;
945
$version =~ s/\./_/g;
946
$version_hash{$start} = { 'end' => $end, 'version' => $version };
951
print $out "/* unicode-versions.h */\n";
952
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
953
print $out "/* Generated by $0 */\n";
954
print $out "/* Generated from UCD version $v */\n\n";
956
print $out "#ifndef UNICODE_VERSIONS_H\n";
957
print $out "#define UNICODE_VERSIONS_H\n\n";
959
print $out "#include <glib.h>\n";
960
print $out "#include <glib/gi18n-lib.h>\n\n";
962
print $out "typedef struct {\n";
963
print $out " gunichar start;\n";
964
print $out " gunichar end;\n";
965
print $out " GucharmapUnicodeVersion version;\n";
966
print $out "} UnicodeVersion;\n\n";
968
print $out "static const UnicodeVersion unicode_versions[] =\n";
970
for my $start (sort { $a <=> $b } keys %version_hash)
972
printf $out (qq/ { 0x%04X, 0x%04X, GUCHARMAP_UNICODE_VERSION_\%s },\n/,
973
$start, $version_hash{$start}->{'end'}, $version_hash{$start}->{'version'});
977
print $out "static const gchar unicode_version_strings[] =\n";
980
# NOTE(review): "sort keys" compares as strings, so e.g. "10.0" is written
# before "2.0" -- confirm that consumers expect this order.
for my $version (sort keys %versions)
982
printf $out (qq/ "\%s\\0"\n/, $version);
983
$version_offsets{$version} = $offset;
984
$offset += length($version) + 1;
989
print $out "static const guint16 unicode_version_string_offsets[] =\n";
991
for my $version (sort keys %versions)
993
printf $out (qq/ \%d,\n/, $version_offsets{$version});
997
print $out "#endif /* #ifndef UNICODE_VERSIONS_H */\n";