3
# Copyright (c) 2001-2011, PostgreSQL Global Development Group
5
# src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
7
# Generate UTF-8 <--> BIG5 conversion tables from
8
# map files provided by Unicode organization.
9
# Unfortunately, the organization prohibits redistribution
10
# of the map files. So to use this script,
11
# you must first obtain the map files from the organization's ftp site.
12
# ftp://www.unicode.org/Public/MAPPINGS/
14
# Our "big5" comes from BIG5.TXT, with the addition of the characters
15
# in the range 0xf9d6-0xf9dc from CP950.TXT.
20
# # and Unicode name (not used in this script)
25
# # and Unicode name (not used in this script)
32
# first, generate UTF8 --> BIG5 table
#
# This pass reads BIG5.TXT and records entries in %array, keyed by the
# value returned from ucs2utf() and holding $code.
# NOTE(review): the read loop, the assignments to $code and $ucs, and the
# closing braces are not visible in this excerpt; only the visible
# statements are touched here.
34
$in_file = "BIG5.TXT";
36
# Three-argument open keeps the file name from being parsed as a mode
# string, and $! reports why the open failed.
open( FILE, '<', $in_file ) || die( "cannot open $in_file: $!" );
45
# Default split of $_ on whitespace; $rest receives the third field.
( $c, $u, $rest ) = split;
48
# Only pairs where both values are at or above 0x80 are recorded.
if( $code >= 0x80 && $ucs >= 0x0080){
49
$utf = &ucs2utf($ucs);
50
# A key that is already present means this UTF8 value was seen before.
if( exists $array{ $utf } ){
51
printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
55
$array{ $utf } = $code;
60
# Second input for the UTF8 --> BIG5 table: CP950.TXT.  Entries go into the
# same %array, keyed by the value returned from ucs2utf().
# NOTE(review): the read loop, the assignments to $code and $ucs, and the
# closing braces are not visible in this excerpt.
$in_file = "CP950.TXT";
62
# Three-argument open keeps the file name from being parsed as a mode
# string, and $! reports why the open failed.
open( FILE, '<', $in_file ) || die( "cannot open $in_file: $!" );
69
# Default split of $_ on whitespace; $rest receives the third field.
( $c, $u, $rest ) = split;
73
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
75
if( $code >= 0x80 && $ucs >= 0x0080 &&
76
$code >= 0xf9d6 && $code <= 0xf9dc ){
77
$utf = &ucs2utf($ucs);
78
# A key that is already present means this UTF8 value was seen before.
if( exists $array{ $utf } ){
79
printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
83
$array{ $utf } = $code;
88
# Write the collected UTF8 --> BIG5 entries to utf8_to_big5.map as a C
# array initializer.
# NOTE(review): where $count is computed, the condition that chooses between
# the two printf statements below, and the closing lines of the output are
# not visible in this excerpt.
$file = "utf8_to_big5.map";
89
# Three-argument open passes the write mode separately from the file name,
# and $! reports why the open failed.
open( FILE, '>', $file ) || die( "cannot open $file: $!" );
90
# $count is emitted as the declared size of the C array.
print FILE "static pg_utf_to_local ULmapBIG5[ $count ] = {\n";
92
# Emit entries in ascending numeric order of their keys.
for $index ( sort {$a <=> $b} keys( %array ) ){
93
$code = $array{ $index };
96
# Entry without a trailing comma.
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
98
# Entry followed by a comma.
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
106
# then generate BIG5 --> UTF8 table
#
# This pass reads BIG5.TXT again and records entries in %array, this time
# keyed by $code and holding the value returned from ucs2utf().
# NOTE(review): the read loop, the assignments to $code and $ucs, any
# clearing of %array before this pass, and the closing braces are not
# visible in this excerpt.
108
$in_file = "BIG5.TXT";
110
# Three-argument open keeps the file name from being parsed as a mode
# string, and $! reports why the open failed.
open( FILE, '<', $in_file ) || die( "cannot open $in_file: $!" );
119
# Default split of $_ on whitespace; $rest receives the third field.
( $c, $u, $rest ) = split;
122
# Only pairs where both values are at or above 0x80 are recorded.
if( $code >= 0x80 && $ucs >= 0x0080){
123
$utf = &ucs2utf($ucs);
124
# This table is keyed by $code, so the duplicate check must probe $code.
# Probing $utf (as before) tested a key this pass never stores under.
if( exists $array{ $code } ){
125
printf STDERR "Warning: duplicate code: %04x\n",$code;
129
$array{ $code } = $utf;
134
# Second input for the BIG5 --> UTF8 table: CP950.TXT.  Entries go into the
# same %array, keyed by $code and holding the value returned from ucs2utf().
# NOTE(review): the read loop, the assignments to $code and $ucs, and the
# closing braces are not visible in this excerpt.
$in_file = "CP950.TXT";
136
# Three-argument open keeps the file name from being parsed as a mode
# string, and $! reports why the open failed.
open( FILE, '<', $in_file ) || die( "cannot open $in_file: $!" );
143
# Default split of $_ on whitespace; $rest receives the third field.
( $c, $u, $rest ) = split;
147
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
149
if( $code >= 0x80 && $ucs >= 0x0080 &&
150
$code >= 0xf9d6 && $code <= 0xf9dc ){
151
$utf = &ucs2utf($ucs);
152
# This table is keyed by $code, so the duplicate check must probe $code.
# Probing $utf (as before) tested a key this pass never stores under.
if( exists $array{ $code } ){
153
printf STDERR "Warning: duplicate code: %04x\n",$code;
157
$array{ $code } = $utf;
162
# Write the collected BIG5 --> UTF8 entries to big5_to_utf8.map as a C
# array initializer.
# NOTE(review): where $count is computed, the condition that chooses between
# the two printf statements below, and the closing lines of the output are
# not visible in this excerpt.
$file = "big5_to_utf8.map";
163
# Three-argument open passes the write mode separately from the file name,
# and $! reports why the open failed.
open( FILE, '>', $file ) || die( "cannot open $file: $!" );
164
# $count is emitted as the declared size of the C array.
print FILE "static pg_local_to_utf LUmapBIG5[ $count ] = {\n";
165
# Emit entries in ascending numeric order of their keys.
for $index ( sort {$a <=> $b} keys( %array ) ){
166
$utf = $array{ $index };
169
# Entry without a trailing comma.
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
171
# Entry followed by a comma.
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;