3
# @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4
# with $ENV{PERL_CORE} set
5
# In case we need it in future...
6
require Config; import Config;
12
my @orig_ARGV = @ARGV;
13
our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
15
# These may get re-ordered.
16
# RAW is a do_now as inserted by &enter
17
# AGG is an aggreagated do_now, as built up by &process
34
# (See the algorithm in encengine.c - we're building structures for it)
36
# There are two sorts of structures.
37
# "do_now" (an array, two variants of what needs storing) is whatever we need
38
# to do now we've read an input byte.
39
# It's housed in a "do_next" (which is how we got to it), and in turn points
40
# to a "do_next" which contains all the "do_now"s for the next input byte.
42
# There will be a "do_next" which is the start state.
43
# For a single byte encoding it's the only "do_next" - each "do_now" points
44
# back to it, and each "do_now" will cause bytes. There is no state.
46
# For a multi-byte encoding where all characters in the input are the same
47
# length, then there will be a tree of "do_now"->"do_next"->"do_now"
48
# branching out from the start state, one step for each input byte.
49
# The leaf "do_now"s will all be at the same distance from the start state,
50
# only the leaf "do_now"s cause output bytes, and they in turn point back to
53
# For an encoding where there are varaible length input byte sequences, you
54
# will encounter a leaf "do_now" sooner for the shorter input sequences, but
55
# as before the leaves will point back to the start state.
57
# The system will cope with escape encodings (imagine them as a mostly
58
# self-contained tree for each escape state, and cross links between trees
59
# at the state-switching characters) but so far no input format defines these.
61
# The system will also cope with having output "leaves" in the middle of
62
# the bifurcating branches, not just at the extremities, but again no
63
# input format does this yet.
65
# There are two variants of the "do_now" structure. The first, smaller variant
66
# is generated by &enter as the input file is read. There is one structure
67
# for each input byte. Say we are mapping a single byte encoding to a
68
# single byte encoding, with "ABCD" going "abcd". There will be
69
# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71
# &process then walks the tree, building aggregate "do_now" structres for
72
# adjacent bytes where possible. The aggregate is for a contiguous range of
73
# bytes which each produce the same length of output, each move to the
74
# same next state, and each have the same fallback flag.
75
# So our 4 RAW "do_now"s above become replaced by a single structure
77
# ["A", "D", "abcd", 1, ...]
78
# ie, for an input byte $_ in "A".."D", output 1 byte, found as
79
# substr ("abcd", (ord $_ - ord "A") * 1, 1)
80
# which maps very nicely into pointer arithmetic in C for encengine.c
84
# UTF-8 encode long hand - only covers part of perl's range
86
# chr() works in native space so convert value from table
87
# into that space before using chr().
88
my $ch = chr(utf8::unicode_to_native($_[0]));
89
# Now get core perl to encode that the way it likes.
97
## my ($ch,$page) = @_; return chr($ch);
103
# encode double byte MS byte first
104
## my ($ch,$page) = @_; return chr($page).chr($ch);
105
return chr ($_[1]) . chr $_[0];
110
# encode Multi-byte - single for 0..255 otherwise double
111
## my ($ch,$page) = @_;
112
## return &encode_D if $page;
114
return chr ($_[1]) . chr $_[0] if $_[1];
118
my %encode_types = (U => \&encode_U,
124
# Win32 does not expand globs on command line
125
eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
129
# -Q to disable the duplicate codepoint test
130
# -S make mapping errors fatal
131
# -q to remove comments written to output files
132
# -O to enable the (brute force) substring optimiser
133
# -o <output> to specify the output file name (else it's the first arg)
134
# -f <inlist> to give a file with a list of input files (else use the args)
135
# -n <name> to name the encoding (else use the basename of the input file.
136
getopts('CM:SQqOo:f:n:',\%opt);
138
$opt{M} and make_makefile_pl($opt{M}, @ARGV);
139
$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
141
# This really should go first, else the die here causes empty (non-erroneous)
142
# output files to be written.
144
if (exists $opt{'f'}) {
145
# -F is followed by name of file containing list of filenames
146
my $flist = $opt{'f'};
147
open(FLIST,$flist) || die "Cannot open $flist:$!";
148
chomp(@encfiles = <FLIST>);
154
my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
155
chmod(0666,$cname) if -f $cname && !-w $cname;
156
open(C,">$cname") || die "Cannot open $cname:$!";
161
my ($doC,$doEnc,$doUcm,$doPet);
163
if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
166
$dname =~ s/(\.[^\.]*)?$/.exh/;
167
chmod(0666,$dname) if -f $cname && !-w $dname;
168
open(D,">$dname") || die "Cannot open $dname:$!";
169
$hname =~ s/(\.[^\.]*)?$/.h/;
170
chmod(0666,$hname) if -f $cname && !-w $hname;
171
open(H,">$hname") || die "Cannot open $hname:$!";
173
foreach my $fh (\*C,\*D,\*H)
175
print $fh <<"END" unless $opt{'q'};
177
!!!!!!! DO NOT EDIT THIS FILE !!!!!!!
178
This file was autogenerated by:
180
enc2xs VERSION $VERSION
185
if ($cname =~ /(\w+)\.xs$/)
187
print C "#include <EXTERN.h>\n";
188
print C "#include <perl.h>\n";
189
print C "#include <XSUB.h>\n";
190
print C "#define U8 U8\n";
192
print C "#include \"encode.h\"\n\n";
195
elsif ($cname =~ /\.enc$/)
199
elsif ($cname =~ /\.ucm$/)
203
elsif ($cname =~ /\.pet$/)
219
if ($a =~ /^.*-(\d+)/)
222
if ($b =~ /^.*-(\d+)/)
232
foreach my $enc (sort cmp_name @encfiles)
234
my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
235
$name = $opt{'n'} if exists $opt{'n'};
240
compile_enc(\*E,lc($name));
244
compile_ucm(\*E,lc($name));
249
warn "Cannot open $enc for $name:$!";
255
print STDERR "Writing compiled form\n";
256
foreach my $name (sort cmp_name keys %encoding)
258
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
259
process($name.'_utf8',$e2u);
260
addstrings(\*C,$e2u);
262
process('utf8_'.$name,$u2e);
263
addstrings(\*C,$u2e);
265
outbigstring(\*C,"enctable");
266
foreach my $name (sort cmp_name keys %encoding)
268
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
269
outtable(\*C,$e2u, "enctable");
270
outtable(\*C,$u2e, "enctable");
272
# push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
274
my $cpp = ($Config{d_cplusplus} || '') eq 'define';
275
my $exta = $cpp ? 'extern "C" ' : "static";
276
my $extb = $cpp ? 'extern "C" ' : "";
277
foreach my $enc (sort cmp_name keys %encoding)
279
# my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
280
my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
281
#my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
283
$replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
284
my $sym = "${enc}_encoding";
286
my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
288
print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
289
print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
290
print C "${extb} const encode_t $sym = \n";
291
# This is to make null encoding work -- dankogai
292
for (my $i = (scalar @info) - 1; $i >= 0; --$i){
295
# end of null tweak -- dankogai
296
print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
299
foreach my $enc (sort cmp_name keys %encoding)
301
my $sym = "${enc}_encoding";
303
print H "extern encode_t $sym;\n";
304
print D " Encode_XSEncoding(aTHX_ &$sym);\n";
307
if ($cname =~ /(\w+)\.xs$/)
313
Encode_XSEncoding(pTHX_ encode_t *enc)
316
HV *stash = gv_stashpv("Encode::XS", TRUE);
317
SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
323
const char *name = enc->name[i++];
324
XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
327
call_pv("Encode::define_encoding",G_DISCARD);
333
print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
334
print C "BOOT:\n{\n";
335
print C "#include \"$dname\"\n";
338
# Close in void context is bad, m'kay
339
close(D) or warn "Error closing '$dname': $!";
340
close(H) or warn "Error closing '$hname': $!";
342
my $perc_saved = $saved/($strings + $saved) * 100;
343
my $perc_subsaved = $subsave/($strings + $subsave) * 100;
344
printf STDERR "%d bytes in string tables\n",$strings;
345
printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
346
$saved, $perc_saved if $saved;
347
printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
348
$subsave, $perc_subsaved if $subsave;
352
foreach my $name (sort cmp_name keys %encoding)
354
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
355
output_enc(\*C,$name,$e2u);
360
foreach my $name (sort cmp_name keys %encoding)
362
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
363
output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
367
# writing half meg files and then not checking to see if you just filled the
369
close(C) or die "Error closing '$cname': $!";
371
# End of the main program.
383
last if /^\s*CHARMAP\s*$/i;
384
if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
389
if (!defined($cs = $attr{'code_set_name'}))
391
warn "No <code_set_name> in $name\n";
395
$name = $cs unless exists $opt{'n'};
401
if (exists $attr{'subchar'})
404
#$attr{'subchar'} =~ /^\s*/cg;
405
#push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
406
#$erep = join('',map(chr(hex($_)),@byte));
407
$erep = $attr{'subchar'};
408
$erep =~ s/^\s+//; $erep =~ s/\s+$//;
410
print "Reading $name ($cs)\n";
416
last if /^\s*END\s+CHARMAP\s*$/i;
418
my (@uni, @byte) = ();
419
my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
420
or die "Bad line: $_";
421
while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
422
push @uni, map { substr($_, 1) } split(/\+/, $1);
424
while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
429
my $uch = join('', map { encode_U(hex($_)) } @uni );
430
my $ech = join('',map(chr(hex($_)),@byte));
431
my $el = length($ech);
432
$max_el = $el if (!defined($max_el) || $el > $max_el);
433
$min_el = $el if (!defined($min_el) || $el < $min_el);
444
# $fb is fallback flag
445
# 0 - round trip safe
446
# 1 - fallback for unicode -> enc
447
# 2 - skip sub-char mapping
448
# 3 - fallback enc -> unicode
449
enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
450
enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
459
die "$nfb entries without fallback, $hfb entries with\n";
461
$encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
473
while ($type = <$fh>)
475
last if $type !~ /^\s*#/;
478
return if $type eq 'E';
479
# Do the hash lookup once, rather than once per function call. 4% speedup.
480
my $type_func = $encode_types{$type};
481
my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
482
warn "$type encoded $name\n";
484
# Save a defined test by setting these to defined values.
485
my $min_el = ~0; # A very big integer
486
my $max_el = 0; # Anything must be longer than 0
489
$rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
493
# use -Q to silence the seen test. Makefile.PL uses this by default.
494
$seen = {} unless $opt{Q};
499
my $page = hex($line);
504
# So why is it 1% faster to leave the my here?
506
$line =~ s/\r\n$/\n/;
507
die "$.:${line}Line should be exactly 65 characters long including
508
newline (".length($line).")" unless length ($line) == 65;
509
# Split line into groups of 4 hex digits, convert groups to ints
511
# map {hex $_} $line =~ /(....)/g
512
# This takes 63.75 (2.5% less time)
513
# unpack "n*", pack "H*", $line
514
# There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
515
# Doing it as while ($line =~ /(....)/g) took 74.63
516
foreach my $val (unpack "n*", pack "H*", $line)
518
next if $val == 0xFFFD;
519
my $ech = &$type_func($ch,$page);
520
if ($val || (!$ch && !$page))
522
my $el = length($ech);
523
$max_el = $el if $el > $max_el;
524
$min_el = $el if $el < $min_el;
525
my $uch = encode_U($val);
527
# We're doing the test.
528
# We don't need to read this quickly, so storing it as a scalar,
529
# rather than 3 (anon array, plus the 2 scalars it holds) saves
530
# RAM and may make us faster on low RAM systems. [see __END__]
531
if (exists $seen->{$uch})
533
warn sprintf("U%04X is %02X%02X and %04X\n",
534
$val,$page,$ch,$seen->{$uch});
539
$seen->{$uch} = $page << 8 | $ch;
542
# Passing 2 extra args each time is 3.6% slower!
543
# Even with having to add $fallback ||= 0 later
544
enter_fb0($e2u,$ech,$uch);
545
enter_fb0($u2e,$uch,$ech);
549
# No character at this position
550
# enter($e2u,$ech,undef,$e2u);
556
die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
557
if $min_el > $max_el;
558
die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
559
$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
562
# my ($a,$s,$d,$t,$fb) = @_;
564
my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
565
# state we shift to after this (multibyte) input character defaults to same
568
# Making sure it is defined seems to be faster than {no warnings;} in
569
# &process, or passing it in as 0 explicity.
570
# XXX $fallback ||= 0;
572
# Start at the beginning and work forwards through the string to zero.
573
# effectively we are removing 1 character from the front each time
574
# but we don't actually edit the string. [this alone seems to be 14% speedup]
575
# Hence -$pos is the length of the remaining string.
576
my $pos = -length $inbytes;
578
my $byte = substr $inbytes, $pos, 1;
581
# RAW_OUT_BYTES => 2,
583
# to unicode an array would seem to be better, because the pages are dense.
584
# from unicode can be very sparse, favouring a hash.
585
# hash using the bytes (all length 1) as keys rather than ord value,
586
# as it's easier to sort these in &process.
588
# It's faster to always add $fallback even if it's undef, rather than
589
# choosing between 3 and 4 element array. (hence why we set it defined
591
my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
592
# When $pos was -1 we were at the last input character.
594
$do_now->[RAW_OUT_BYTES] = $outbytes;
595
$do_now->[RAW_NEXT] = $next;
598
# Tail recursion. The intermdiate state may not have a name yet.
599
$current = $do_now->[RAW_NEXT];
603
# This is purely for optimistation. It's just &enter hard coded for $fallback
604
# of 0, using only a 3 entry array ref to save memory for every entry.
606
my ($current,$inbytes,$outbytes,$next) = @_;
609
my $pos = -length $inbytes;
611
my $byte = substr $inbytes, $pos, 1;
612
my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
614
$do_now->[RAW_OUT_BYTES] = $outbytes;
615
$do_now->[RAW_NEXT] = $next;
618
$current = $do_now->[RAW_NEXT];
628
my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
631
foreach my $key (sort keys %$raw) {
634
# RAW_OUT_BYTES => 2,
636
my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
637
# Now we are converting from raw to aggregate, switch from 1 byte strings
642
# If this == fails, we're going to reset $agg_max_in below anyway.
643
$b == ++$agg_max_in &&
644
# References in numeric context give the pointer as an int.
645
$agg_next == $next &&
646
$agg_in_len == $in_len &&
647
$agg_out_len == length $out_bytes &&
648
$agg_fallback == $fallback
649
# && length($l->[AGG_OUT_BYTES]) < 16
651
# my $i = ord($b)-ord($l->[AGG_MIN_IN]);
652
# we can aggregate this byte onto the end.
653
$l->[AGG_MAX_IN] = $b;
654
$l->[AGG_OUT_BYTES] .= $out_bytes;
658
# AGG_OUT_BYTES => 2,
663
# Reset the last thing we saw, plus set 5 lexicals to save some derefs.
664
# (only gains .6% on euc-jp -- is it worth it?)
665
push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
666
$agg_in_len = $in_len, $agg_out_len = length $out_bytes,
667
$agg_fallback = $fallback];
669
if (exists $next->{Cname}) {
670
$next->{'Forward'} = 1 if $next != $a;
672
process(sprintf("%s_%02x",$name,$b),$next);
675
# encengine.c rules say that last entry must be for 255
676
if ($agg_max_in < 255) {
677
push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
679
$a->{'Entries'} = \@ent;
686
my $name = $a->{'Cname'};
688
foreach my $b (@{$a->{'Entries'}})
690
next unless $b->[AGG_OUT_LEN];
691
$strings{$b->[AGG_OUT_BYTES]} = undef;
695
my $cpp = ($Config{d_cplusplus} || '') eq 'define';
696
my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
697
my $const = $cpp ? '' : 'const';
698
print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
700
$a->{'DoneStrings'} = 1;
701
foreach my $b (@{$a->{'Entries'}})
703
my ($s,$e,$out,$t,$end,$l) = @$b;
704
addstrings($fh,$t) unless $t->{'DoneStrings'};
714
# Make the big string in the string accumulator. Longest first, on the hope
715
# that this makes it more likely that we find the short strings later on.
716
# Not sure if it helps sorting strings of the same length lexcically.
717
foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
718
my $index = index $string_acc, $s;
720
$saved += length($s);
721
$strings_in_acc{$s} = $index;
725
my $sublength = length $s;
726
while (--$sublength > 0) {
727
# progressively lop characters off the end, to see if the start of
728
# the new string overlaps the end of the accumulator.
729
if (substr ($string_acc, -$sublength)
730
eq substr ($s, 0, $sublength)) {
731
$subsave += $sublength;
732
$strings_in_acc{$s} = length ($string_acc) - $sublength;
733
# append the last bit on the end.
734
$string_acc .= substr ($s, $sublength);
737
# or if the end of the new string overlaps the start of the
739
next unless substr ($string_acc, 0, $sublength)
740
eq substr ($s, -$sublength);
741
# well, the last $sublength characters of the accumulator match.
742
# so as we're prepending to the accumulator, need to shift all our
743
# existing offsets forwards
744
$_ += $sublength foreach values %strings_in_acc;
745
$subsave += $sublength;
746
$strings_in_acc{$s} = 0;
747
# append the first bit on the start.
748
$string_acc = substr ($s, 0, -$sublength) . $string_acc;
752
# Optimiser (if it ran) found nothing, so just going have to tack the
753
# whole thing on the end.
754
$strings_in_acc{$s} = length $string_acc;
760
$strings = length $string_acc;
761
my $cpp = ($Config{d_cplusplus} || '') eq 'define';
762
my $var = $cpp ? '' : 'static';
763
my $definition = "\n$var const U8 $name\[$strings] = { " .
764
join(',',unpack "C*",$string_acc);
765
# We have a single long line. Split it at convenient commas.
766
print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
767
print $fh substr ($definition, pos $definition), " };\n";
772
my $offset = $strings_in_acc{$s};
773
die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
774
unless defined $offset;
780
my ($fh,$a,$bigname) = @_;
781
my $name = $a->{'Cname'};
783
foreach my $b (@{$a->{'Entries'}})
785
my ($s,$e,$out,$t,$end,$l) = @$b;
786
outtable($fh,$t,$bigname) unless $t->{'Done'};
788
my $cpp = ($Config{d_cplusplus} || '') eq 'define';
789
my $var = $cpp ? '' : 'static';
790
my $const = $cpp ? '' : 'const';
791
print $fh "\n$var $const encpage_t $name\[",
792
scalar(@{$a->{'Entries'}}), "] = {\n";
793
foreach my $b (@{$a->{'Entries'}})
795
my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
796
# $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
800
printf $fh findstring($bigname,$out);
806
print $fh ",",$t->{Cname};
807
printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
814
my ($fh,$name,$a) = @_;
815
die "Changed - fix me for new structure";
816
foreach my $b (sort keys %$a)
818
my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
830
my $s = do "unicore/Name.pl";
831
die "char_names: unicore/Name.pl: $!\n" unless defined $s;
833
while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
837
last if $s >= 0x10000;
838
my $e = length($2) ? hex($2) : $s;
839
for (my $i = $s; $i <= $e; $i++)
842
# print sprintf("U%04X $name\n",$i);
849
my ($cmap,$a,$t,$pre) = @_;
850
# warn sprintf("Page %x\n",$pre);
852
foreach my $key (sort keys %$raw) {
855
# RAW_OUT_BYTES => 2,
857
my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
861
if ($next != $a && $next != $t) {
862
output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
863
} elsif (length $out_bytes) {
865
$u = $pre|($u &0x3f);
867
my $s = sprintf "<U%04X> ",$u;
868
#foreach my $c (split(//,$out_bytes)) {
869
# $s .= sprintf "\\x%02X",ord($c);
871
# 9.5% faster changing that loop to this:
872
$s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
873
$s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
876
warn join(',',$u, @{$raw->{$key}},$a,$t);
883
my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
884
print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
885
print $fh "<code_set_name> \"$name\"\n";
889
print $fh "<mb_cur_min> $min_el\n";
893
print $fh "<mb_cur_max> $max_el\n";
897
print $fh "<subchar> ";
898
foreach my $c (split(//,$rep))
900
printf $fh "\\x%02X",ord($c);
905
output_ucm_page(\@cmap,$h,$h,0);
906
print $fh "#\nCHARMAP\n";
907
foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
911
print $fh "END CHARMAP\n";
925
eval { require File::Find; };
928
push @inc, $inc unless $inc eq '.'; #skip current dir
932
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
933
$atime,$mtime,$ctime,$blksize,$blocks)
934
= lstat($_) or return;
938
$e2x_dir{$File::Find::dir} ||= $mtime;
942
warn join("\n", keys %e2x_dir), "\n";
943
for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
945
# warn "$_E2X => ", scalar localtime($e2x_dir{$d});
952
eval { require Encode; };
953
$@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
954
# our used for variable expanstion
956
$_Version = $VERSION;
959
$_TableFiles = join(",", map {qq('$_')} @_);
960
$_Now = scalar localtime();
962
eval { require File::Spec; };
963
_print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
964
_print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
965
_print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
966
_print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
967
_print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
976
sub make_configlocal_pm {
977
eval { require Encode; };
978
$@ and die "Unable to require Encode: $@\n";
979
eval { require File::Spec; };
981
# our used for variable expanstion
982
my %in_core = map { $_ => 1 } (
983
'ascii', 'iso-8859-1', 'utf8',
984
'ascii-ctrl', 'null', 'utf-8-strict'
991
$File::Find::name =~ /\A\./ and return;
992
$File::Find::name =~ /\.pm\z/ or return;
993
$File::Find::name =~ m/\bEncode\b/ or return;
994
my $mod = $File::Find::name;
995
$mod =~ s/.*\bEncode\b/Encode/o;
998
warn qq{ require $mod;\n};
999
eval qq{ require $mod; };
1000
$@ and die "Can't require $mod: $@\n";
1001
for my $enc ( Encode->encodings() ) {
1003
$in_core{$enc} and next;
1004
$Encode::Config::ExtModule{$enc} and next;
1005
$LocalMod{$enc} ||= $mod;
1008
File::Find::find({wanted => $wanted}, @INC);
1010
for my $enc ( sort keys %LocalMod ) {
1012
qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1015
$_LocalVer = _mkversion();
1017
$_Inc = $INC{"Encode.pm"};
1018
$_Inc =~ s/\.pm$//o;
1019
_print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1020
File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1025
# v-string is now depreciated; use time() instead;
1026
#my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1027
#$yyyy += 1900, $mo +=1;
1028
#return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1033
eval { require File::Basename; };
1034
$@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
1035
File::Basename->import();
1036
my ($src, $dst, $clobber) = @_;
1037
if (!$clobber and -e $dst){
1038
warn "$dst exists. skipping\n";
1041
warn "Generating $dst...\n";
1042
open my $in, $src or die "$src : $!";
1043
if ((my $d = dirname($dst)) ne '.'){
1044
-d $d or mkdir $d, 0755 or die "mkdir $d : $!";
1046
open my $out, ">$dst" or die "$!";
1049
if (/^#### END_OF_HEADER/){
1052
s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1060
enc2xs -- Perl Encode Module Generator
1065
enc2xs -M ModName mapfiles...
1070
F<enc2xs> builds a Perl extension for use by Encode from either
1071
Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1072
Besides being used internally during the build process of the Encode
1073
module, you can use F<enc2xs> to add your own encoding to perl.
1074
No knowledge of XS is necessary.
1078
If you want to know as little about Perl as possible but need to
1079
add a new encoding, just read this chapter and forget the rest.
1085
Have a .ucm file ready. You can get it from somewhere or you can write
1086
your own from scratch or you can grab one from the Encode distribution
1087
and customize it. For the UCM format, see the next Chapter. In the
1088
example below, I'll call my theoretical encoding myascii, defined
1089
in I<my.ucm>. C<$> is a shell prompt.
1096
Issue a command as follows;
1098
$ enc2xs -M My my.ucm
1099
generating Makefile.PL
1104
Now take a look at your current directory. It should look like this.
1107
Makefile.PL My.pm my.ucm t/
1109
The following files were created.
1111
Makefile.PL - MakeMaker script
1112
My.pm - Encode submodule
1119
If you want *.ucm installed together with the modules, do as follows;
1123
$ enc2xs -M My Encode/*ucm
1129
Edit the files generated. You don't have to if you have no time AND no
1130
intention to give it to someone else. But it is a good idea to edit
1131
the pod and to add more tests.
1135
Now issue a command all Perl Mongers love:
1138
Writing Makefile for Encode::My
1142
Now all you have to do is make.
1145
cp My.pm blib/lib/Encode/My.pm
1146
/usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1147
-o encode_t.c -f encode_t.fnm
1148
Reading myascii (myascii)
1149
Writing compiled form
1150
128 bytes in string tables
1151
384 bytes (75%) saved spotting duplicates
1152
1 bytes (0.775%) saved using substrings
1154
chmod 644 blib/arch/auto/Encode/My/My.bs
1157
The time it takes varies depending on how fast your machine is and
1158
how large your encoding is. Unless you are working on something big
1159
like euc-tw, it won't take too long.
1163
You can "make install" already but you should test first.
1166
PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1167
-e 'use Test::Harness qw(&runtests $verbose); \
1168
$verbose=0; runtests @ARGV;' t/*.t
1170
All tests successful.
1171
Files=1, Tests=2, 0 wallclock secs
1172
( 0.09 cusr + 0.01 csys = 0.09 CPU)
1176
If you are content with the test result, just "make install"
1180
If you want to add your encoding to Encode's demand-loading list
1181
(so you don't have to "use Encode::YourEncoding"), run
1185
to update Encode::ConfigLocal, a module that controls local settings.
1186
After that, "use Encode;" is enough to load your encodings on demand.
1190
=head1 The Unicode Character Map
1192
Encode uses the Unicode Character Map (UCM) format for source character
1193
mappings. This format is used by IBM's ICU package and was adopted
1194
by Nick Ing-Simmons for use with the Encode module. Since UCM is
1195
more flexible than Tcl's Encoding Map and far more user-friendly,
1196
this is the recommended format for Encode now.
1198
A UCM file looks like this.
1203
<code_set_name> "US-ascii" # Required
1204
<code_set_alias> "ascii" # Optional
1205
<mb_cur_min> 1 # Required; usually 1
1206
<mb_cur_max> 1 # Max. # of bytes/char
1207
<subchar> \x3F # Substitution char
1210
<U0000> \x00 |0 # <control>
1211
<U0001> \x01 |0 # <control>
1212
<U0002> \x02 |0 # <control>
1214
<U007C> \x7C |0 # VERTICAL LINE
1215
<U007D> \x7D |0 # RIGHT CURLY BRACKET
1216
<U007E> \x7E |0 # TILDE
1217
<U007F> \x7F |0 # <control>
1224
Anything that follows C<#> is treated as a comment.
1228
The header section continues until a line containing the word
1229
CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1230
pair per line. Strings used as values must be quoted. Barewords are
1231
treated as numbers. I<\xXX> represents a byte.
1233
Most of the keywords are self-explanatory. I<subchar> means
1234
substitution character, not subcharacter. When you decode a Unicode
1235
sequence to this encoding but no matching character is found, the byte
1236
sequence defined here will be used. For most cases, the value here is
1237
\x3F; in ASCII, this is a question mark.
1241
CHARMAP starts the character map section. Each line has a form as
1244
<UXXXX> \xXX.. |0 # comment
1246
| | +- Fallback flag
1247
| +-------- Encoded byte sequence
1248
+-------------- Unicode Character ID in hex
1250
The format is roughly the same as a header section except for the
1251
fallback flag: | followed by 0..3. The meaning of the possible
1252
values is as follows:
1258
Round trip safe. A character decoded to Unicode encodes back to the
1259
same byte sequence. Most characters have this flag.
1263
Fallback for unicode -> encoding. When seen, enc2xs adds this
1264
character for the encode map only.
1268
Skip sub-char mapping should there be no code point.
1272
Fallback for encoding -> unicode. When seen, enc2xs adds this
1273
character for the decode map only.
1279
And finally, END OF CHARMAP ends the section.
1283
When you are manually creating a UCM file, you should copy ascii.ucm
1284
or an existing encoding which is close to yours, rather than write
1285
your own from scratch.
1287
When you do so, make sure you leave at least B<U0000> to B<U0020> as
1288
is, unless your environment is EBCDIC.
1290
B<CAVEAT>: not all features in UCM are implemented. For example,
1291
icu:state is not used. Because of that, you need to write a perl
1292
module if you want to support algorithmical encodings, notably
1293
the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1294
L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1296
=head2 Coping with duplicate mappings
1298
When you create a map, you SHOULD make your mappings round-trip safe.
1299
That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1300
$data> stands for all characters that are marked as C<|0>. Here is
1307
Sort your map in Unicode order.
1311
When you have a duplicate entry, mark either one with '|1' or '|3'.
1315
And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1319
Here is an example from big5-eten.
1324
Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1328
--------------------------------------
1329
\xF9\xF9 => U2550 U2550 => \xF9\xF9
1332
So it is round-trip safe for \xF9\xF9. But if the line above is upside
1333
down, here is what happens.
1336
--------------------------------------
1337
\xA2\xA4 => U2550 U2550 => \xF9\xF9
1338
(\xF9\xF9 => U2550 is now overwritten!)
1340
The Encode package comes with F<ucmlint>, a crude but sufficient
1341
utility to check the integrity of a UCM file. Check under the
1342
Encode/bin directory for this.
1344
When in doubt, you can use F<ucmsort>, yet another utility under
1345
Encode/bin directory.
1354
L<http://oss.software.ibm.com/icu/>
1358
ICU Character Mapping Tables
1359
L<http://oss.software.ibm.com/icu/charset/>
1364
L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1376
# -Q to disable the duplicate codepoint test
1377
# -S make mapping errors fatal
1378
# -q to remove comments written to output files
1379
# -O to enable the (brute force) substring optimiser
1380
# -o <output> to specify the output file name (else it's the first arg)
1381
# -f <inlist> to give a file with a list of input files (else use the args)
1382
# -n <name> to name the encoding (else use the basename of the input file.
1384
With %seen holding array refs:
1386
865.66 real 28.80 user 8.79 sys
1387
7904 maximum resident set size
1388
1356 average shared memory size
1389
18566 average unshared data size
1390
229 average unshared stack size
1394
With %seen holding simple scalars:
1396
342.16 real 27.11 user 3.54 sys
1397
8388 maximum resident set size
1398
1394 average shared memory size
1399
14969 average unshared data size
1400
236 average unshared stack size
1404
Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1405
how %seen is storing things its seen. So it is pathalogically bad on a 16M
1406
RAM machine, but it's going to help even on modern machines.
1407
Swapping is bad, m'kay :-)