~ubuntu-branches/ubuntu/raring/libencode-perl/raring

« back to all changes in this revision

Viewing changes to bin/enc2xs

Committer: Bazaar Package Importer
Author(s): Jose Luis Rivas
Date: 2007-05-18 23:49:27 UTC
Revision ID: james.westby@ubuntu.com-20070518234927-bs37c807cty7i1ny

Tags: upstream-2.21

Import upstream version 2.21

files added:

AUTHORS

Byte

Byte/Byte.pm

Byte/Makefile.PL

CN/CN.pm

CN/Makefile.PL

Changes

EBCDIC

EBCDIC/EBCDIC.pm

EBCDIC/Makefile.PL

Encode

Encode.pm

Encode.xs

Encode/Changes.e2x

Encode/ConfigLocal_PM.e2x

Encode/Makefile_PL.e2x

Encode/README.e2x

Encode/_PM.e2x

Encode/_T.e2x

Encode/encode.h

JP/JP.pm

JP/Makefile.PL

KR/KR.pm

KR/Makefile.PL

MANIFEST

META.yml

Makefile.PL

README

Symbol

Symbol/Makefile.PL

Symbol/Symbol.pm

TW/Makefile.PL

TW/TW.pm

Unicode

Unicode/Makefile.PL

Unicode/Unicode.pm

Unicode/Unicode.xs

bin/enc2xs

bin/piconv

bin/ucm2table

bin/ucmlint

bin/ucmsort

bin/unidump

encengine.c

encoding.pm

lib/Encode

lib/Encode/Alias.pm

lib/Encode/CJKConstants.pm

lib/Encode/CN

lib/Encode/CN/HZ.pm

lib/Encode/Config.pm

lib/Encode/Encoder.pm

lib/Encode/Encoding.pm

lib/Encode/GSM0338.pm

lib/Encode/Guess.pm

lib/Encode/JP

lib/Encode/JP/H2Z.pm

lib/Encode/JP/JIS7.pm

lib/Encode/KR

lib/Encode/KR/2022_KR.pm

lib/Encode/MIME

lib/Encode/MIME/Header

lib/Encode/MIME/Header.pm

lib/Encode/MIME/Header/ISO_2022_JP.pm

lib/Encode/MIME/Name.pm

lib/Encode/PerlIO.pod

lib/Encode/Supported.pod

lib/Encode/Unicode

lib/Encode/Unicode/UTF7.pm

t/Aliases.t

t/CJKT.t

t/Encode.t

t/Encoder.t

t/Mod_EUCJP.pm

t/Unicode.t

t/at-cn.t

t/at-tw.t

t/big5-eten.enc

t/big5-eten.utf

t/big5-hkscs.enc

t/big5-hkscs.utf

t/enc_data.t

t/enc_eucjp.t

t/enc_module.enc

t/enc_module.t

t/enc_utf8.t

t/encoding.t

t/fallback.t

t/from_to.t

t/gb2312.enc

t/gb2312.utf

t/grow.t

t/gsm0338.t

t/guess.t

t/jis7-fallback.t

t/jisx0201.enc

t/jisx0201.utf

t/jisx0208.enc

t/jisx0208.utf

t/jisx0212.enc

t/jisx0212.utf

t/jperl.t

t/ksc5601.enc

t/ksc5601.utf

t/mime-header.t

t/mime-name.t

t/mime_header_iso2022jp.t

t/perlio.t

t/rt.pl

t/unibench.pl

t/utf8strict.t

ucm/8859-1.ucm

ucm/8859-10.ucm

ucm/8859-11.ucm

ucm/8859-13.ucm

ucm/8859-14.ucm

ucm/8859-15.ucm

ucm/8859-16.ucm

ucm/8859-2.ucm

ucm/8859-3.ucm

ucm/8859-4.ucm

ucm/8859-5.ucm

ucm/8859-6.ucm

ucm/8859-7.ucm

ucm/8859-8.ucm

ucm/8859-9.ucm

ucm/adobeStdenc.ucm

ucm/adobeSymbol.ucm

ucm/adobeZdingbat.ucm

ucm/ascii.ucm

ucm/big5-eten.ucm

ucm/big5-hkscs.ucm

ucm/cp037.ucm

ucm/cp1006.ucm

ucm/cp1026.ucm

ucm/cp1047.ucm

ucm/cp1250.ucm

ucm/cp1251.ucm

ucm/cp1252.ucm

ucm/cp1253.ucm

ucm/cp1254.ucm

ucm/cp1255.ucm

ucm/cp1256.ucm

ucm/cp1257.ucm

ucm/cp1258.ucm

ucm/cp424.ucm

ucm/cp437.ucm

ucm/cp500.ucm

ucm/cp737.ucm

ucm/cp775.ucm

ucm/cp850.ucm

ucm/cp852.ucm

ucm/cp855.ucm

ucm/cp856.ucm

ucm/cp857.ucm

ucm/cp860.ucm

ucm/cp861.ucm

ucm/cp862.ucm

ucm/cp863.ucm

ucm/cp864.ucm

ucm/cp865.ucm

ucm/cp866.ucm

ucm/cp869.ucm

ucm/cp874.ucm

ucm/cp875.ucm

ucm/cp932.ucm

ucm/cp936.ucm

ucm/cp949.ucm

ucm/cp950.ucm

ucm/ctrl.ucm

ucm/dingbats.ucm

ucm/euc-cn.ucm

ucm/euc-jp.ucm

ucm/euc-kr.ucm

ucm/gb12345.ucm

ucm/gb2312.ucm

ucm/hp-roman8.ucm

ucm/ir-165.ucm

ucm/jis0201.ucm

ucm/jis0208.ucm

ucm/jis0212.ucm

ucm/johab.ucm

ucm/koi8-f.ucm

ucm/koi8-r.ucm

ucm/koi8-u.ucm

ucm/ksc5601.ucm

ucm/macArabic.ucm

ucm/macCentEuro.ucm

ucm/macChinsimp.ucm

ucm/macChintrad.ucm

ucm/macCroatian.ucm

ucm/macCyrillic.ucm

ucm/macDingbats.ucm

ucm/macFarsi.ucm

ucm/macGreek.ucm

ucm/macHebrew.ucm

ucm/macIceland.ucm

ucm/macJapanese.ucm

ucm/macKorean.ucm

ucm/macROMnn.ucm

ucm/macRUMnn.ucm

ucm/macRoman.ucm

ucm/macSami.ucm

ucm/macSymbol.ucm

ucm/macThai.ucm

ucm/macTurkish.ucm

ucm/macUkraine.ucm

ucm/nextstep.ucm

ucm/null.ucm

ucm/posix-bc.ucm

ucm/shiftjis.ucm

ucm/symbol.ucm

ucm/viscii.ucm

Show diffs side-by-side

added added

removed removed

bin/enc2xs

#!./perl

BEGIN {

# @INC poking no longer needed w/ new MakeMaker and Makefile.PL's

# with $ENV{PERL_CORE} set

# In case we need it in future...

require Config; import Config;

}

use strict;

use warnings;

use Getopt::Std;

use Config;

my @orig_ARGV = @ARGV;

our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# These may get re-ordered.

# RAW is a do_now as inserted by &enter

# AGG is an aggreagated do_now, as built up by &process

use constant {

RAW_NEXT => 0,

RAW_IN_LEN => 1,

RAW_OUT_BYTES => 2,

RAW_FALLBACK => 3,

AGG_MIN_IN => 0,

AGG_MAX_IN => 1,

AGG_OUT_BYTES => 2,

AGG_NEXT => 3,

AGG_IN_LEN => 4,

AGG_OUT_LEN => 5,

AGG_FALLBACK => 6,

};

# (See the algorithm in encengine.c - we're building structures for it)

# There are two sorts of structures.

# "do_now" (an array, two variants of what needs storing) is whatever we need

# to do now we've read an input byte.

# It's housed in a "do_next" (which is how we got to it), and in turn points

# to a "do_next" which contains all the "do_now"s for the next input byte.

# There will be a "do_next" which is the start state.

# For a single byte encoding it's the only "do_next" - each "do_now" points

# back to it, and each "do_now" will cause bytes. There is no state.

# For a multi-byte encoding where all characters in the input are the same

# length, then there will be a tree of "do_now"->"do_next"->"do_now"

# branching out from the start state, one step for each input byte.

# The leaf "do_now"s will all be at the same distance from the start state,

# only the leaf "do_now"s cause output bytes, and they in turn point back to

# the start state.

# For an encoding where there are varaible length input byte sequences, you

# will encounter a leaf "do_now" sooner for the shorter input sequences, but

# as before the leaves will point back to the start state.

# The system will cope with escape encodings (imagine them as a mostly

# self-contained tree for each escape state, and cross links between trees

# at the state-switching characters) but so far no input format defines these.

# The system will also cope with having output "leaves" in the middle of

# the bifurcating branches, not just at the extremities, but again no

# input format does this yet.

# There are two variants of the "do_now" structure. The first, smaller variant

# is generated by &enter as the input file is read. There is one structure

# for each input byte. Say we are mapping a single byte encoding to a

# single byte encoding, with "ABCD" going "abcd". There will be

# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}

# &process then walks the tree, building aggregate "do_now" structres for

# adjacent bytes where possible. The aggregate is for a contiguous range of

# bytes which each produce the same length of output, each move to the

# same next state, and each have the same fallback flag.

# So our 4 RAW "do_now"s above become replaced by a single structure

# containing:

# ["A", "D", "abcd", 1, ...]

# ie, for an input byte $_ in "A".."D", output 1 byte, found as

# substr ("abcd", (ord $_ - ord "A") * 1, 1)

# which maps very nicely into pointer arithmetic in C for encengine.c

sub encode_U

{

# UTF-8 encode long hand - only covers part of perl's range

## my $uv = shift;

# chr() works in native space so convert value from table

# into that space before using chr().

my $ch = chr(utf8::unicode_to_native($_[0]));

# Now get core perl to encode that the way it likes.

utf8::encode($ch);

return $ch;

}

sub encode_S

{

# encode single byte

## my ($ch,$page) = @_; return chr($ch);

return chr $_[0];

}

100

101

sub encode_D

102

{

103

# encode double byte MS byte first

104

## my ($ch,$page) = @_; return chr($page).chr($ch);

105

return chr ($_[1]) . chr $_[0];

106

}

107

108

sub encode_M

109

{

110

# encode Multi-byte - single for 0..255 otherwise double

111

## my ($ch,$page) = @_;

112

## return &encode_D if $page;

113

## return &encode_S;

114

return chr ($_[1]) . chr $_[0] if $_[1];

115

return chr $_[0];

116

}

117

118

my %encode_types = (U => \&encode_U,

119

S => \&encode_S,

120

D => \&encode_D,

121

M => \&encode_M,

122

);

123

124

# Win32 does not expand globs on command line

125

eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');

126

127

my %opt;

128

# I think these are:

129

# -Q to disable the duplicate codepoint test

130

# -S make mapping errors fatal

131

# -q to remove comments written to output files

132

# -O to enable the (brute force) substring optimiser

133

# -o <output> to specify the output file name (else it's the first arg)

134

# -f <inlist> to give a file with a list of input files (else use the args)

135

# -n <name> to name the encoding (else use the basename of the input file.

136

getopts('CM:SQqOo:f:n:',\%opt);

137

138

$opt{M} and make_makefile_pl($opt{M}, @ARGV);

139

$opt{C} and make_configlocal_pm($opt{C}, @ARGV);

140

141

# This really should go first, else the die here causes empty (non-erroneous)

142

# output files to be written.

143

my @encfiles;

144

if (exists $opt{'f'}) {

145

# -F is followed by name of file containing list of filenames

146

my $flist = $opt{'f'};

147

open(FLIST,$flist) || die "Cannot open $flist:$!";

148

chomp(@encfiles = <FLIST>);

149

close(FLIST);

150

} else {

151

@encfiles = @ARGV;

152

}

153

154

my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);

155

chmod(0666,$cname) if -f $cname && !-w $cname;

156

open(C,">$cname") || die "Cannot open $cname:$!";

157

158

my $dname = $cname;

159

my $hname = $cname;

160

161

my ($doC,$doEnc,$doUcm,$doPet);

162

163

if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined

164

{

165

$doC = 1;

166

$dname =~ s/(\.[^\.]*)?$/.exh/;

167

chmod(0666,$dname) if -f $cname && !-w $dname;

168

open(D,">$dname") || die "Cannot open $dname:$!";

169

$hname =~ s/(\.[^\.]*)?$/.h/;

170

chmod(0666,$hname) if -f $cname && !-w $hname;

171

open(H,">$hname") || die "Cannot open $hname:$!";

172

173

foreach my $fh (\*C,\*D,\*H)

174

{

175

print $fh <<"END" unless $opt{'q'};

176

177

!!!!!!! DO NOT EDIT THIS FILE !!!!!!!

178

This file was autogenerated by:

179

$^X $0 @orig_ARGV

180

enc2xs VERSION $VERSION

181

182

END

183

}

184

185

if ($cname =~ /(\w+)\.xs$/)

186

{

187

print C "#include <EXTERN.h>\n";

188

print C "#include <perl.h>\n";

189

print C "#include <XSUB.h>\n";

190

print C "#define U8 U8\n";

191

}

192

print C "#include \"encode.h\"\n\n";

193

194

}

195

elsif ($cname =~ /\.enc$/)

196

{

197

$doEnc = 1;

198

}

199

elsif ($cname =~ /\.ucm$/)

200

{

201

$doUcm = 1;

202

}

203

elsif ($cname =~ /\.pet$/)

204

{

205

$doPet = 1;

206

}

207

208

my %encoding;

209

my %strings;

210

my $string_acc;

211

my %strings_in_acc;

212

213

my $saved = 0;

214

my $subsave = 0;

215

my $strings = 0;

216

217

sub cmp_name

218

{

219

if ($a =~ /^.*-(\d+)/)

220

{

221

my $an = $1;

222

if ($b =~ /^.*-(\d+)/)

223

{

224

my $r = $an <=> $1;

225

return $r if $r;

226

}

227

}

228

return $a cmp $b;

229

}

230

231

232

foreach my $enc (sort cmp_name @encfiles)

233

{

234

my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;

235

$name = $opt{'n'} if exists $opt{'n'};

236

if (open(E,$enc))

237

{

238

if ($sfx eq 'enc')

239

{

240

compile_enc(\*E,lc($name));

241

}

242

else

243

{

244

compile_ucm(\*E,lc($name));

245

}

246

}

247

else

248

{

249

warn "Cannot open $enc for $name:$!";

250

}

251

}

252

253

if ($doC)

254

{

255

print STDERR "Writing compiled form\n";

256

foreach my $name (sort cmp_name keys %encoding)

257

{

258

my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};

259

process($name.'_utf8',$e2u);

260

addstrings(\*C,$e2u);

261

262

process('utf8_'.$name,$u2e);

263

addstrings(\*C,$u2e);

264

}

265

outbigstring(\*C,"enctable");

266

foreach my $name (sort cmp_name keys %encoding)

267

{

268

my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};

269

outtable(\*C,$e2u, "enctable");

270

outtable(\*C,$u2e, "enctable");

271

272

# push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));

273

}

274

my $cpp = ($Config{d_cplusplus} || '') eq 'define';

275

my $exta = $cpp ? 'extern "C" ' : "static";

276

my $extb = $cpp ? 'extern "C" ' : "";

277

foreach my $enc (sort cmp_name keys %encoding)

278

{

279

# my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};

280

my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};

281

#my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);

282

my $replen = 0;

283

$replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);

284

my $sym = "${enc}_encoding";

285

$sym =~ s/\W+/_/g;

286

my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,

287

$min_el,$max_el);

288

print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";

289

print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";

290

print C "${extb} const encode_t $sym = \n";

291

# This is to make null encoding work -- dankogai

292

for (my $i = (scalar @info) - 1; $i >= 0; --$i){

293

$info[$i] ||= 1;

294

}

295

# end of null tweak -- dankogai

296

print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";

297

}

298

299

foreach my $enc (sort cmp_name keys %encoding)

300

{

301

my $sym = "${enc}_encoding";

302

$sym =~ s/\W+/_/g;

303

print H "extern encode_t $sym;\n";

304

print D " Encode_XSEncoding(aTHX_ &$sym);\n";

305

}

306

307

if ($cname =~ /(\w+)\.xs$/)

308

{

309

my $mod = $1;

310

print C <<'END';

311

312

static void

313

Encode_XSEncoding(pTHX_ encode_t *enc)

314

{

315

dSP;

316

HV *stash = gv_stashpv("Encode::XS", TRUE);

317

SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);

318

int i = 0;

319

PUSHMARK(sp);

320

XPUSHs(sv);

321

while (enc->name[i])

322

{

323

const char *name = enc->name[i++];

324

XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));

325

}

326

PUTBACK;

327

call_pv("Encode::define_encoding",G_DISCARD);

328

SvREFCNT_dec(sv);

329

}

330

331

END

332

333

print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";

334

print C "BOOT:\n{\n";

335

print C "#include \"$dname\"\n";

336

print C "}\n";

337

}

338

# Close in void context is bad, m'kay

339

close(D) or warn "Error closing '$dname': $!";

340

close(H) or warn "Error closing '$hname': $!";

341

342

my $perc_saved = $saved/($strings + $saved) * 100;

343

my $perc_subsaved = $subsave/($strings + $subsave) * 100;

344

printf STDERR "%d bytes in string tables\n",$strings;

345

printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",

346

$saved, $perc_saved if $saved;

347

printf STDERR "%d bytes (%.3g%%) saved using substrings\n",

348

$subsave, $perc_subsaved if $subsave;

349

}

350

elsif ($doEnc)

351

{

352

foreach my $name (sort cmp_name keys %encoding)

353

{

354

my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};

355

output_enc(\*C,$name,$e2u);

356

}

357

}

358

elsif ($doUcm)

359

{

360

foreach my $name (sort cmp_name keys %encoding)

361

{

362

my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};

363

output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);

364

}

365

}

366

367

# writing half meg files and then not checking to see if you just filled the

368

# disk is bad, m'kay

369

close(C) or die "Error closing '$cname': $!";

370

371

# End of the main program.

372

373

sub compile_ucm

374

{

375

my ($fh,$name) = @_;

376

my $e2u = {};

377

my $u2e = {};

378

my $cs;

379

my %attr;

380

while (<$fh>)

381

{

382

s/#.*$//;

383

last if /^\s*CHARMAP\s*$/i;

384

if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr

385

{

386

$attr{$1} = $2;

387

}

388

}

389

if (!defined($cs = $attr{'code_set_name'}))

390

{

391

warn "No <code_set_name> in $name\n";

392

}

393

else

394

{

395

$name = $cs unless exists $opt{'n'};

396

}

397

my $erep;

398

my $urep;

399

my $max_el;

400

my $min_el;

401

if (exists $attr{'subchar'})

402

{

403

#my @byte;

404

#$attr{'subchar'} =~ /^\s*/cg;

405

#push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;

406

#$erep = join('',map(chr(hex($_)),@byte));

407

$erep = $attr{'subchar'};

408

$erep =~ s/^\s+//; $erep =~ s/\s+$//;

409

}

410

print "Reading $name ($cs)\n";

411

my $nfb = 0;

412

my $hfb = 0;

413

while (<$fh>)

414

{

415

s/#.*$//;

416

last if /^\s*END\s+CHARMAP\s*$/i;

417

next if /^\s*$/;

418

my (@uni, @byte) = ();

419

my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o

420

or die "Bad line: $_";

421

while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){

422

push @uni, map { substr($_, 1) } split(/\+/, $1);

423

}

424

while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){

425

push @byte, $1;

426

}

427

if (@uni)

428

{

429

my $uch = join('', map { encode_U(hex($_)) } @uni );

430

my $ech = join('',map(chr(hex($_)),@byte));

431

my $el = length($ech);

432

$max_el = $el if (!defined($max_el) || $el > $max_el);

433

$min_el = $el if (!defined($min_el) || $el < $min_el);

434

if (length($fb))

435

{

436

$fb = substr($fb,1);

437

$hfb++;

438

}

439

else

440

{

441

$nfb++;

442

$fb = '0';

443

}

444

# $fb is fallback flag

445

# 0 - round trip safe

446

# 1 - fallback for unicode -> enc

447

# 2 - skip sub-char mapping

448

# 3 - fallback enc -> unicode

449

enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);

450

enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);

451

}

452

else

453

{

454

warn $_;

455

}

456

}

457

if ($nfb && $hfb)

458

{

459

die "$nfb entries without fallback, $hfb entries with\n";

460

}

461

$encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];

462

}

463

464

465

466

sub compile_enc

467

{

468

my ($fh,$name) = @_;

469

my $e2u = {};

470

my $u2e = {};

471

472

my $type;

473

while ($type = <$fh>)

474

{

475

last if $type !~ /^\s*#/;

476

}

477

chomp($type);

478

return if $type eq 'E';

479

# Do the hash lookup once, rather than once per function call. 4% speedup.

480

my $type_func = $encode_types{$type};

481

my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));

482

warn "$type encoded $name\n";

483

my $rep = '';

484

# Save a defined test by setting these to defined values.

485

my $min_el = ~0; # A very big integer

486

my $max_el = 0; # Anything must be longer than 0

487

{

488

my $v = hex($def);

489

$rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);

490

}

491

my $errors;

492

my $seen;

493

# use -Q to silence the seen test. Makefile.PL uses this by default.

494

$seen = {} unless $opt{Q};

495

496

{

497

my $line = <$fh>;

498

chomp($line);

499

my $page = hex($line);

500

my $ch = 0;

501

my $i = 16;

502

503

{

504

# So why is it 1% faster to leave the my here?

505

my $line = <$fh>;

506

$line =~ s/\r\n$/\n/;

507

die "$.:${line}Line should be exactly 65 characters long including

508

newline (".length($line).")" unless length ($line) == 65;

509

# Split line into groups of 4 hex digits, convert groups to ints

510

# This takes 65.35

511

# map {hex $_} $line =~ /(....)/g

512

# This takes 63.75 (2.5% less time)

513

# unpack "n*", pack "H*", $line

514

# There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay

515

# Doing it as while ($line =~ /(....)/g) took 74.63

516

foreach my $val (unpack "n*", pack "H*", $line)

517

{

518

next if $val == 0xFFFD;

519

my $ech = &$type_func($ch,$page);

520

if ($val || (!$ch && !$page))

521

{

522

my $el = length($ech);

523

$max_el = $el if $el > $max_el;

524

$min_el = $el if $el < $min_el;

525

my $uch = encode_U($val);

526

if ($seen) {

527

# We're doing the test.

528

# We don't need to read this quickly, so storing it as a scalar,

529

# rather than 3 (anon array, plus the 2 scalars it holds) saves

530

# RAM and may make us faster on low RAM systems. [see __END__]

531

if (exists $seen->{$uch})

532

{

533

warn sprintf("U%04X is %02X%02X and %04X\n",

534

$val,$page,$ch,$seen->{$uch});

535

$errors++;

536

}

537

else

538

{

539

$seen->{$uch} = $page << 8 | $ch;

540

}

541

}

542

# Passing 2 extra args each time is 3.6% slower!

543

# Even with having to add $fallback ||= 0 later

544

enter_fb0($e2u,$ech,$uch);

545

enter_fb0($u2e,$uch,$ech);

546

}

547

else

548

{

549

# No character at this position

550

# enter($e2u,$ech,undef,$e2u);

551

}

552

$ch++;

553

}

554

} while --$i;

555

} while --$pages;

556

die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"

557

if $min_el > $max_el;

558

die "$errors mapping conflicts\n" if ($errors && $opt{'S'});

559

$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];

560

}

561

562

# my ($a,$s,$d,$t,$fb) = @_;

563

sub enter {

564

my ($current,$inbytes,$outbytes,$next,$fallback) = @_;

565

# state we shift to after this (multibyte) input character defaults to same

566

# as current state.

567

$next ||= $current;

568

# Making sure it is defined seems to be faster than {no warnings;} in

569

# &process, or passing it in as 0 explicity.

570

# XXX $fallback ||= 0;

571

572

# Start at the beginning and work forwards through the string to zero.

573

# effectively we are removing 1 character from the front each time

574

# but we don't actually edit the string. [this alone seems to be 14% speedup]

575

# Hence -$pos is the length of the remaining string.

576

my $pos = -length $inbytes;

577

while (1) {

578

my $byte = substr $inbytes, $pos, 1;

579

# RAW_NEXT => 0,

580

# RAW_IN_LEN => 1,

581

# RAW_OUT_BYTES => 2,

582

# RAW_FALLBACK => 3,

583

# to unicode an array would seem to be better, because the pages are dense.

584

# from unicode can be very sparse, favouring a hash.

585

# hash using the bytes (all length 1) as keys rather than ord value,

586

# as it's easier to sort these in &process.

587

588

# It's faster to always add $fallback even if it's undef, rather than

589

# choosing between 3 and 4 element array. (hence why we set it defined

590

# above)

591

my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];

592

# When $pos was -1 we were at the last input character.

593

unless (++$pos) {

594

$do_now->[RAW_OUT_BYTES] = $outbytes;

595

$do_now->[RAW_NEXT] = $next;

596

return;

597

}

598

# Tail recursion. The intermdiate state may not have a name yet.

599

$current = $do_now->[RAW_NEXT];

600

}

601

}

602

603

# This is purely for optimistation. It's just &enter hard coded for $fallback

604

# of 0, using only a 3 entry array ref to save memory for every entry.

605

sub enter_fb0 {

606

my ($current,$inbytes,$outbytes,$next) = @_;

607

$next ||= $current;

608

609

my $pos = -length $inbytes;

610

while (1) {

611

my $byte = substr $inbytes, $pos, 1;

612

my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];

613

unless (++$pos) {

614

$do_now->[RAW_OUT_BYTES] = $outbytes;

615

$do_now->[RAW_NEXT] = $next;

616

return;

617

}

618

$current = $do_now->[RAW_NEXT];

619

}

620

}

621

622

sub process

623

{

624

my ($name,$a) = @_;

625

$name =~ s/\W+/_/g;

626

$a->{Cname} = $name;

627

my $raw = $a->{Raw};

628

my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);

629

my @ent;

630

$agg_max_in = 0;

631

foreach my $key (sort keys %$raw) {

632

# RAW_NEXT => 0,

633

# RAW_IN_LEN => 1,

634

# RAW_OUT_BYTES => 2,

635

# RAW_FALLBACK => 3,

636

my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};

637

# Now we are converting from raw to aggregate, switch from 1 byte strings

638

# to numbers

639

my $b = ord $key;

640

$fallback ||= 0;

641

if ($l &&

642

# If this == fails, we're going to reset $agg_max_in below anyway.

643

$b == ++$agg_max_in &&

644

# References in numeric context give the pointer as an int.

645

$agg_next == $next &&

646

$agg_in_len == $in_len &&

647

$agg_out_len == length $out_bytes &&

648

$agg_fallback == $fallback

649

# && length($l->[AGG_OUT_BYTES]) < 16

650

) {

651

# my $i = ord($b)-ord($l->[AGG_MIN_IN]);

652

# we can aggregate this byte onto the end.

653

$l->[AGG_MAX_IN] = $b;

654

$l->[AGG_OUT_BYTES] .= $out_bytes;

655

} else {

656

# AGG_MIN_IN => 0,

657

# AGG_MAX_IN => 1,

658

# AGG_OUT_BYTES => 2,

659

# AGG_NEXT => 3,

660

# AGG_IN_LEN => 4,

661

# AGG_OUT_LEN => 5,

662

# AGG_FALLBACK => 6,

663

# Reset the last thing we saw, plus set 5 lexicals to save some derefs.

664

# (only gains .6% on euc-jp -- is it worth it?)

665

push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,

666

$agg_in_len = $in_len, $agg_out_len = length $out_bytes,

667

$agg_fallback = $fallback];

668

}

669

if (exists $next->{Cname}) {

670

$next->{'Forward'} = 1 if $next != $a;

671

} else {

672

process(sprintf("%s_%02x",$name,$b),$next);

673

}

674

}

675

# encengine.c rules say that last entry must be for 255

676

if ($agg_max_in < 255) {

677

push @ent, [1+$agg_max_in, 255,undef,$a,0,0];

678

}

679

$a->{'Entries'} = \@ent;

680

}

681

682

683

sub addstrings

684

{

685

my ($fh,$a) = @_;

686

my $name = $a->{'Cname'};

687

# String tables

688

foreach my $b (@{$a->{'Entries'}})

689

{

690

next unless $b->[AGG_OUT_LEN];

691

$strings{$b->[AGG_OUT_BYTES]} = undef;

692

}

693

if ($a->{'Forward'})

694

{

695

my $cpp = ($Config{d_cplusplus} || '') eq 'define';

696

my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';

697

my $const = $cpp ? '' : 'const';

698

print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";

699

}

700

$a->{'DoneStrings'} = 1;

701

foreach my $b (@{$a->{'Entries'}})

702

{

703

my ($s,$e,$out,$t,$end,$l) = @$b;

704

addstrings($fh,$t) unless $t->{'DoneStrings'};

705

}

706

}

707

708

sub outbigstring

709

{

710

my ($fh,$name) = @_;

711

712

$string_acc = '';

713

714

# Make the big string in the string accumulator. Longest first, on the hope

715

# that this makes it more likely that we find the short strings later on.

716

# Not sure if it helps sorting strings of the same length lexcically.

717

foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {

718

my $index = index $string_acc, $s;

719

if ($index >= 0) {

720

$saved += length($s);

721

$strings_in_acc{$s} = $index;

722

} else {

723

OPTIMISER: {

724

if ($opt{'O'}) {

725

my $sublength = length $s;

726

while (--$sublength > 0) {

727

# progressively lop characters off the end, to see if the start of

728

# the new string overlaps the end of the accumulator.

729

if (substr ($string_acc, -$sublength)

730

eq substr ($s, 0, $sublength)) {

731

$subsave += $sublength;

732

$strings_in_acc{$s} = length ($string_acc) - $sublength;

733

# append the last bit on the end.

734

$string_acc .= substr ($s, $sublength);

735

last OPTIMISER;

736

}

737

# or if the end of the new string overlaps the start of the

738

# accumulator

739

next unless substr ($string_acc, 0, $sublength)

740

eq substr ($s, -$sublength);

741

# well, the last $sublength characters of the accumulator match.

742

# so as we're prepending to the accumulator, need to shift all our

743

# existing offsets forwards

744

$_ += $sublength foreach values %strings_in_acc;

745

$subsave += $sublength;

746

$strings_in_acc{$s} = 0;

747

# append the first bit on the start.

748

$string_acc = substr ($s, 0, -$sublength) . $string_acc;

749

last OPTIMISER;

750

}

751

}

752

# Optimiser (if it ran) found nothing, so just going have to tack the

753

# whole thing on the end.

754

$strings_in_acc{$s} = length $string_acc;

755

$string_acc .= $s;

756

};

757

}

758

}

759

760

$strings = length $string_acc;

761

my $cpp = ($Config{d_cplusplus} || '') eq 'define';

762

my $var = $cpp ? '' : 'static';

763

my $definition = "\n$var const U8 $name\[$strings] = { " .

764

join(',',unpack "C*",$string_acc);

765

# We have a single long line. Split it at convenient commas.

766

print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;

767

print $fh substr ($definition, pos $definition), " };\n";

768

}

769

770

sub findstring {

771

my ($name,$s) = @_;

772

my $offset = $strings_in_acc{$s};

773

die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"

774

unless defined $offset;

775

"$name + $offset";

776

}

777

778

sub outtable

779

{

780

my ($fh,$a,$bigname) = @_;

781

my $name = $a->{'Cname'};

782

$a->{'Done'} = 1;

783

foreach my $b (@{$a->{'Entries'}})

784

{

785

my ($s,$e,$out,$t,$end,$l) = @$b;

786

outtable($fh,$t,$bigname) unless $t->{'Done'};

787

}

788

my $cpp = ($Config{d_cplusplus} || '') eq 'define';

789

my $var = $cpp ? '' : 'static';

790

my $const = $cpp ? '' : 'const';

791

print $fh "\n$var $const encpage_t $name\[",

792

scalar(@{$a->{'Entries'}}), "] = {\n";

793

foreach my $b (@{$a->{'Entries'}})

794

{

795

my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;

796

# $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan

797

print $fh "{";

798

if ($l)

799

{

800

printf $fh findstring($bigname,$out);

801

}

802

else

803

{

804

print $fh "0";

805

}

806

print $fh ",",$t->{Cname};

807

printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;

808

}

809

print $fh "};\n";

810

}

811

812

sub output_enc

813

{

814

my ($fh,$name,$a) = @_;

815

die "Changed - fix me for new structure";

816

foreach my $b (sort keys %$a)

817

{

818

my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};

819

}

820

}

821

822

sub decode_U

823

{

824

my $s = shift;

825

}

826

827

my @uname;

828

sub char_names

829

{

830

my $s = do "unicore/Name.pl";

831

die "char_names: unicore/Name.pl: $!\n" unless defined $s;

832

pos($s) = 0;

833

while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)

834

{

835

my $name = $3;

836

my $s = hex($1);

837

last if $s >= 0x10000;

838

my $e = length($2) ? hex($2) : $s;

839

for (my $i = $s; $i <= $e; $i++)

840

{

841

$uname[$i] = $name;

842

# print sprintf("U%04X $name\n",$i);

843

}

844

}

845

}

846

847

sub output_ucm_page

848

{

849

my ($cmap,$a,$t,$pre) = @_;

850

# warn sprintf("Page %x\n",$pre);

851

my $raw = $t->{Raw};

852

foreach my $key (sort keys %$raw) {

853

# RAW_NEXT => 0,

854

# RAW_IN_LEN => 1,

855

# RAW_OUT_BYTES => 2,

856

# RAW_FALLBACK => 3,

857

my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};

858

my $u = ord $key;

859

$fallback ||= 0;

860

861

if ($next != $a && $next != $t) {

862

output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);

863

} elsif (length $out_bytes) {

864

if ($pre) {

865

$u = $pre|($u &0x3f);

866

}

867

my $s = sprintf "<U%04X> ",$u;

868

#foreach my $c (split(//,$out_bytes)) {

869

# $s .= sprintf "\\x%02X",ord($c);

870

871

# 9.5% faster changing that loop to this:

872

$s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;

873

$s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];

874

push(@$cmap,$s);

875

} else {

876

warn join(',',$u, @{$raw->{$key}},$a,$t);

877

}

878

}

879

}

880

881

sub output_ucm

882

{

883

my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;

884

print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};

885

print $fh "<code_set_name> \"$name\"\n";

886

char_names();

887

if (defined $min_el)

888

{

889

print $fh "<mb_cur_min> $min_el\n";

890

}

891

if (defined $max_el)

892

{

893

print $fh "<mb_cur_max> $max_el\n";

894

}

895

if (defined $rep)

896

{

897

print $fh "<subchar> ";

898

foreach my $c (split(//,$rep))

899

{

900

printf $fh "\\x%02X",ord($c);

901

}

902

print $fh "\n";

903

}

904

my @cmap;

905

output_ucm_page(\@cmap,$h,$h,0);

906

print $fh "#\nCHARMAP\n";

907

foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)

908

{

909

print $fh $line;

910

}

911

print $fh "END CHARMAP\n";

912

}

913

914

use vars qw(

915

$_Enc2xs

916

$_Version

917

$_Inc

918

$_E2X

919

$_Name

920

$_TableFiles

921

$_Now

922

);

923

924

sub find_e2x{

925

eval { require File::Find; };

926

my (@inc, %e2x_dir);

927

for my $inc (@INC){

928

push @inc, $inc unless $inc eq '.'; #skip current dir

929

}

930

File::Find::find(

931

sub {

932

my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,

933

$atime,$mtime,$ctime,$blksize,$blocks)

934

= lstat($_) or return;

935

-f _ or return;

936

if (/^.*\.e2x$/o){

937

no warnings 'once';

938

$e2x_dir{$File::Find::dir} ||= $mtime;

939

}

940

return;

941

}, @inc);

942

warn join("\n", keys %e2x_dir), "\n";

943

for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){

944

$_E2X = $d;

945

# warn "$_E2X => ", scalar localtime($e2x_dir{$d});

946

return $_E2X;

947

}

948

}

949

950

sub make_makefile_pl

951

{

952

eval { require Encode; };

953

$@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";

954

# our used for variable expanstion

955

$_Enc2xs = $0;

956

$_Version = $VERSION;

957

$_E2X = find_e2x();

958

$_Name = shift;

959

$_TableFiles = join(",", map {qq('$_')} @_);

960

$_Now = scalar localtime();

961

962

eval { require File::Spec; };

963

_print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");

964

_print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");

965

_print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");

966

_print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");

967

_print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");

968

exit;

969

}

970

971

use vars qw(

972

$_ModLines

973

$_LocalVer

974

);

975

976

sub make_configlocal_pm {

977

eval { require Encode; };

978

$@ and die "Unable to require Encode: $@\n";

979

eval { require File::Spec; };

980

981

# our used for variable expanstion

982

my %in_core = map { $_ => 1 } (

983

'ascii', 'iso-8859-1', 'utf8',

984

'ascii-ctrl', 'null', 'utf-8-strict'

985

);

986

my %LocalMod = ();

987

# check @enc;

988

use File::Find ();

989

my $wanted = sub{

990

-f $_ or return;

991

$File::Find::name =~ /\A\./ and return;

992

$File::Find::name =~ /\.pm\z/ or return;

993

$File::Find::name =~ m/\bEncode\b/ or return;

994

my $mod = $File::Find::name;

995

$mod =~ s/.*\bEncode\b/Encode/o;

996

$mod =~ s/\.pm\z//o;

997

$mod =~ s,/,::,og;

998

warn qq{ require $mod;\n};

999

eval qq{ require $mod; };

1000

$@ and die "Can't require $mod: $@\n";

1001

for my $enc ( Encode->encodings() ) {

1002

no warnings;

1003

$in_core{$enc} and next;

1004

$Encode::Config::ExtModule{$enc} and next;

1005

$LocalMod{$enc} ||= $mod;

1006

}

1007

};

1008

File::Find::find({wanted => $wanted}, @INC);

1009

$_ModLines = "";

1010

for my $enc ( sort keys %LocalMod ) {

1011

$_ModLines .=

1012

qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);

1013

}

1014

warn $_ModLines;

1015

$_LocalVer = _mkversion();

1016

$_E2X = find_e2x();

1017

$_Inc = $INC{"Encode.pm"};

1018

$_Inc =~ s/\.pm$//o;

1019

_print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),

1020

File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );

1021

exit;

1022

}

1023

1024

sub _mkversion{

1025

# v-string is now depreciated; use time() instead;

1026

#my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();

1027

#$yyyy += 1900, $mo +=1;

1028

#return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);

1029

return time();

1030

}

1031

1032

sub _print_expand{

1033

eval { require File::Basename; };

1034

$@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";

1035

File::Basename->import();

1036

my ($src, $dst, $clobber) = @_;

1037

if (!$clobber and -e $dst){

1038

warn "$dst exists. skipping\n";

1039

return;

1040

}

1041

warn "Generating $dst...\n";

1042

open my $in, $src or die "$src : $!";

1043

if ((my $d = dirname($dst)) ne '.'){

1044

-d $d or mkdir $d, 0755 or die "mkdir $d : $!";

1045

}

1046

open my $out, ">$dst" or die "$!";

1047

my $asis = 0;

1048

while (<$in>){

1049

if (/^#### END_OF_HEADER/){

1050

$asis = 1; next;

1051

}

1052

s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;

1053

print $out $_;

1054

}

1055

}

1056

__END__

1057

1058

=head1 NAME

1059

1060

enc2xs -- Perl Encode Module Generator

1061

1062

=head1 SYNOPSIS

1063

1064

enc2xs -[options]

1065

enc2xs -M ModName mapfiles...

1066

enc2xs -C

1067

1068

=head1 DESCRIPTION

1069

1070

F<enc2xs> builds a Perl extension for use by Encode from either

1071

Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).

1072

Besides being used internally during the build process of the Encode

1073

module, you can use F<enc2xs> to add your own encoding to perl.

1074

No knowledge of XS is necessary.

1075

1076

=head1 Quick Guide

1077

1078

If you want to know as little about Perl as possible but need to

1079

add a new encoding, just read this chapter and forget the rest.

1080

1081

=over 4

1082

1083

=item 0.

1084

1085

Have a .ucm file ready. You can get it from somewhere or you can write

1086

your own from scratch or you can grab one from the Encode distribution

1087

and customize it. For the UCM format, see the next Chapter. In the

1088

example below, I'll call my theoretical encoding myascii, defined

1089

in I<my.ucm>. C<$> is a shell prompt.

1090

1091

$ ls -F

1092

my.ucm

1093

1094

=item 1.

1095

1096

Issue a command as follows;

1097

1098

$ enc2xs -M My my.ucm

1099

generating Makefile.PL

1100

generating My.pm

1101

generating README

1102

generating Changes

1103

1104

Now take a look at your current directory. It should look like this.

1105

1106

$ ls -F

1107

Makefile.PL My.pm my.ucm t/

1108

1109

The following files were created.

1110

1111

Makefile.PL - MakeMaker script

1112

My.pm - Encode submodule

1113

t/My.t - test file

1114

1115

=over 4

1116

1117

=item 1.1.

1118

1119

If you want *.ucm installed together with the modules, do as follows;

1120

1121

$ mkdir Encode

1122

$ mv *.ucm Encode

1123

$ enc2xs -M My Encode/*ucm

1124

1125

=back

1126

1127

=item 2.

1128

1129

Edit the files generated. You don't have to if you have no time AND no

1130

intention to give it to someone else. But it is a good idea to edit

1131

the pod and to add more tests.

1132

1133

=item 3.

1134

1135

Now issue a command all Perl Mongers love:

1136

1137

$ perl Makefile.PL

1138

Writing Makefile for Encode::My

1139

1140

=item 4.

1141

1142

Now all you have to do is make.

1143

1144

$ make

1145

cp My.pm blib/lib/Encode/My.pm

1146

/usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \

1147

-o encode_t.c -f encode_t.fnm

1148

Reading myascii (myascii)

1149

Writing compiled form

1150

128 bytes in string tables

1151

384 bytes (75%) saved spotting duplicates

1152

1 bytes (0.775%) saved using substrings

1153

....

1154

chmod 644 blib/arch/auto/Encode/My/My.bs

1155

1156

1157

The time it takes varies depending on how fast your machine is and

1158

how large your encoding is. Unless you are working on something big

1159

like euc-tw, it won't take too long.

1160

1161

=item 5.

1162

1163

You can "make install" already but you should test first.

1164

1165

$ make test

1166

PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \

1167

-e 'use Test::Harness qw(&runtests $verbose); \

1168

$verbose=0; runtests @ARGV;' t/*.t

1169

t/My....ok

1170

All tests successful.

1171

Files=1, Tests=2, 0 wallclock secs

1172

( 0.09 cusr + 0.01 csys = 0.09 CPU)

1173

1174

=item 6.

1175

1176

If you are content with the test result, just "make install"

1177

1178

=item 7.

1179

1180

If you want to add your encoding to Encode's demand-loading list

1181

(so you don't have to "use Encode::YourEncoding"), run

1182

1183

enc2xs -C

1184

1185

to update Encode::ConfigLocal, a module that controls local settings.

1186

After that, "use Encode;" is enough to load your encodings on demand.

1187

1188

=back

1189

1190

=head1 The Unicode Character Map

1191

1192

Encode uses the Unicode Character Map (UCM) format for source character

1193

mappings. This format is used by IBM's ICU package and was adopted

1194

by Nick Ing-Simmons for use with the Encode module. Since UCM is

1195

more flexible than Tcl's Encoding Map and far more user-friendly,

1196

this is the recommended format for Encode now.

1197

1198

A UCM file looks like this.

1199

1200

1201

# Comments

1202

1203

<code_set_name> "US-ascii" # Required

1204

<code_set_alias> "ascii" # Optional

1205

<mb_cur_min> 1 # Required; usually 1

1206

<mb_cur_max> 1 # Max. # of bytes/char

1207

<subchar> \x3F # Substitution char

1208

1209

CHARMAP

1210

1211

1212

1213

....

1214

<U007C> \x7C |0 # VERTICAL LINE

1215

<U007D> \x7D |0 # RIGHT CURLY BRACKET

1216

<U007E> \x7E |0 # TILDE

1217

1218

END CHARMAP

1219

1220

=over 4

1221

1222

=item *

1223

1224

Anything that follows C<#> is treated as a comment.

1225

1226

=item *

1227

1228

The header section continues until a line containing the word

1229

CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one

1230

pair per line. Strings used as values must be quoted. Barewords are

1231

treated as numbers. I<\xXX> represents a byte.

1232

1233

Most of the keywords are self-explanatory. I<subchar> means

1234

substitution character, not subcharacter. When you decode a Unicode

1235

sequence to this encoding but no matching character is found, the byte

1236

sequence defined here will be used. For most cases, the value here is

1237

\x3F; in ASCII, this is a question mark.

1238

1239

=item *

1240

1241

CHARMAP starts the character map section. Each line has a form as

1242

follows:

1243

1244

<UXXXX> \xXX.. |0 # comment

1245

^ ^ ^

1246

| | +- Fallback flag

1247

| +-------- Encoded byte sequence

1248

+-------------- Unicode Character ID in hex

1249

1250

The format is roughly the same as a header section except for the

1251

fallback flag: | followed by 0..3. The meaning of the possible

1252

values is as follows:

1253

1254

=over 4

1255

1256

=item |0

1257

1258

Round trip safe. A character decoded to Unicode encodes back to the

1259

same byte sequence. Most characters have this flag.

1260

1261

=item |1

1262

1263

Fallback for unicode -> encoding. When seen, enc2xs adds this

1264

character for the encode map only.

1265

1266

=item |2

1267

1268

Skip sub-char mapping should there be no code point.

1269

1270

=item |3

1271

1272

Fallback for encoding -> unicode. When seen, enc2xs adds this

1273

character for the decode map only.

1274

1275

=back

1276

1277

=item *

1278

1279

And finally, END OF CHARMAP ends the section.

1280

1281

=back

1282

1283

When you are manually creating a UCM file, you should copy ascii.ucm

1284

or an existing encoding which is close to yours, rather than write

1285

your own from scratch.

1286

1287

When you do so, make sure you leave at least B<U0000> to B<U0020> as

1288

is, unless your environment is EBCDIC.

1289

1290

B<CAVEAT>: not all features in UCM are implemented. For example,

1291

icu:state is not used. Because of that, you need to write a perl

1292

module if you want to support algorithmical encodings, notably

1293

the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,

1294

L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.

1295

1296

=head2 Coping with duplicate mappings

1297

1298

When you create a map, you SHOULD make your mappings round-trip safe.

1299

That is, C<encode('your-encoding', decode('your-encoding', $data)) eq

1300

$data> stands for all characters that are marked as C<|0>. Here is

1301

how to make sure:

1302

1303

=over 4

1304

1305

=item *

1306

1307

Sort your map in Unicode order.

1308

1309

=item *

1310

1311

When you have a duplicate entry, mark either one with '|1' or '|3'.

1312

1313

=item *

1314

1315

And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.

1316

1317

=back

1318

1319

Here is an example from big5-eten.

1320

1321

<U2550> \xF9\xF9 |0

1322

<U2550> \xA2\xA4 |3

1323

1324

Internally Encoding -> Unicode and Unicode -> Encoding Map looks like

1325

this;

1326

1327

E to U U to E

1328

--------------------------------------

1329

\xF9\xF9 => U2550 U2550 => \xF9\xF9

1330

\xA2\xA4 => U2550

1331

1332

So it is round-trip safe for \xF9\xF9. But if the line above is upside

1333

down, here is what happens.

1334

1335

E to U U to E

1336

--------------------------------------

1337

\xA2\xA4 => U2550 U2550 => \xF9\xF9

1338

(\xF9\xF9 => U2550 is now overwritten!)

1339

1340

The Encode package comes with F<ucmlint>, a crude but sufficient

1341

utility to check the integrity of a UCM file. Check under the

1342

Encode/bin directory for this.

1343

1344

When in doubt, you can use F<ucmsort>, yet another utility under

1345

Encode/bin directory.

1346

1347

=head1 Bookmarks

1348

1349

=over 4

1350

1351

=item *

1352

1353

ICU Home Page

1354

L<http://oss.software.ibm.com/icu/>

1355

1356

=item *

1357

1358

ICU Character Mapping Tables

1359

L<http://oss.software.ibm.com/icu/charset/>

1360

1361

=item *

1362

1363

ICU:Conversion Data

1364

L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>

1365

1366

=back

1367

1368

=head1 SEE ALSO

1369

1370

L<Encode>,

1371

L<perlmod>,

1372

L<perlpod>

1373

1374

=cut

1375

1376

# -Q to disable the duplicate codepoint test

1377

# -S make mapping errors fatal

1378

# -q to remove comments written to output files

1379

# -O to enable the (brute force) substring optimiser

1380

# -o <output> to specify the output file name (else it's the first arg)

1381

# -f <inlist> to give a file with a list of input files (else use the args)

1382

# -n <name> to name the encoding (else use the basename of the input file.

1383

1384

With %seen holding array refs:

1385

1386

865.66 real 28.80 user 8.79 sys

1387

7904 maximum resident set size

1388

1356 average shared memory size

1389

18566 average unshared data size

1390

229 average unshared stack size

1391

46080 page reclaims

1392

33373 page faults

1393

1394

With %seen holding simple scalars:

1395

1396

342.16 real 27.11 user 3.54 sys

1397

8388 maximum resident set size

1398

1394 average shared memory size

1399

14969 average unshared data size

1400

236 average unshared stack size

1401

28159 page reclaims

1402

9839 page faults

1403

1404

Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is

1405

how %seen is storing things its seen. So it is pathalogically bad on a 16M

1406

RAM machine, but it's going to help even on modern machines.

1407

Swapping is bad, m'kay :-)

Older »