~ubuntu-branches/ubuntu/raring/libencode-perl/raring

« back to all changes in this revision

Viewing changes to lib/Encode/Encoding.pm

Committer: Bazaar Package Importer
Author(s): Jose Luis Rivas
Date: 2007-05-18 23:49:27 UTC
Revision ID: james.westby@ubuntu.com-20070518234927-bs37c807cty7i1ny

Tags: upstream-2.21

Import upstream version 2.21

files added:

AUTHORS

Byte

Byte/Byte.pm

Byte/Makefile.PL

CN/CN.pm

CN/Makefile.PL

Changes

EBCDIC

EBCDIC/EBCDIC.pm

EBCDIC/Makefile.PL

Encode

Encode.pm

Encode.xs

Encode/Changes.e2x

Encode/ConfigLocal_PM.e2x

Encode/Makefile_PL.e2x

Encode/README.e2x

Encode/_PM.e2x

Encode/_T.e2x

Encode/encode.h

JP/JP.pm

JP/Makefile.PL

KR/KR.pm

KR/Makefile.PL

MANIFEST

META.yml

Makefile.PL

README

Symbol

Symbol/Makefile.PL

Symbol/Symbol.pm

TW/Makefile.PL

TW/TW.pm

Unicode

Unicode/Makefile.PL

Unicode/Unicode.pm

Unicode/Unicode.xs

bin/enc2xs

bin/piconv

bin/ucm2table

bin/ucmlint

bin/ucmsort

bin/unidump

encengine.c

encoding.pm

lib/Encode

lib/Encode/Alias.pm

lib/Encode/CJKConstants.pm

lib/Encode/CN

lib/Encode/CN/HZ.pm

lib/Encode/Config.pm

lib/Encode/Encoder.pm

lib/Encode/Encoding.pm

lib/Encode/GSM0338.pm

lib/Encode/Guess.pm

lib/Encode/JP

lib/Encode/JP/H2Z.pm

lib/Encode/JP/JIS7.pm

lib/Encode/KR

lib/Encode/KR/2022_KR.pm

lib/Encode/MIME

lib/Encode/MIME/Header

lib/Encode/MIME/Header.pm

lib/Encode/MIME/Header/ISO_2022_JP.pm

lib/Encode/MIME/Name.pm

lib/Encode/PerlIO.pod

lib/Encode/Supported.pod

lib/Encode/Unicode

lib/Encode/Unicode/UTF7.pm

t/Aliases.t

t/CJKT.t

t/Encode.t

t/Encoder.t

t/Mod_EUCJP.pm

t/Unicode.t

t/at-cn.t

t/at-tw.t

t/big5-eten.enc

t/big5-eten.utf

t/big5-hkscs.enc

t/big5-hkscs.utf

t/enc_data.t

t/enc_eucjp.t

t/enc_module.enc

t/enc_module.t

t/enc_utf8.t

t/encoding.t

t/fallback.t

t/from_to.t

t/gb2312.enc

t/gb2312.utf

t/grow.t

t/gsm0338.t

t/guess.t

t/jis7-fallback.t

t/jisx0201.enc

t/jisx0201.utf

t/jisx0208.enc

t/jisx0208.utf

t/jisx0212.enc

t/jisx0212.utf

t/jperl.t

t/ksc5601.enc

t/ksc5601.utf

t/mime-header.t

t/mime-name.t

t/mime_header_iso2022jp.t

t/perlio.t

t/rt.pl

t/unibench.pl

t/utf8strict.t

ucm/8859-1.ucm

ucm/8859-10.ucm

ucm/8859-11.ucm

ucm/8859-13.ucm

ucm/8859-14.ucm

ucm/8859-15.ucm

ucm/8859-16.ucm

ucm/8859-2.ucm

ucm/8859-3.ucm

ucm/8859-4.ucm

ucm/8859-5.ucm

ucm/8859-6.ucm

ucm/8859-7.ucm

ucm/8859-8.ucm

ucm/8859-9.ucm

ucm/adobeStdenc.ucm

ucm/adobeSymbol.ucm

ucm/adobeZdingbat.ucm

ucm/ascii.ucm

ucm/big5-eten.ucm

ucm/big5-hkscs.ucm

ucm/cp037.ucm

ucm/cp1006.ucm

ucm/cp1026.ucm

ucm/cp1047.ucm

ucm/cp1250.ucm

ucm/cp1251.ucm

ucm/cp1252.ucm

ucm/cp1253.ucm

ucm/cp1254.ucm

ucm/cp1255.ucm

ucm/cp1256.ucm

ucm/cp1257.ucm

ucm/cp1258.ucm

ucm/cp424.ucm

ucm/cp437.ucm

ucm/cp500.ucm

ucm/cp737.ucm

ucm/cp775.ucm

ucm/cp850.ucm

ucm/cp852.ucm

ucm/cp855.ucm

ucm/cp856.ucm

ucm/cp857.ucm

ucm/cp860.ucm

ucm/cp861.ucm

ucm/cp862.ucm

ucm/cp863.ucm

ucm/cp864.ucm

ucm/cp865.ucm

ucm/cp866.ucm

ucm/cp869.ucm

ucm/cp874.ucm

ucm/cp875.ucm

ucm/cp932.ucm

ucm/cp936.ucm

ucm/cp949.ucm

ucm/cp950.ucm

ucm/ctrl.ucm

ucm/dingbats.ucm

ucm/euc-cn.ucm

ucm/euc-jp.ucm

ucm/euc-kr.ucm

ucm/gb12345.ucm

ucm/gb2312.ucm

ucm/hp-roman8.ucm

ucm/ir-165.ucm

ucm/jis0201.ucm

ucm/jis0208.ucm

ucm/jis0212.ucm

ucm/johab.ucm

ucm/koi8-f.ucm

ucm/koi8-r.ucm

ucm/koi8-u.ucm

ucm/ksc5601.ucm

ucm/macArabic.ucm

ucm/macCentEuro.ucm

ucm/macChinsimp.ucm

ucm/macChintrad.ucm

ucm/macCroatian.ucm

ucm/macCyrillic.ucm

ucm/macDingbats.ucm

ucm/macFarsi.ucm

ucm/macGreek.ucm

ucm/macHebrew.ucm

ucm/macIceland.ucm

ucm/macJapanese.ucm

ucm/macKorean.ucm

ucm/macROMnn.ucm

ucm/macRUMnn.ucm

ucm/macRoman.ucm

ucm/macSami.ucm

ucm/macSymbol.ucm

ucm/macThai.ucm

ucm/macTurkish.ucm

ucm/macUkraine.ucm

ucm/nextstep.ucm

ucm/null.ucm

ucm/posix-bc.ucm

ucm/shiftjis.ucm

ucm/symbol.ucm

ucm/viscii.ucm

Show diffs side-by-side

added added

removed removed

lib/Encode/Encoding.pm

package Encode::Encoding;

# Base class for classes which implement encodings

use strict;

use warnings;

# Derive $VERSION from the RCS "Revision" keyword: collect every run of digits
# and format them as "%d." followed by one zero-padded "%02d" per remaining
# component (so "2.5" becomes "2.05").
our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };

require Encode;

# Debug switch. Always false here, so statements written as
# "DEBUG and ..." do nothing.
sub DEBUG {
    return 0;
}

# Register an encoding with Encode under its canonical name.
#
# Invoked on a class name, a new object is built by blessing
# { Name => $canonical } into that class. Invoked on an existing object,
# that object is registered as is. Any further arguments are handed on to
# Encode::define_encoding after the object and the canonical name.
#
# Returns whatever Encode::define_encoding returns.
sub Define {
    my ( $invocant, $canonical, @extra ) = @_;

    # warn "$canonical => $invocant\n";
    my $encoding =
        ref $invocant
        ? $invocant
        : bless( { Name => $canonical }, $invocant );

    return Encode::define_encoding( $encoding, $canonical, @extra );
}

# Accessor: returns the value stored under the object's 'Name' key.
sub name {
    my ($self) = @_;
    return $self->{'Name'};
}

# Returns the result of Encode::MIME::Name::get_mime_name() for this
# object's name(). Encode::MIME::Name is loaded on first use.
sub mime_name {
    my $self = shift;
    require Encode::MIME::Name;
    return Encode::MIME::Name::get_mime_name( $self->name );
}

# sub renew { return $_[0] }

# Returns a shallow copy of the object, blessed into the same class, with
# its 'renewed' counter incremented. The original object is left untouched.
sub renew {
    my ($self) = @_;

    my %copy  = %$self;
    my $clone = bless \%copy, ref($self);

    $clone->{renewed}++;    # so the caller can see it
    DEBUG and warn $clone->{renewed};

    return $clone;
}

# Returns the object's 'renewed' counter, or 0 when it is unset or false.
sub renewed {
    my ($self) = @_;
    return $self->{renewed} || 0;
}

# new_sequence is an alias for renew: both names refer to the same code.
*new_sequence = \&renew;

# Base-class default: returns 0 (false).
sub needs_lines {
    return 0;
}

# Returns 1 when PerlIO::encoding can be loaded, 0 when loading it fails.
sub perlio_ok {
    eval { require PerlIO::encoding };
    return 0 if $@;
    return 1;
}

# (Temporary|legacy) methods

# Legacy name: forwards all arguments to the object's decode() method.
sub toUnicode {
    my $self = shift;
    return $self->decode(@_);
}

# Legacy name: forwards all arguments to the object's encode() method.
sub fromUnicode {
    my $self = shift;
    return $self->encode(@_);
}

# Subclasses need to override these; the base versions just croak

# Base-class placeholder for encode(): always croaks, naming the class of
# the invocant (or the invocant itself when it is not a reference).
sub encode {
    require Carp;
    my $invocant = shift;
    my $class    = ref($invocant) || $invocant;
    my $message  = $class . "->encode() not defined!";
    Carp::croak($message);
}

# Base-class placeholder for decode(): always croaks, naming the class of
# the invocant (or the invocant itself when it is not a reference).
#
# The message names decode(); it previously said "->encode() not defined!",
# copied from encode(), which pointed at the wrong missing method.
sub decode {
    require Carp;
    my $obj   = shift;
    my $class = ref($obj) ? ref($obj) : $obj;
    Carp::croak( $class . "->decode() not defined!" );
}

# Intentionally empty destructor: does nothing and returns nothing.
sub DESTROY {
    return;
}

__END__

=head1 NAME

Encode::Encoding - Encode Implementation Base Class

=head1 SYNOPSIS

package Encode::MyEncoding;

use base qw(Encode::Encoding);

__PACKAGE__->Define(qw(myCanonical myAlias));

=head1 DESCRIPTION

As mentioned in L<Encode>, encodings are (in the current

implementation at least) defined as objects. The mapping of encoding

name to object is via the C<%Encode::Encoding> hash. Though you can

directly manipulate this hash, it is strongly encouraged to use this

base class module and add encode() and decode() methods.

=head2 Methods you should implement

You are strongly encouraged to implement methods below, at least

either encode() or decode().

100

101

=over 4

102

103

=item -E<gt>encode($string [,$check])

104

105

MUST return the octet sequence representing I<$string>.

106

107

=over 2

108

109

=item *

110

111

If I<$check> is true, it SHOULD modify I<$string> in place to remove

112

the converted part (i.e. the whole string unless there is an error).

113

If perlio_ok() is true, SHOULD becomes MUST.

114

115

=item *

116

117

If an error occurs, it SHOULD return the octet sequence for the

118

fragment of string that has been converted and modify $string in-place

119

to remove the converted part leaving it starting with the problem

120

fragment. If perlio_ok() is true, SHOULD becomes MUST.

121

122

=item *

123

124

If I<$check> is false then C<encode> MUST make a "best effort" to

125

convert the string - for example, by using a replacement character.

126

127

=back

128

129

=item -E<gt>decode($octets [,$check])

130

131

MUST return the string that I<$octets> represents.

132

133

=over 2

134

135

=item *

136

137

If I<$check> is true, it SHOULD modify I<$octets> in place to remove

138

the converted part (i.e. the whole sequence unless there is an

139

error). If perlio_ok() is true, SHOULD becomes MUST.

140

141

=item *

142

143

If an error occurs, it SHOULD return the fragment of string that has

144

been converted and modify $octets in-place to remove the converted

145

part leaving it starting with the problem fragment. If perlio_ok() is

146

true, SHOULD becomes MUST.

147

148

=item *

149

150

If I<$check> is false then C<decode> should make a "best effort" to

151

convert the string - for example by using Unicode's "\x{FFFD}" as a

152

replacement character.

153

154

=back

155

156

=back

157

158

If you want your encoding to work with L<encoding> pragma, you should

159

also implement the method below.

160

161

=over 4

162

163

=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])

164

165

MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.

166

Decoding will terminate when $terminator (a string) appears in output.

167

I<$offset> will be modified to the last $octets position at end of decode.

168

Returns true if $terminator appears in the output, else returns false.

169

170

=back

171

172

=head2 Other methods defined in Encode::Encoding

173

174

You do not have to override methods shown below unless you have to.

175

176

=over 4

177

178

=item -E<gt>name

179

180

Predefined As:

181

182

sub name { return shift->{'Name'} }

183

184

MUST return the string representing the canonical name of the encoding.

185

186

=item -E<gt>mime_name

187

188

Predefined As:

189

190

sub mime_name{

191

require Encode::MIME::Name;

192

return Encode::MIME::Name::get_mime_name(shift->name);

193

}

194

195

MUST return the string representing the IANA charset name of the encoding.

196

197

=item -E<gt>renew

198

199

Predefined As:

200

201

sub renew {

202

my $self = shift;

203

my $clone = bless { %$self } => ref($self);

204

$clone->{renewed}++;

205

return $clone;

206

}

207

208

This method reconstructs the encoding object if necessary. If you need

209

to store the state during encoding, this is where you clone your object.

210

211

PerlIO ALWAYS calls this method to make sure it has its own private

212

encoding object.

213

214

=item -E<gt>renewed

215

216

Predefined As:

217

218

sub renewed { $_[0]->{renewed} || 0 }

219

220

Tells whether the object is renewed (and how many times). Some

221

modules emit C<Use of uninitialized value in null operation> warning

222

unless the value is numeric so return 0 for false.

223

224

=item -E<gt>perlio_ok()

225

226

Predefined As:

227

228

sub perlio_ok {

229

eval{ require PerlIO::encoding };

230

return $@ ? 0 : 1;

231

}

232

233

If your encoding does not support PerlIO for some reason, just define:

234

235

sub perlio_ok { 0 }

236

237

=item -E<gt>needs_lines()

238

239

Predefined As:

240

241

sub needs_lines { 0 };

242

243

If your encoding can work with PerlIO but needs line buffering, you

244

MUST define this method so it returns true. 7bit ISO-2022 encodings

245

are one example that needs this. When this method is missing, false

246

is assumed.

247

248

=back

249

250

=head2 Example: Encode::ROT13

251

252

package Encode::ROT13;

253

use strict;

254

use base qw(Encode::Encoding);

255

256

__PACKAGE__->Define('rot13');

257

258

sub encode($$;$){

259

my ($obj, $str, $chk) = @_;

260

$str =~ tr/A-Za-z/N-ZA-Mn-za-m/;

261

$_[1] = '' if $chk; # this is what in-place edit means

262

return $str;

263

}

264

265

# Jr pna or ynml yvxr guvf;

266

*decode = \&encode;

267

268

269

270

=head1 Why the heck Encode API is different?

271

272

It should be noted that the I<$check> behaviour is different from the

273

outer public API. The logic is that the "unchecked" case is useful

274

when the encoding is part of a stream which may be reporting errors

275

(e.g. STDERR). In such cases, it is desirable to get everything

276

through somehow without causing additional errors which obscure the

277

original one. Also, the encoding is best placed to know what the

278

correct replacement character is, so if that is the desired behaviour

279

then letting low level code do it is the most efficient.

280

281

By contrast, if I<$check> is true, the scheme above allows the

282

encoding to do as much as it can and tell the layer above how much

283

that was. What is lacking at present is a mechanism to report what

284

went wrong. The most likely interface will be an additional method

285

call to the object, or perhaps (to avoid forcing per-stream objects

286

on otherwise stateless encodings) an additional parameter.

287

288

It is also highly desirable that encoding classes inherit from

289

C<Encode::Encoding> as a base class. This allows that class to define

290

additional behaviour for all encoding objects.

291

292

package Encode::MyEncoding;

293

use base qw(Encode::Encoding);

294

295

__PACKAGE__->Define(qw(myCanonical myAlias));

296

297

This creates an object with C<< bless {Name => ...}, $class >>, and calls

298

define_encoding. They inherit their C<name> method from

299

C<Encode::Encoding>.

300

301

=head2 Compiled Encodings

302

303

For the sake of speed and efficiency, most of the encodings are now

304

supported via a I<compiled form>: XS modules generated from UCM

305

files. Encode provides the enc2xs tool to achieve that. Please see

306

L<enc2xs> for more details.

307

308

=head1 SEE ALSO

309

310

L<perlmod>, L<enc2xs>

311

312

=begin future

313

314

=over 4

315

316

=item Scheme 1

317

318

The fixup routine gets passed the remaining fragment of string being

319

processed. It modifies it in place to remove bytes/characters it can

320

understand and returns a string used to represent them. For example:

321

322

sub fixup {

323

my $ch = substr($_[0],0,1,'');

324

return sprintf('\x{%02X}',ord($ch));

325

}

326

327

This scheme is close to how the underlying C code for Encode works,

328

but gives the fixup routine very little context.

329

330

=item Scheme 2

331

332

The fixup routine gets passed the original string, an index into

333

it of the problem area, and the output string so far. It appends

334

what it wants to the output string and returns a new index into the

335

original string. For example:

336

337

sub fixup {

338

# my ($s,$i,$d) = @_;

339

my $ch = substr($_[0],$_[1],1);

340

$_[2] .= sprintf('\x{%02X}',ord($ch));

341

return $_[1]+1;

342

}

343

344

This scheme gives maximal control to the fixup routine but is more

345

complicated to code, and may require that the internals of Encode be tweaked to

346

keep the original string intact.

347

348

=item Other Schemes

349

350

Hybrids of the above.

351

352

Multiple return values rather than in-place modifications.

353

354

Index into the string could be C<pos($str)> allowing C<s/\G...//>.

355

356

=back

357

358

=end future

359

360

=cut

Older »