~ubuntu-branches/ubuntu/precise/tidy/precise-updates

« back to all changes in this revision

Viewing changes to src/streamio.c

Committer: Bazaar Package Importer
Author(s): Jason Thomas
Date: 2005-04-20 11:22:49 UTC
mto: (3.1.1 lenny)
mto: This revision was merged to the branch mainline in revision 2.
Revision ID: james.westby@ubuntu.com-20050420112249-epdnkgi03ubep83z

Tags: upstream-20050415

Import upstream version 20050415

files added:

Makefile.am

Makefile.in

aclocal.m4

build

build/gmake

build/gmake/Makefile

build/gmake/readme.txt

build/gnuauto

build/gnuauto/Makefile.am

build/gnuauto/configure.in

build/gnuauto/console

build/gnuauto/console/Makefile.am

build/gnuauto/include

build/gnuauto/include/Makefile.am

build/gnuauto/readme.txt

build/gnuauto/setup.sh

build/gnuauto/src

build/gnuauto/src/Makefile.am

build/msvc

build/msvc/MakeDLL.vc6

build/msvc/Makefile.vc6

build/msvc/tidy.dsp

build/msvc/tidy.dsw

build/msvc/tidydll.dsp

build/msvc/tidylib.dsp

build/readme.txt

build/rpm

build/rpm/readme.txt

build/rpm/tidy.spec

config.guess

config.sub

configure

configure.in

console

console/Makefile.am

console/Makefile.in

console/tab2space.c

console/tidy.c

depcomp

include

include/Makefile.am

include/Makefile.in

include/buffio.h

include/fileio.h

include/platform.h

include/tidy.h

include/tidyenum.h

install-sh

ltmain.sh

missing

readme.txt

setup.sh

src/Makefile.am

src/Makefile.in

src/access.c

src/access.h

src/alloc.c

src/attrask.c

src/attrdict.c

src/attrdict.h

src/attrget.c

src/attrs.c

src/attrs.h

src/buffio.c

src/charsets.c

src/charsets.h

src/clean.c

src/clean.h

src/config.c

src/config.h

src/entities.c

src/entities.h

src/fileio.c

src/forward.h

src/iconvtc.c

src/iconvtc.h

src/istack.c

src/lexer.c

src/lexer.h

src/localize.c

src/message.h

src/parser.c

src/parser.h

src/pprint.c

src/pprint.h

src/streamio.c

src/streamio.h

src/tagask.c

src/tags.c

src/tags.h

src/tidy-int.h

src/tidylib.c

src/tmbstr.c

src/tmbstr.h

src/utf8.c

src/utf8.h

src/win32tc.c

src/win32tc.h

Show diffs side-by-side

added added

removed removed

src/streamio.c

/* streamio.c -- handles character stream I/O

See tidy.h for the copyright notice.

CVS Info :

$Author: arnaud02 $

$Date: 2005/04/08 09:11:13 $

$Revision: 1.29 $

Wrapper around Tidy input source and output sink

that calls appropriate interfaces, and applies

necessary char encoding transformations: to/from

ISO-10646 and/or UTF-8.

*/

#include <stdio.h>

#include <errno.h>

#include "streamio.h"

#include "tidy-int.h"

#include "lexer.h"

#include "message.h"

#include "utf8.h"

#include "tmbstr.h"

#ifdef TIDY_WIN32_MLANG_SUPPORT

#include "win32tc.h"

#endif

/************************

** Forward Declarations

************************/

static uint ReadCharFromStream( StreamIn* in );

static uint ReadByte( StreamIn* in );

static void UngetByte( StreamIn* in, uint byteValue );

static void PutByte( uint byteValue, StreamOut* out );

static void EncodeWin1252( uint c, StreamOut* out );

static void EncodeMacRoman( uint c, StreamOut* out );

static void EncodeIbm858( uint c, StreamOut* out );

static void EncodeLatin0( uint c, StreamOut* out );

/******************************

** Static (duration) Globals

******************************/

/* Statically allocated output stream for stderr.
** The sink's first member is initialised to 0 here; StdErrOutput()
** stores the stderr FILE* in sink.sinkData on first use.
** NOTE(review): the initialiser order must match the StreamOut field
** order declared in streamio.h (not visible here); presumably
** encoding, state, nl, [MLang state], iotype, sink -- confirm.
*/
static StreamOut stderrStreamOut =

{

ASCII,

FSM_ASCII,

DEFAULT_NL_CONFIG,

#ifdef TIDY_WIN32_MLANG_SUPPORT

(ulong)NULL,

#endif

FileIO,

{ 0, filesink_putByte }

};

/* Statically allocated output stream for stdout.
** The sink's first member is initialised to 0 here; StdOutOutput()
** stores the stdout FILE* in sink.sinkData on first use.
** NOTE(review): the initialiser order must match the StreamOut field
** order declared in streamio.h (not visible here); presumably
** encoding, state, nl, [MLang state], iotype, sink -- confirm.
*/
static StreamOut stdoutStreamOut =

{

ASCII,

FSM_ASCII,

DEFAULT_NL_CONFIG,

#ifdef TIDY_WIN32_MLANG_SUPPORT

(ulong)NULL,

#endif

FileIO,

{ 0, filesink_putByte }

};

StreamOut* StdErrOutput(void)

{

if ( stderrStreamOut.sink.sinkData == 0 )

stderrStreamOut.sink.sinkData = (ulong) stderr;

return &stderrStreamOut;

}

StreamOut* StdOutOutput(void)

{

if ( stdoutStreamOut.sink.sinkData == 0 )

stdoutStreamOut.sink.sinkData = (ulong) stdout;

return &stdoutStreamOut;

}

/* Releases a heap-allocated output stream.
** NULL and the two static stdout/stderr streams are ignored.
** For file-backed streams (iotype == FileIO) the FILE* stored in
** sink.sinkData is closed before the StreamOut itself is freed.
*/
void ReleaseStreamOut( StreamOut* out )
{
    if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
    {
        if ( out->iotype == FileIO )
            fclose( (FILE*) out->sink.sinkData );
        MemFree( out );
    }
}

100

101

102

/************************

103

** Source

104

************************/

105

106

/* Allocates and zero-fills a StreamIn for `doc`.
** The stream starts at line 1, column 1, in the FSM_ASCII state, with
** the requested encoding and a pushback buffer of CHARBUF_SIZE chars.
** The input source itself is attached by the caller.
*/
static StreamIn* initStreamIn( TidyDocImpl* doc, int encoding )
{
    StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) );

    ClearMemory( in, sizeof(StreamIn) );
    in->curline = 1;
    in->curcol = 1;
    in->encoding = encoding;
    in->state = FSM_ASCII;
    in->doc = doc;
    in->bufsize = CHARBUF_SIZE;
    in->charbuf = MemAlloc(sizeof(tchar) * in->bufsize);
#ifdef TIDY_STORE_ORIGINAL_TEXT
    in->otextbuf = NULL;
    in->otextlen = 0;
    in->otextsize = 0;
#endif
    return in;
}

125

126

/* Frees an input stream together with its pushback buffer and, when
** TIDY_STORE_ORIGINAL_TEXT is defined, its original-text buffer.
** Passing NULL is a no-op.
*/
void freeStreamIn(StreamIn* in)
{
    if (!in)
        return;

#ifdef TIDY_STORE_ORIGINAL_TEXT
    if (in->otextbuf)
        MemFree(in->otextbuf);
#endif
    MemFree(in->charbuf);
    MemFree(in);
}

135

136

/* Creates an input stream for `doc` that reads from the FILE* `fp`
** using the given character encoding.
*/
StreamIn* FileInput( TidyDocImpl* doc, FILE *fp, int encoding )
{
    StreamIn *in = initStreamIn( doc, encoding );
    initFileSource( &in->source, fp );
    in->iotype = FileIO;
    return in;
}

143

144

/* Creates an input stream for `doc` that reads from the TidyBuffer
** `buf` using the given character encoding.
*/
StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
{
    StreamIn *in = initStreamIn( doc, encoding );
    initInputBuffer( &in->source, buf );
    in->iotype = BufferIO;
    return in;
}

151

152

/* Creates an input stream for `doc` driven by a user-supplied input
** source. The TidyInputSource is copied by value into the stream.
*/
StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding )
{
    StreamIn *in = initStreamIn( doc, encoding );
    memcpy( &in->source, source, sizeof(TidyInputSource) );
    in->iotype = UserIO;
    return in;
}

159

160

/* Looks for a Unicode byte order mark at the current stream position.
**
** Returns UTF16BE, UTF16LE (only when SUPPORT_UTF16_ENCODINGS is set)
** or UTF8 when the corresponding BOM is found; the BOM bytes are then
** consumed. A mismatch with in->encoding is reported as a warning.
** Returns -1 when no BOM is found or the stream ends first; any bytes
** that were read are pushed back in reverse order.
*/
int ReadBOMEncoding(StreamIn *in)
{
    uint c, c1;
#if SUPPORT_UTF16_ENCODINGS
    uint bom;
#endif

    c = ReadByte(in);
    if (c == EndOfStream)
        return -1;

    c1 = ReadByte( in );
    if (c1 == EndOfStream)
    {
        UngetByte(in, c);
        return -1;
    }

    /* todo: dont warn about mismatch for auto input encoding */
    /* todo: let the user override the encoding found here */

#if SUPPORT_UTF16_ENCODINGS
    bom = (c << 8) + c1;

    if ( bom == UNICODE_BOM_BE )
    {
        /* big-endian UTF-16 */
        if ( in->encoding != UTF16 && in->encoding != UTF16BE )
            ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16BE);

        return UTF16BE; /* return decoded BOM */
    }
    else if (bom == UNICODE_BOM_LE)
    {
        /* little-endian UTF-16 */
        if (in->encoding != UTF16 && in->encoding != UTF16LE)
            ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16LE);

        return UTF16LE; /* return decoded BOM */
    }
    else
#endif /* SUPPORT_UTF16_ENCODINGS */
    {
        /* the UTF-8 BOM is three bytes long */
        uint c2 = ReadByte(in);

        if (c2 == EndOfStream)
        {
            UngetByte(in, c1);
            UngetByte(in, c);
            return -1;
        }

        if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
        {
            /* UTF-8 */
            if (in->encoding != UTF8)
                ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF8);

            return UTF8;
        }
        else
            UngetByte( in, c2 );
    }

    /* no BOM: restore the first two bytes, last read first */
    UngetByte(in, c1);
    UngetByte(in, c);

    return -1;
}

229

230

#ifdef TIDY_STORE_ORIGINAL_TEXT

231

/* Appends one byte to the stream's original-text buffer and keeps the
** buffer NUL-terminated. The buffer is grown geometrically (doubling,
** starting at 16 bytes) so that appending n bytes costs amortised O(n)
** instead of one reallocation per byte.
*/
void AddByteToOriginalText(StreamIn *in, tmbchar c)
{
    /* room is needed for the new byte plus the terminating NUL */
    if (in->otextlen + 1 >= in->otextsize)
    {
        size_t size = in->otextsize ? (size_t) in->otextsize * 2 : 16;
        in->otextbuf = MemRealloc(in->otextbuf, size);
        in->otextsize = size;
    }
    in->otextbuf[in->otextlen++] = c;
    in->otextbuf[in->otextlen  ] = 0;
}

242

243

/* Appends the character `c` to the stream's original-text buffer as
** UTF-8. If `c` cannot be encoded, the UTF-8 form of the replacement
** character U+FFFD (EF BF BD) is stored instead.
*/
void AddCharToOriginalText(StreamIn *in, tchar c)
{
    int i, err, count = 0;
    tmbchar buf[10] = {0};

    err = EncodeCharToUTF8Bytes(c, buf, NULL, &count);

    if (err)
    {
        /* replacement character 0xFFFD encoded as UTF-8 */
        buf[0] = (byte) 0xEF;
        buf[1] = (byte) 0xBF;
        buf[2] = (byte) 0xBD;
        count = 3;
    }

    for (i = 0; i < count; ++i)
        AddByteToOriginalText(in, buf[i]);
}

262

#endif

263

264

265

/* Reads the next logical character from the input stream.
**
** - Pushed-back characters (UngetChar) are returned first.
** - A tab is expanded to spaces up to the next tab stop; the pending
**   spaces are counted in in->tabs and returned on subsequent calls.
** - "\r" and "\r\n" are both mapped to a single '\n'.
** - Other control characters below 32 are discarded, except ESC (when
**   ISO-2022 support is compiled in) and form feed in HTML mode.
** - UTF-16 surrogate pairs are combined; for MACROMAN, IBM858 and
**   LATIN0 the upper half is mapped to Unicode; values 128..159 are
**   replaced via the Win1252/MacRoman tables or discarded, with an
**   encoding error reported.
**
** in->curline / in->curcol are updated as characters are consumed.
** Returns the character, or EndOfStream at end of input.
*/
uint ReadChar( StreamIn *in )
{
    uint c = EndOfStream;
    uint tabsize = cfg( in->doc, TidyTabSize );
#ifdef TIDY_STORE_ORIGINAL_TEXT
    Bool added = no;
#endif

    if ( in->pushed )
        return PopChar( in );

    in->lastcol = in->curcol;

    /* still expanding a previously read tab */
    if ( in->tabs > 0 )
    {
        in->curcol++;
        in->tabs--;
        return ' ';
    }

    for (;;)
    {
        c = ReadCharFromStream(in);

        if ( EndOfStream == c )
            return EndOfStream;

        if (c == '\n')
        {
#ifdef TIDY_STORE_ORIGINAL_TEXT
            added = yes;
            AddCharToOriginalText(in, (tchar)c);
#endif
            in->curcol = 1;
            in->curline++;
            break;
        }

        if (c == '\t')
        {
#ifdef TIDY_STORE_ORIGINAL_TEXT
            added = yes;
            AddCharToOriginalText(in, (tchar)c);
#endif
            /* number of further spaces needed to reach the next tab
            ** stop; a tab size of 0 would divide by zero, so the tab
            ** is then treated as a single space
            */
            if ( tabsize > 0 )
                in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
            else
                in->tabs = 0;
            in->curcol++;
            c = ' ';
            break;
        }

        /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
        if (c == '\r')
        {
#ifdef TIDY_STORE_ORIGINAL_TEXT
            added = yes;
            AddCharToOriginalText(in, (tchar)c);
#endif
            c = ReadCharFromStream(in);
            if (c != '\n')
            {
                /* lone CR: keep the following char for the next call */
                UngetChar( c, in );
                c = '\n';
            }
            else
            {
#ifdef TIDY_STORE_ORIGINAL_TEXT
                AddCharToOriginalText(in, (tchar)c);
#endif
            }
            in->curcol = 1;
            in->curline++;
            break;
        }

#ifndef NO_NATIVE_ISO2022_SUPPORT
        /* strip control characters, except for Esc */
        if (c == '\033')
            break;
#endif

        /* Form Feed is allowed in HTML.
        ** Form feed is '\014' (0x0C); the previous test used '\015',
        ** which is CR and can never reach this point.
        */
        if ( c == '\014' && !cfgBool(in->doc, TidyXmlTags) )
            break;

        if ( c < 32 )
            continue; /* discard control char */

        /* watch out for chars that have already been decoded such as */
        /* IS02022, UTF-8 etc, that don't require further decoding */

        if (
            in->encoding == RAW
#ifndef NO_NATIVE_ISO2022_SUPPORT
            || in->encoding == ISO2022
#endif
            || in->encoding == UTF8

#if SUPPORT_ASIAN_ENCODINGS
            || in->encoding == SHIFTJIS /* #431953 - RJ */
            || in->encoding == BIG5     /* #431953 - RJ */
#endif
           )
        {
            in->curcol++;
            break;
        }

#if SUPPORT_UTF16_ENCODINGS
        /* handle surrogate pairs */
        if ( in->encoding == UTF16LE ||
             in->encoding == UTF16   ||
             in->encoding == UTF16BE )
        {
            if ( !IsValidUTF16FromUCS4(c) )
            {
                /* invalid UTF-16 value */
                ReportEncodingError(in->doc, INVALID_UTF16, c, yes);
                c = 0;
            }
            else if ( IsLowSurrogate(c) )
            {
                uint n = c;
                uint m = ReadCharFromStream( in );
                if ( m == EndOfStream )
                   return EndOfStream;

                c = 0;
                if ( IsHighSurrogate(m) )
                {
                    n = CombineSurrogatePair( m, n );
                    if ( IsValidCombinedChar(n) )
                        c = n;
                }
                /* not a valid pair */
                /* NOTE(review): c is always 0 here, so the report does
                ** not carry the offending code unit -- confirm intent */
                if ( 0 == c )
                    ReportEncodingError( in->doc, INVALID_UTF16, c, yes );
            }
        }
#endif

        /* Do first: acts on range 128 - 255 */
        switch ( in->encoding )
        {
        case MACROMAN:
            c = DecodeMacRoman( c );
            break;
        case IBM858:
            c = DecodeIbm850( c );
            break;
        case LATIN0:
            c = DecodeLatin0( c );
            break;
        default:
            break;
        }

        /* produced e.g. as a side-effect of smart quotes in Word */
        /* but can't happen if using MACROMAN encoding */
        if ( 127 < c && c < 160 )
        {
            uint c1 = 0, replMode = DISCARDED_CHAR;
            Bool isVendorChar = ( in->encoding == WIN1252 ||
                                  in->encoding == MACROMAN );
            Bool isWinChar    = ( in->encoding == WIN1252 ||
                                  ReplacementCharEncoding == WIN1252 );
            Bool isMacChar    = ( in->encoding == MACROMAN ||
                                  ReplacementCharEncoding == MACROMAN );

            /* set error position just before offending character */
            in->doc->lexer->lines = in->curline;
            in->doc->lexer->columns = in->curcol;

            if ( isWinChar )
                c1 = DecodeWin1252( c );
            else if ( isMacChar )
                c1 = DecodeMacRoman( c );
            if ( c1 )
                replMode = REPLACED_CHAR;

            if ( c1 == 0 && isVendorChar )
                ReportEncodingError(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
            else if ( ! isVendorChar )
                ReportEncodingError(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);

            c = c1;
        }

        if ( c == 0 )
            continue; /* illegal char is discarded */

        in->curcol++;
        break;
    }

#ifdef TIDY_STORE_ORIGINAL_TEXT
    /* newline, tab and CR were already recorded above */
    if (!added)
        AddCharToOriginalText(in, (tchar)c);
#endif

    return c;
}

464

465

/* Pops the most recently pushed-back character from the stream.
** Updates the line/column position as if the character had just been
** read. Returns EndOfStream when nothing has been pushed back.
*/
uint PopChar( StreamIn *in )
{
    uint c = EndOfStream;
    if ( in->pushed )
    {
        assert( in->bufpos > 0 );
        c = in->charbuf[ --in->bufpos ];
        if ( in->bufpos == 0 )
            in->pushed = no;

        if ( c == '\n' )
        {
            in->curcol = 1;
            in->curline++;
            return c;
        }
        in->curcol++;
    }
    return c;
}

485

486

/* Pushes the character `c` back onto the stream so that the next
** ReadChar()/PopChar() returns it. EndOfStream is ignored.
** The pushback buffer is enlarged by one slot when it is about to
** fill up. The line counter is decremented for '\n' and the column
** is reset to the value saved in in->lastcol.
*/
void UngetChar( uint c, StreamIn *in )
{
    if (c == EndOfStream)
    {
        /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
        return;
    }

    in->pushed = yes;

    if (in->bufpos + 1 >= in->bufsize)
        in->charbuf = MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize));

    in->charbuf[(in->bufpos)++] = c;

    if (c == '\n')
        --(in->curline);

    in->curcol = in->lastcol;
}

506

507

508

509

/************************

510

** Sink

511

************************/

512

513

/* Allocates and zero-fills a StreamOut with the given output encoding
** and newline convention, starting in the FSM_ASCII state.
** The output sink is attached by the caller.
*/
static StreamOut* initStreamOut( int encoding, uint nl )
{
    StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) );
    ClearMemory( out, sizeof(StreamOut) );
    out->encoding = encoding;
    out->state = FSM_ASCII;
    out->nl = nl;
    return out;
}

522

523

/* Creates an output stream that writes to the FILE* `fp` using the
** given encoding and newline convention.
*/
StreamOut* FileOutput( FILE* fp, int encoding, uint nl )
{
    StreamOut* out = initStreamOut( encoding, nl );
    initFileSink( &out->sink, fp );
    out->iotype = FileIO;
    return out;
}

530

/* Creates an output stream that appends to the TidyBuffer `buf` using
** the given encoding and newline convention.
*/
StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint nl )
{
    StreamOut* out = initStreamOut( encoding, nl );
    initOutputBuffer( &out->sink, buf );
    out->iotype = BufferIO;
    return out;
}

537

/* Creates an output stream that writes to a user-supplied sink.
** The TidyOutputSink is copied by value into the stream.
*/
StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint nl )
{
    StreamOut* out = initStreamOut( encoding, nl );
    memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
    out->iotype = UserIO;
    return out;
}

544

545

/* Writes the character `c` to the output stream in the stream's
** output encoding.
**
** LF is first translated according to out->nl (CR LF, CR, or LF).
** MACROMAN, WIN1252, IBM858 and LATIN0 use their table-based encoders;
** UTF8 emits the UTF-8 byte sequence (or U+FFFD if `c` cannot be
** encoded); ISO2022 tracks escape sequences and clears the top bit of
** non-ASCII bytes; UTF-16 variants emit one or two 16-bit units;
** BIG5/SHIFTJIS emit one or two bytes. Any other encoding writes `c`
** as a single byte.
*/
void WriteChar( uint c, StreamOut* out )
{
    /* Translate outgoing newlines */
    if ( LF == c )
    {
        if ( out->nl == TidyCRLF )
            WriteChar( CR, out );
        else if ( out->nl == TidyCR )
            c = CR;
    }

    if (out->encoding == MACROMAN)
    {
        EncodeMacRoman( c, out );
    }
    else if (out->encoding == WIN1252)
    {
        EncodeWin1252( c, out );
    }
    else if (out->encoding == IBM858)
    {
        EncodeIbm858( c, out );
    }
    else if (out->encoding == LATIN0)
    {
        EncodeLatin0( c, out );
    }

    else if (out->encoding == UTF8)
    {
        int count = 0;

        EncodeCharToUTF8Bytes( c, NULL, &out->sink, &count );
        if (count <= 0)
        {
            /* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
            /* replacement char 0xFFFD encoded as UTF-8 is EF BF BD
            ** (EF BF BF would be U+FFFF, a noncharacter)
            */
            PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBD, out);
        }
    }
#ifndef NO_NATIVE_ISO2022_SUPPORT
    else if (out->encoding == ISO2022)
    {
        if (c == 0x1b)  /* ESC */
            out->state = FSM_ESC;
        else
        {
            switch (out->state)
            {
            case FSM_ESC:
                if (c == '$')
                    out->state = FSM_ESCD;
                else if (c == '(')
                    out->state = FSM_ESCP;
                else
                    out->state = FSM_ASCII;
                break;

            case FSM_ESCD:
                if (c == '(')
                    out->state = FSM_ESCDP;
                else
                    out->state = FSM_NONASCII;
                break;

            case FSM_ESCDP:
                out->state = FSM_NONASCII;
                break;

            case FSM_ESCP:
                out->state = FSM_ASCII;
                break;

            case FSM_NONASCII:
                /* clear the top bit on the way out */
                c &= 0x7F;
                break;
            }
        }

        PutByte(c, out);
    }
#endif /* NO_NATIVE_ISO2022_SUPPORT */

#if SUPPORT_UTF16_ENCODINGS
    else if ( out->encoding == UTF16LE ||
              out->encoding == UTF16BE ||
              out->encoding == UTF16 )
    {
        int i, numChars = 1;
        uint theChars[2];

        if ( !IsValidUTF16FromUCS4(c) )
        {
            /* invalid UTF-16 value */
            /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
            c = 0;
            numChars = 0;
        }
        else if ( IsCombinedChar(c) )
        {
            /* output both, unless something goes wrong */
            numChars = 2;
            if ( !SplitSurrogatePair(c, &theChars[0], &theChars[1]) )
            {
                /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
                c = 0;
                numChars = 0;
            }
        }
        else
        {
            /* just put the char out */
            theChars[0] = c;
        }

        for (i = 0; i < numChars; i++)
        {
            c = theChars[i];

            if (out->encoding == UTF16LE)
            {
                uint ch = c & 0xFF; PutByte(ch, out);
                ch = (c >> 8) & 0xFF; PutByte(ch, out);
            }

            else if (out->encoding == UTF16BE || out->encoding == UTF16)
            {
                uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
                ch = c & 0xFF; PutByte(ch, out);
            }
        }
    }
#endif

#if SUPPORT_ASIAN_ENCODINGS
    else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
    {
        if (c < 128)
            PutByte(c, out);
        else
        {
            uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
            ch = c & 0xFF; PutByte(ch, out);
        }
    }
#endif

    else
        PutByte( c, out );
}

695

696

697

698

/****************************

699

** Miscellaneous / Helpers

700

****************************/

701

702

/* char encoding used when replacing illegal SGML chars,

703

** regardless of specified encoding. Set at compile time

704

** to either Windows or Mac.

705

706

const int ReplacementCharEncoding = DFLT_REPL_CHARENC;

707

708

709

/* Mapping for Windows Western character set CP 1252

710

** (chars 128-159/U+0080-U+009F) to Unicode.

711

712

static const uint Win2Unicode[32] =

713

{

714

0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,

715

0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,

716

0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,

717

0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178

718

};

719

720

/* Function for conversion from Windows-1252 to Unicode */

721

uint DecodeWin1252(uint c)

722

{

723

if (127 < c && c < 160)

724

c = Win2Unicode[c - 128];

725

726

return c;

727

}

728

729

/* Writes the Unicode character `c` as a Windows-1252 byte.
** Values below 128 and in 160..255 are written as-is. Anything else
** is searched for in Win2Unicode and written as the matching byte
** 128..159; if there is no match, nothing is written.
*/
static void EncodeWin1252( uint c, StreamOut* out )
{
    if (c < 128 || (c > 159 && c < 256))
        PutByte(c, out);
    else
    {
        int i;

        for (i = 128; i < 160; i++)
            if (Win2Unicode[i - 128] == c)
            {
                PutByte(i, out);
                break;
            }
    }
}

745

746

747

John Love-Jensen contributed this table for mapping MacRoman

748

character set to Unicode

749

750

751

/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */

752

static const uint Mac2Unicode[128] =

753

{

754

/* x7F = DEL */

755

756

0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,

757

0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,

758

759

0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,

760

0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,

761

762

0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,

763

0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,

764

765

0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,

766

/* =BD U+2126 OHM SIGN */

767

0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,

768

769

0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,

770

0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,

771

772

0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,

773

/* =DB U+00A4 CURRENCY SIGN */

774

0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,

775

776

0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,

777

0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,

778

/* xF0 = Apple Logo */

779

/* =F0 U+2665 BLACK HEART SUIT */

780

0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,

781

0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7

782

};

783

784

/* Function to convert from MacRoman to Unicode */

785

uint DecodeMacRoman(uint c)

786

{

787

if (127 < c)

788

c = Mac2Unicode[c - 128];

789

return c;

790

}

791

792

/* Writes the Unicode character `c` as a MacRoman byte.
** Values below 128 are written as-is. Anything else is searched for
** in Mac2Unicode and written as the matching byte 128..255; if there
** is no match, nothing is written.
*/
static void EncodeMacRoman( uint c, StreamOut* out )
{
    if (c < 128)
        PutByte(c, out);
    else
    {
        /* For mac users, map Unicode back to MacRoman. */
        int i;
        for (i = 128; i < 256; i++)
        {
            if (Mac2Unicode[i - 128] == c)
            {
                PutByte(i, out);
                break;
            }
        }
    }
}

810

811

/* Mapping for OS/2 Western character set CP 850

812

** (chars 128-255) to Unicode.

813

814

static const uint IBM2Unicode[128] =

815

{

816

0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,

817

0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,

818

0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,

819

0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,

820

0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,

821

0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,

822

0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,

823

0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,

824

0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,

825

0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,

826

0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,

827

0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,

828

0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,

829

0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,

830

0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,

831

0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0

832

};

833

834

/* Function for conversion from OS/2-850 to Unicode */

835

uint DecodeIbm850(uint c)

836

{

837

if (127 < c && c < 256)

838

c = IBM2Unicode[c - 128];

839

840

return c;

841

}

842

843

/* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */

844

static void EncodeIbm858( uint c, StreamOut* out )

845

{

846

if (c < 128)

847

PutByte(c, out);

848

else

849

{

850

int i;

851

for (i = 128; i < 256; i++)

852

{

853

if (IBM2Unicode[i - 128] == c)

854

{

855

PutByte(i, out);

856

break;

857

}

858

}

859

}

860

}

861

862

863

/* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */

864

uint DecodeLatin0(uint c)

865

{

866

if (159 < c && c < 191)

867

{

868

switch (c)

869

{

870

case 0xA4: c = 0x20AC; break;

871

case 0xA6: c = 0x0160; break;

872

case 0xA8: c = 0x0161; break;

873

case 0xB4: c = 0x017D; break;

874

case 0xB8: c = 0x017E; break;

875

case 0xBC: c = 0x0152; break;

876

case 0xBD: c = 0x0153; break;

877

case 0xBE: c = 0x0178; break;

878

}

879

}

880

return c;

881

}

882

883

/* Map Unicode back to ISO-8859-15. */

884

static void EncodeLatin0( uint c, StreamOut* out )

885

{

886

switch (c)

887

{

888

case 0x20AC: c = 0xA4; break;

889

case 0x0160: c = 0xA6; break;

890

case 0x0161: c = 0xA8; break;

891

case 0x017D: c = 0xB4; break;

892

case 0x017E: c = 0xB8; break;

893

case 0x0152: c = 0xBC; break;

894

case 0x0153: c = 0xBD; break;

895

case 0x0178: c = 0xBE; break;

896

}

897

PutByte(c, out);

898

}

899

900

901

Table to map symbol font characters to Unicode; undefined

902

characters are mapped to 0x0000 and characters without any

903

Unicode equivalent are mapped to '?'. Is this appropriate?

904

905

906

static const uint Symbol2Unicode[] =

907

{

908

0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,

909

0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,

910

911

0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,

912

0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,

913

914

0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,

915

0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,

916

917

0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,

918

0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

919

920

0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,

921

0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,

922

923

0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,

924

0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,

925

926

0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,

927

0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,

928

929

0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,

930

0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,

931

932

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

933

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

934

935

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

936

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

937

938

0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,

939

0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,

940

941

0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,

942

0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,

943

944

0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,

945

0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,

946

947

0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,

948

0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,

949

950

0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,

951

0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,

952

953

0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,

954

0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F

955

};

956

957

/* Function to convert from Symbol Font chars to Unicode */

958

uint DecodeSymbolFont(uint c)

959

{

960

if (c > 255)

961

return c;

962

963

/* todo: add some error message */

964

965

return Symbol2Unicode[c];

966

}

967

968

969

/* Facilitates user defined source by providing

970

** an entry point to marshal pointers-to-functions.

971

** Needed by .NET and possibly other language bindings.

972

973

Bool TIDY_CALL tidyInitSource( TidyInputSource* source,

974

void* srcData,

975

TidyGetByteFunc gbFunc,

976

TidyUngetByteFunc ugbFunc,

977

TidyEOFFunc endFunc )

978

{

979

Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );

980

981

if ( status )

982

{

983

source->sourceData = (ulong) srcData;

984

source->getByte = gbFunc;

985

source->ungetByte = ugbFunc;

986

source->eof = endFunc;

987

}

988

989

return status;

990

}

991

992

/* Fills in `sink` with the user data and the put-byte callback.
** Returns yes on success; returns no, leaving `sink` untouched,
** if any argument is NULL.
*/
Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
                             void*           snkData,
                             TidyPutByteFunc pbFunc )
{
    Bool status = ( sink && snkData && pbFunc );
    if ( status )
    {
        sink->sinkData = (ulong) snkData;
        sink->putByte  = pbFunc;
    }
    return status;
}

1004

1005

/* GetByte must return a byte value in a signed

1006

** integer so that a negative value can signal EOF

1007

** without interfering w/ 0-255 legitimate byte values.

1008

1009

uint TIDY_CALL tidyGetByte( TidyInputSource* source )

1010

{

1011

int bv = source->getByte( source->sourceData );

1012

return (uint) bv;

1013

}

1014

/* Returns the result of the source's eof callback. */
Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
{
    return source->eof( source->sourceData );
}

1018

/* Pushes `ch`, truncated to a byte, back onto the source through its
** ungetByte callback.
*/
void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
{
    source->ungetByte( source->sourceData, (byte) ch );
}

1022

/* Writes `ch`, truncated to a byte, to the sink through its putByte
** callback.
*/
void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
{
    sink->putByte( sink->sinkData, (byte) ch );
}

1026

1027

/* Reads one raw byte from the stream's input source. */
static uint ReadByte( StreamIn* in )
{
    return tidyGetByte( &in->source );
}

1031

/* Reports whether the stream's input source is at end of input. */
Bool IsEOF( StreamIn* in )
{
    return tidyIsEOF( &in->source );
}

1035

/* Pushes one raw byte back onto the stream's input source. */
static void UngetByte( StreamIn* in, uint byteValue )
{
    tidyUngetByte( &in->source, byteValue );
}

1039

/* Writes one raw byte to the stream's output sink. */
static void PutByte( uint byteValue, StreamOut* out )
{
    tidyPutByte( &out->sink, byteValue );
}

1043

1044

#if 0

1045

/* Pushes `*count` raw bytes from `buf` back onto the input source,
** in array order. If the stream reports EOF before byte i is pushed,
** `*count` is set to -i and the function returns early.
** (Currently compiled out by the enclosing #if 0.)
*/
static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
{
    int i;

    for (i = 0; i < *count; i++)
    {
        /* should never get here; testing for 0xFF, a valid char, is not a good idea */
        if ( in && IsEOF(in) )
        {
            /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
            *count = -i;
            return;
        }

        in->source.ungetByte( in->source.sourceData, buf[i] );
    }
}

1062

1063

1064

Read raw bytes from stream, return <= 0 if EOF; or if

1065

"unget" is true, Unget the bytes to re-synchronize the input stream

1066

Normally UTF-8 successor bytes are read using this routine.

1067

1068

static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )

1069

{

1070

int ix;

1071

for ( ix=0; ix < *count; ++ix )

1072

{

1073

if ( in->rawPushed )

1074

{

1075

buf[ix] = in->rawBytebuf[ --in->rawBufpos ];

1076

if ( in->rawBufpos == 0 )

1077

in->rawPushed = no;

1078

}

1079

else

1080

{

1081

if ( in->source.eof(in->source.sourceData) )

1082

{

1083

*count = -i;

1084

break;

1085

}

1086

buf[ix] = in->source.getByte( in->source.sourceData );

1087

}

1088

}

1089

}

1090

#endif /* 0 */

1091

1092

/* read char from stream */

1093

static uint ReadCharFromStream( StreamIn* in )

1094

{

1095

uint c, n;

1096

#ifdef TIDY_WIN32_MLANG_SUPPORT

1097

uint bytesRead = 0;

1098

#endif

1099

1100

if ( IsEOF(in) )

1101

return EndOfStream;

1102

1103

c = ReadByte( in );

1104

1105

if (c == EndOfStream)

1106

return c;

1107

1108

#ifndef NO_NATIVE_ISO2022_SUPPORT

1109

1110

A document in ISO-2022 based encoding uses some ESC sequences

1111

called "designator" to switch character sets. The designators

1112

defined and used in ISO-2022-JP are:

1113

1114

"ESC" + "(" + ? for ISO646 variants

1115

1116

"ESC" + "$" + ? and

1117

"ESC" + "$" + "(" + ? for multibyte character sets

1118

1119

Where ? stands for a single character used to indicate the

1120

character set for multibyte characters.

1121

1122

Tidy handles this by preserving the escape sequence and

1123

setting the top bit of each byte for non-ascii chars. This

1124

bit is then cleared on output. The input stream keeps track

1125

of the state to determine when to set/clear the bit.

1126

1127

1128

if (in->encoding == ISO2022)

1129

{

1130

if (c == 0x1b) /* ESC */

1131

{

1132

in->state = FSM_ESC;

1133

return c;

1134

}

1135

1136

switch (in->state)

1137

{

1138

case FSM_ESC:

1139

if (c == '$')

1140

in->state = FSM_ESCD;

1141

else if (c == '(')

1142

in->state = FSM_ESCP;

1143

else

1144

in->state = FSM_ASCII;

1145

break;

1146

1147

case FSM_ESCD:

1148

if (c == '(')

1149

in->state = FSM_ESCDP;

1150

else

1151

in->state = FSM_NONASCII;

1152

break;

1153

1154

case FSM_ESCDP:

1155

in->state = FSM_NONASCII;

1156

break;

1157

1158

case FSM_ESCP:

1159

in->state = FSM_ASCII;

1160

break;

1161

1162

case FSM_NONASCII:

1163

c |= 0x80;

1164

break;

1165

}

1166

1167

return c;

1168

}

1169

#endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */

1170

1171

#if SUPPORT_UTF16_ENCODINGS

1172

if ( in->encoding == UTF16LE )

1173

{

1174

uint c1 = ReadByte( in );

1175

if ( EndOfStream == c1 )

1176

return EndOfStream;

1177

n = (c1 << 8) + c;

1178

return n;

1179

}

1180

1181

if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */

1182

{

1183

uint c1 = ReadByte( in );

1184

if ( EndOfStream == c1 )

1185

return EndOfStream;

1186

n = (c << 8) + c1;

1187

return n;

1188

}

1189

#endif

1190

1191

if ( in->encoding == UTF8 )

1192

{

1193

/* deal with UTF-8 encoded char */

1194

1195

int err, count = 0;

1196

1197

/* first byte "c" is passed in separately */

1198

err = DecodeUTF8BytesToChar( &n, c, NULL, &in->source, &count );

1199

if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */

1200

return EndOfStream;

1201

else if (err)

1202

{

1203

/* set error position just before offending character */

1204

in->doc->lexer->lines = in->curline;

1205

in->doc->lexer->columns = in->curcol;

1206

1207

ReportEncodingError(in->doc, INVALID_UTF8, n, no);

1208

n = 0xFFFD; /* replacement char */

1209

}

1210

1211

return n;

1212

}

1213

1214

#if SUPPORT_ASIAN_ENCODINGS

1215

1216

This section is suitable for any "multibyte" variable-width

1217

character encoding in which a one-byte code is less than

1218

128, and the first byte of a two-byte code is greater or

1219

equal to 128. Note that Big5 and ShiftJIS fit into this

1220

kind, even though their second byte may be less than 128

1221

1222

if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))

1223

{

1224

if (c < 128)

1225

return c;

1226

else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */

1227

{

1228

1229

Rick Cameron pointed out that for Shift_JIS, the values from

1230

0xa1 through 0xdf represent singe-byte characters

1231

(U+FF61 to U+FF9F - half-shift Katakana)

1232

1233

return c;

1234

}

1235

else

1236

{

1237

uint c1 = ReadByte( in );

1238

if ( EndOfStream == c1 )

1239

return EndOfStream;

1240

n = (c << 8) + c1;

1241

return n;

1242

}

1243

}

1244

#endif

1245

1246

#ifdef TIDY_WIN32_MLANG_SUPPORT

1247

else if (in->encoding > WIN32MLANG)

1248

{

1249

assert( in->mlang != 0 );

1250

return Win32MLangGetChar((byte)c, in, &bytesRead);

1251

}

1252

#endif

1253

1254

else

1255

n = c;

1256

1257

return n;

1258

}

1259

1260

/* Output a Byte Order Mark if required */

1261

void outBOM( StreamOut *out )

1262

{

1263

if ( out->encoding == UTF8

1264

#if SUPPORT_UTF16_ENCODINGS

1265

|| out->encoding == UTF16LE

1266

|| out->encoding == UTF16BE

1267

|| out->encoding == UTF16

1268

#endif

1269

)

1270

{

1271

/* this will take care of encoding the BOM correctly */

1272

WriteChar( UNICODE_BOM, out );

1273

}

1274

}

1275

1276

/* this is in intermediate fix for various problems in the */

1277

/* long term code and data in charsets.c should be used */

1278

static struct _enc2iana

1279

{

1280

uint id;

1281

ctmbstr name;

1282

ctmbstr tidyOptName;

1283

} const enc2iana[] =

1284

{

1285

{ ASCII, "us-ascii", "ascii" },

1286

{ LATIN0, "iso-8859-15", "latin0" },

1287

{ LATIN1, "iso-8859-1", "latin1" },

1288

{ UTF8, "utf-8", "utf8" },

1289

{ MACROMAN, "macintosh", "mac" },

1290

{ WIN1252, "windows-1252", "win1252" },

1291

{ IBM858, "ibm00858", "ibm858" },

1292

#if SUPPORT_UTF16_ENCODINGS

1293

{ UTF16LE, "utf-16", "utf16le" },

1294

{ UTF16BE, "utf-16", "utf16be" },

1295

{ UTF16, "utf-16", "utf16" },

1296

#endif

1297

#if SUPPORT_ASIAN_ENCODINGS

1298

{ BIG5, "big5", "big5" },

1299

{ SHIFTJIS, "shift_jis", "shiftjis"},

1300

#endif

1301

#ifndef NO_NATIVE_ISO2022_SUPPORT

1302

{ ISO2022, NULL, "iso2022" },

1303

#endif

1304

{ RAW, NULL, "raw" }

1305

};

1306

1307

/*
   Map a Tidy encoding id to the charset name stored in enc2iana.

   Returns the `name` field of the first entry whose id matches; that
   field is NULL for entries without a name (ISO2022, RAW).  Also
   returns NULL when no entry matches.
*/
ctmbstr GetEncodingNameFromTidyId(uint id)
{
    uint i;

    /* Scan the whole table, bounded by its size, exactly like the other
       enc2iana lookups.  Stopping at the first NULL name instead would
       silently hide any named entry placed after a NULL-named one. */
    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
        if (enc2iana[i].id == id)
            return enc2iana[i].name;

    return NULL;
}

1317

1318

/*
   Map a Tidy encoding id to the option name stored in enc2iana.
   Returns the tidyOptName of the first entry with a matching id, or
   NULL when the id is not in the table.
*/
ctmbstr GetEncodingOptNameFromTidyId(uint id)
{
    const uint entries = sizeof(enc2iana)/sizeof(enc2iana[0]);
    uint idx = 0;

    while ( idx < entries )
    {
        if ( enc2iana[idx].id == id )
            return enc2iana[idx].tidyOptName;
        ++idx;
    }

    return NULL;
}

1328

1329

/*
   Map an option name to a Tidy encoding id.
   charenc is compared against each entry's tidyOptName with
   tmbstrcasecmp(); the id of the first entry that compares equal is
   returned, or -1 when nothing matches.
*/
int GetCharEncodingFromOptName( ctmbstr charenc )
{
    const uint entries = sizeof(enc2iana)/sizeof(enc2iana[0]);
    uint idx = 0;

    while ( idx < entries )
    {
        if ( tmbstrcasecmp(charenc, enc2iana[idx].tidyOptName) == 0 )
            return enc2iana[idx].id;
        ++idx;
    }

    return -1;
}