~ubuntu-branches/ubuntu/gutsy/tidy/gutsy

« back to all changes in this revision

Viewing changes to src/tidy.c

Committer: Bazaar Package Importer
Author(s): Jason Thomas
Date: 2005-04-20 11:22:49 UTC
mfrom: (0.2.1 upstream) (1.1.2 hoary)
Revision ID: james.westby@ubuntu.com-20050420112249-mygnr5vcrutwsen3

Tags: 20050415-1

New upstream release

files added:
Makefile.am

Makefile.in

aclocal.m4

build

build/gmake

build/gmake/Makefile

build/gmake/readme.txt

build/gnuauto

build/gnuauto/Makefile.am

build/gnuauto/configure.in

build/gnuauto/console

build/gnuauto/console/Makefile.am

build/gnuauto/include

build/gnuauto/include/Makefile.am

build/gnuauto/readme.txt

build/gnuauto/setup.sh

build/gnuauto/src

build/gnuauto/src/Makefile.am

build/msvc

build/msvc/MakeDLL.vc6

build/msvc/Makefile.vc6

build/msvc/tidy.dsp

build/msvc/tidy.dsw

build/msvc/tidydll.dsp

build/msvc/tidylib.dsp

build/readme.txt

build/rpm

build/rpm/readme.txt

build/rpm/tidy.spec

config.guess

config.guess.cdbs-orig

config.sub

config.sub.cdbs-orig

configure

configure.in

console

console/Makefile.am

console/Makefile.in

console/tab2space.c

console/tidy.c

debian/README.Debian

debian/compat

debian/libtidy-dev.install

debian/libtidy0.install

debian/patches/01config_file.patch

debian/patches/tidy_20040811-1.diff

debian/tidy.1

debian/tidy.install

debian/tidy.manpages

depcomp

include/Makefile.am

include/Makefile.in

include/buffio.h

include/fileio.h

include/tidy.h

include/tidyenum.h

install-sh

ltmain.sh

missing

readme.txt

setup.sh

src/Makefile.am

src/Makefile.in

src/access.c

src/access.h

src/alloc.c

src/attrask.c

src/attrdict.c

src/attrdict.h

src/attrget.c

src/attrs.h

src/buffio.c

src/charsets.c

src/charsets.h

src/clean.h

src/config.h

src/entities.h

src/fileio.c

src/forward.h

src/iconvtc.c

src/iconvtc.h

src/lexer.h

src/message.h

src/parser.h

src/pprint.h

src/streamio.c

src/streamio.h

src/tagask.c

src/tags.h

src/tidy-int.h

src/tidylib.c

src/tmbstr.c

src/tmbstr.h

src/utf8.c

src/utf8.h

src/win32tc.c

src/win32tc.h

files removed:
.bzr-builddeb

.bzr-builddeb/default.conf

Makefile

Makefile.nmake

debian/conffiles

debian/dirs

debian/doc-base

debian/docs

debian/manpages

debian/patches/01man_page.patch

debian/patches/02config_file.patch

debian/patches/03copyright.patch

htmldoc

htmldoc/Overview.html

htmldoc/grid.gif

htmldoc/pending.html

htmldoc/release-notes.html

htmldoc/tidy.gif

include/html.h

man_page.txt

src/tab2space.c

src/tidy.c

files modified:
debian/changelog

debian/control

debian/copyright

debian/rules

include/platform.h

src/attrs.c

src/clean.c

src/config.c

src/entities.c

src/istack.c

src/lexer.c

src/localize.c

src/parser.c

src/pprint.c

src/tags.c

Show diffs side-by-side

added added

removed removed

src/tidy.c

tidy.c - HTML parser and pretty printer

(Massachusetts Institute of Technology, Institut National de

Recherche en Informatique et en Automatique, Keio University).

CVS Info :

$Author: terry_teague $

$Date: 2002/01/10 08:57:17 $

$Revision: 1.42 $

Contributing Author(s):

Dave Raggett <dsr@w3.org>

The contributing author(s) would like to thank all those who

helped with testing, bug fixes and suggestions for improvements.

This wouldn't have been possible without your help.

This software and documentation is provided "as is," and

the copyright holders and contributing author(s) make no

representations or warranties, express or implied, including

but not limited to, warranties of merchantability or fitness

for any particular purpose or that the use of the software or

documentation will not infringe any third party patents,

copyrights, trademarks or other rights.

The copyright holders and contributing author(s) will not be held

liable for any direct, indirect, special or consequential damages

arising out of any use of the software or documentation, even if

advised of the possibility of such damage.

Permission is hereby granted to use, copy, modify, and distribute

this source code, or portions hereof, documentation and executables,

for any purpose, without fee, subject to the following restrictions:

1. The origin of this source code must not be misrepresented.

2. Altered versions must be plainly marked as such and must

not be misrepresented as being the original source.

3. This Copyright notice may not be removed or altered from any

source or altered source distribution.

The copyright holders and contributing author(s) specifically

permit, without fee, and encourage the use of this source code

as a component for supporting the Hypertext Markup Language in

commercial products. If you use this source code in a product,

acknowledgment is not required but would be appreciated.

#include "platform.h"

#include "html.h"

void InitTidy(void);

void DeInitTidy(void);

extern char *release_date;

Bool debug_flag = no;

Node *debug_element = null;

Lexer *debug_lexer = null;

uint totalerrors = 0;

uint totalwarnings = 0;

uint optionerrors = 0;

FILE *errout; /* set to stderr or stdout */

FILE *input;

/* char encoding used when replacing illegal SGML chars, regardless of specified encoding */

int ReplacementCharEncoding = WIN1252; /* by default */

#define UNICODE_BOM_BE 0xFEFF /* this is the big-endian (default) UNICODE BOM */

#define UNICODE_BOM UNICODE_BOM_BE

#define UNICODE_BOM_LE 0xFFFE /* this is the little-endian UNICODE BOM */

#define UNICODE_BOM_UTF8 0xEFBBBF /* this is the UTF-8 UNICODE BOM */

/*
 Private unget buffer for the raw bytes read from the input stream.
 Normally this will only be used by the UTF-8 decoder to resynchronize the
 input stream after finding an illegal UTF-8 sequence.
 But it can be used for other purposes when reading bytes in ReadCharFromStream.
*/

static unsigned char rawBytebuf[CHARBUF_SIZE];

static int rawBufpos = 0;

static Bool rawPushed = no;

/* Mapping for Windows Western character set CP 1252 (chars 128-159/U+0080-U+009F) to Unicode */

uint Win2Unicode[32] =

{

0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,

0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,

0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,

0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178

};

100

/* Function for conversion from Windows-1252 to Unicode */

101

uint DecodeWin1252(uint c)

102

{

103

if (127 < c && c < 160)

104

c = Win2Unicode[c - 128];

105

106

return c;

107

}

108

109

110

/*
 John Love-Jensen contributed this table for mapping the MacRoman
 character set to Unicode
*/

112

113

114

/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */

115

uint Mac2Unicode[128] =

116

{

117

/* x7F = DEL */

118

119

0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,

120

0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,

121

122

0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,

123

0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,

124

125

0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,

126

0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,

127

128

0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,

129

/* =BD U+2126 OHM SIGN */

130

0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,

131

132

0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,

133

0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,

134

135

0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,

136

/* =DB U+00A4 CURRENCY SIGN */

137

0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,

138

139

0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,

140

0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,

141

/* xF0 = Apple Logo */

142

/* =F0 U+2665 BLACK HEART SUIT */

143

0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,

144

0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7

145

};

146

147

/* Function to convert from MacRoman to Unicode */

148

uint DecodeMacRoman(uint c)

149

{

150

if (127 < c)

151

c = Mac2Unicode[c - 128];

152

153

return c;

154

}

155

156

157

/*
 Table to map symbol font characters to Unicode; undefined
 characters are mapped to 0x0000 and characters without any
 Unicode equivalent are mapped to '?'. Is this appropriate?
*/

160

161

162

uint Symbol2Unicode[] =

163

{

164

0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,

165

0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,

166

167

0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,

168

0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,

169

170

0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,

171

0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,

172

173

0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,

174

0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

175

176

0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,

177

0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,

178

179

0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,

180

0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,

181

182

0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,

183

0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,

184

185

0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,

186

0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,

187

188

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

189

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

190

191

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

192

0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

193

194

0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,

195

0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,

196

197

0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,

198

0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,

199

200

0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,

201

0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,

202

203

0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,

204

0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,

205

206

0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,

207

0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,

208

209

0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,

210

0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F

211

};

212

213

/* Function to convert from Symbol Font chars to Unicode */

214

uint DecodeSymbolFont(uint c)

215

{

216

if (c > 255)

217

return c;

218

219

/* todo: add some error message */

220

221

return Symbol2Unicode[c];

222

}

223

224

void FatalError(char *msg)

225

{

226

fprintf(stderr, "Fatal error: %s\n", msg);

227

DeInitTidy();

228

229

if (input && input != stdin)

230

fclose(input);

231

232

/* 2 signifies a serious error */

233

exit(2);

234

}

235

236

/*
 Allocate size bytes of uninitialised memory.

 Never returns a null pointer: if the allocation fails, FatalError() is
 called, which terminates the program.
*/
void *MemAlloc(uint size)
{
    void *p;

    /* malloc(0) is allowed to return a null pointer; request one byte
       instead so an empty request is not mistaken for memory exhaustion */
    p = malloc(size ? size : 1);

    if (!p)
        FatalError("Out of memory!");

    return p;
}

247

248

/*
 Resize a block previously obtained from MemAlloc/MemRealloc.

 A null mem behaves like MemAlloc(newsize).  Never returns a null pointer:
 if the reallocation fails, FatalError() is called, which terminates the
 program.
*/
void *MemRealloc(void *mem, uint newsize)
{
    void *p;

    if (mem == (void *)null)
        return MemAlloc(newsize);

    /* realloc(p, 0) may free the block and return a null pointer; keep a
       one-byte block instead so that is not reported as out of memory */
    p = realloc(mem, newsize ? newsize : 1);

    if (!p)
        FatalError("Out of memory!");

    return p;
}

262

263

/*
 Release a block obtained from MemAlloc/MemRealloc.
 Passing a null pointer is harmless: free() itself ignores it.
*/
void MemFree(void *mem)
{
    free(mem);
}

268

269

/*
 Set size bytes starting at mem to zero.
 A zero size is a no-op, so memset is never called with a pointer that
 might be null just because there is nothing to clear.
*/
void ClearMemory(void *mem, uint size)
{
    if (size > 0)
        memset(mem, 0, size);
}

273

274

275

276

UTF-8 encoding/decoding functions

277

Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence

278

279

Also see below for UTF-16 encoding/decoding functions

280

281

References :

282

283

1) UCS Transformation Format 8 (UTF-8):

284

ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D

285

<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>

286

<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>

287

288

Table 4 - Mapping from UCS-4 to UTF-8

289

290

2) Unicode standards:

291

<http://www.unicode.org/unicode/standard/standard.html>

292

293

3) Legal UTF-8 byte sequences:

294

<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>

295

296

Code point 1st byte 2nd byte 3rd byte 4th byte

297

---------- -------- -------- -------- --------

298

U+0000..U+007F 00..7F

299

U+0080..U+07FF C2..DF 80..BF

300

U+0800..U+0FFF E0 A0..BF 80..BF

301

U+1000..U+FFFF E1..EF 80..BF 80..BF

302

U+10000..U+3FFFF F0 90..BF 80..BF 80..BF

303

U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF

304

U+100000..U+10FFFF F4 80..8F 80..BF 80..BF

305

306

The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows for the use of

307

five- and six-byte sequences to encode characters that are outside the range of the Unicode

308

character set; those five- and six-byte sequences are illegal for the use of UTF-8 as a

309

transformation of Unicode characters. ISO/IEC 10646 does not allow mapping of

310

unpaired surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).

311

312

4) RFC 2279: UTF-8, a transformation format of ISO 10646:

313

<http://www.ietf.org/rfc/rfc2279.txt>

314

315

5) UTF-8 and Unicode FAQ:

316

<http://www.cl.cam.ac.uk/~mgk25/unicode.html>

317

318

6) Markus Kuhn's UTF-8 decoder stress test file:

319

<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>

320

321

7) UTF-8 Demo:

322

<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>

323

324

8) UTF-8 Sampler:

325

<http://www.columbia.edu/kermit/utf8.html>

326

327

9) Transformation Format for 16 Planes of Group 00 (UTF-16):

328

ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C

329

<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>

330

<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>

331

332

10) RFC 2781: UTF-16, an encoding of ISO 10646:

333

<http://www.ietf.org/rfc/rfc2781.txt>

334

335

11) UTF-16 invalid surrogate pairs:

336

<http://www.unicode.org/unicode/faq/utf_bom.html#16>

337

338

UTF-16 UTF-8 UCS-4

339

D83F DFF* F0 9F BF B* 0001FFF*

340

D87F DFF* F0 AF BF B* 0002FFF*

341

D8BF DFF* F0 BF BF B* 0003FFF*

342

D8FF DFF* F1 8F BF B* 0004FFF*

343

D93F DFF* F1 9F BF B* 0005FFF*

344

D97F DFF* F1 AF BF B* 0006FFF*

345

...

346

DBBF DFF* F3 BF BF B* 000FFFF*

347

DBFF DFF* F4 8F BF B* 0010FFF*

348

349

* = E or F

350

351

1010 A

352

1011 B

353

1100 C

354

1101 D

355

1110 E

356

1111 F

357

358

359

360

#define kNumUTF8Sequences 7

361

#define kMaxUTF8Bytes 4

362

363

#define kUTF8ByteSwapNotAChar 0xFFFE

364

#define kUTF8NotAChar 0xFFFF

365

366

#define kMaxUTF8FromUCS4 0x10FFFF

367

368

#define kUTF16SurrogatesBegin 0x10000

369

#define kMaxUTF16FromUCS4 0x10FFFF

370

371

/* UTF-16 surrogate pair areas */

372

#define kUTF16LowSurrogateBegin 0xD800

373

#define kUTF16LowSurrogateEnd 0xDBFF

374

#define kUTF16HighSurrogateBegin 0xDC00

375

#define kUTF16HighSurrogateEnd 0xDFFF

376

377

/* offsets into validUTF8 table below */

378

static int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =

379

{

380

0, /* 1 byte */

381

1, /* 2 bytes */

382

2, /* 3 bytes */

383

4, /* 4 bytes */

384

kNumUTF8Sequences /* must be last */

385

};

386

387

/*
 Legal UTF-8 byte sequences.  Each row gives a code point range, the
 number of bytes used to encode it, and for every byte position an
 inclusive (lowest, highest) pair of permitted byte values.
*/
static struct validUTF8Sequence
{
    unsigned int lowChar;        /* first code point covered by the row */
    unsigned int highChar;       /* last code point covered by the row */
    int numBytes;                /* length of the encoded sequence */
    unsigned char validBytes[8]; /* lo/hi pair for byte 1, 2, 3 and 4 */
} validUTF8[kNumUTF8Sequences] =
{
    /* low       high      #bytes   byte 1      byte 2      byte 3      byte 4 */
    { 0x0000,   0x007F,   1, { 0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } },
    { 0x0080,   0x07FF,   2, { 0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00 } },
    { 0x0800,   0x0FFF,   3, { 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00 } },
    { 0x1000,   0xFFFF,   3, { 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00 } },
    { 0x10000,  0x3FFFF,  4, { 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF } },
    { 0x40000,  0xFFFFF,  4, { 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF } },
    { 0x100000, 0x10FFFF, 4, { 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF } }
};

404

405

int DecodeUTF8BytesToChar(uint *c, uint firstByte, unsigned char *successorBytes,

406

StreamIn *in, GetBytes getter, int *count)

407

{

408

unsigned char tempbuf[10];

409

unsigned char *buf = &tempbuf[0];

410

uint ch = 0, n = 0;

411

int i, bytes = 0;

412

Bool hasError = no;

413

414

if (successorBytes)

415

buf = successorBytes;

416

417

/* special check if we have been passed an EOF char */

418

if (/* (in && feof(in->file)) || */ firstByte == (uint)EndOfStream)

419

{

420

/* at present */

421

*c = firstByte;

422

*count = 1;

423

return 0;

424

}

425

426

ch = firstByte; /* first byte is passed in separately */

427

428

if (ch <= 0x7F) /* 0XXX XXXX one byte */

429

{

430

n = ch;

431

bytes = 1;

432

}

433

else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */

434

{

435

n = ch & 31;

436

bytes = 2;

437

}

438

else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */

439

{

440

n = ch & 15;

441

bytes = 3;

442

}

443

else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */

444

{

445

n = ch & 7;

446

bytes = 4;

447

}

448

else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */

449

{

450

n = ch & 3;

451

bytes = 5;

452

hasError = yes;

453

}

454

else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */

455

{

456

n = ch & 1;

457

bytes = 6;

458

hasError = yes;

459

}

460

else

461

{

462

/* not a valid first byte of a UTF-8 sequence */

463

n = ch;

464

bytes = 1;

465

hasError = yes;

466

}

467

468

for (i = 1; i < bytes; ++i)

469

{

470

int tempCount; /* no. of additional bytes to get */

471

472

/* successor bytes should have the form 10XX XXXX */

473

if ( getter != null && (bytes - i) > 0 )

474

{

475

tempCount = 1; /* to simplify things, get 1 byte at a time */

476

getter(in, (unsigned char *)&buf[i - 1], &tempCount, no);

477

if (tempCount <= 0) /* EOF */

478

{

479

hasError = yes;

480

bytes = i;

481

break;

482

}

483

}

484

485

if ((buf[i - 1] & 0xC0) != 0x80)

486

{

487

/* illegal successor byte value */

488

hasError = yes;

489

bytes = i;

490

if (getter != null)

491

{

492

tempCount = 1; /* to simplify things, unget 1 byte at a time */

493

getter(in, (unsigned char *)&buf[i - 1], &tempCount, yes); /* Unget the byte */

494

}

495

break;

496

}

497

498

n = (n << 6) | (buf[i - 1] & 0x3F);

499

}

500

501

if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))

502

hasError = yes;

503

504

if (!hasError && (n > kMaxUTF8FromUCS4))

505

hasError = yes;

506

507

if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))

508

/* unpaired surrogates not allowed */

509

hasError = yes;

510

511

if (!hasError)

512

{

513

int lo, hi;

514

515

lo = offsetUTF8Sequences[bytes - 1];

516

hi = offsetUTF8Sequences[bytes] - 1;

517

518

/* check for overlong sequences */

519

if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))

520

hasError = yes;

521

else

522

{

523

hasError = yes; /* assume error until proven otherwise */

524

525

for (i = lo; i <= hi; i++)

526

{

527

int tempCount;

528

unsigned char theByte;

529

530

for (tempCount = 0; tempCount < bytes; tempCount++)

531

{

532

if (!tempCount)

533

theByte = firstByte;

534

else

535

theByte = buf[tempCount - 1];

536

537

if ((theByte >= validUTF8[i].validBytes[(tempCount * 2)]) &&

538

(theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1]))

539

hasError = no;

540

if (hasError)

541

break;

542

}

543

}

544

}

545

}

546

547

*count = bytes;

548

549

*c = n;

550

551

if (hasError)

552

{

553

#if 0

554

/* debug */

555

tidy_out(errout, "UTF-8 decoding error of %d bytes : ", bytes);

556

tidy_out(errout, "0x%02x ", firstByte);

557

for (i = 1; i < bytes; i++)

558

tidy_out(errout, "0x%02x ", buf[i - 1]);

559

tidy_out(errout, " = U+%04lx\n", n);

560

#endif

561

562

/* n = 0xFFFD; */ /* replacement char - do this in the caller */

563

return -1;

564

}

565

566

return 0;

567

}

568

569

int EncodeCharToUTF8Bytes(uint c, unsigned char *encodebuf,

570

Out *out, PutBytes putter, int *count)

571

{

572

unsigned char tempbuf[10];

573

unsigned char *buf = &tempbuf[0];

574

int bytes = 0;

575

Bool hasError = no;

576

577

if (encodebuf)

578

buf = encodebuf;

579

580

if (c <= 0x7F) /* 0XXX XXXX one byte */

581

{

582

buf[0] = c;

583

bytes = 1;

584

}

585

else if (c <= 0x7FF) /* 110X XXXX two bytes */

586

{

587

buf[0] = (0xC0 | (c >> 6));

588

buf[1] = (0x80 | (c & 0x3F));

589

bytes = 2;

590

}

591

else if (c <= 0xFFFF) /* 1110 XXXX three bytes */

592

{

593

buf[0] = (0xE0 | (c >> 12));

594

buf[1] = (0x80 | ((c >> 6) & 0x3F));

595

buf[2] = (0x80 | (c & 0x3F));

596

bytes = 3;

597

if ((c == kUTF8ByteSwapNotAChar) || (c == kUTF8NotAChar))

598

hasError = yes;

599

else if ((c >= kUTF16LowSurrogateBegin) && (c <= kUTF16HighSurrogateEnd))

600

/* unpaired surrogates not allowed */

601

hasError = yes;

602

}

603

else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */

604

{

605

buf[0] = (0xF0 | (c >> 18));

606

buf[1] = (0x80 | ((c >> 12) & 0x3F));

607

buf[2] = (0x80 | ((c >> 6) & 0x3F));

608

buf[3] = (0x80 | (c & 0x3F));

609

bytes = 4;

610

if (c > kMaxUTF8FromUCS4)

611

hasError = yes;

612

}

613

else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */

614

{

615

buf[0] = (0xF8 | (c >> 24));

616

buf[1] = (0x80 | (c >> 18));

617

buf[2] = (0x80 | ((c >> 12) & 0x3F));

618

buf[3] = (0x80 | ((c >> 6) & 0x3F));

619

buf[4] = (0x80 | (c & 0x3F));

620

bytes = 5;

621

hasError = yes;

622

}

623

else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */

624

{

625

buf[0] = (0xFC | (c >> 30));

626

buf[1] = (0x80 | ((c >> 24) & 0x3F));

627

buf[2] = (0x80 | ((c >> 18) & 0x3F));

628

buf[3] = (0x80 | ((c >> 12) & 0x3F));

629

buf[4] = (0x80 | ((c >> 6) & 0x3F));

630

buf[5] = (0x80 | (c & 0x3F));

631

bytes = 6;

632

hasError = yes;

633

}

634

else

635

hasError = yes;

636

637

/* don't output invalid UTF-8 byte sequence to a stream */

638

if ( !hasError && putter != null )

639

{

640

int tempCount = bytes;

641

642

putter(out, buf, &tempCount);

643

if (tempCount < bytes)

644

hasError = yes;

645

}

646

647

*count = bytes;

648

649

if (hasError)

650

{

651

#if 0

652

int i;

653

654

/* debug */

655

tidy_out(errout, "UTF-8 encoding error for U+%x : ", c);

656

for (i = 0; 0 < bytes; i++)

657

tidy_out(errout, "0x%02x ", buf[i]);

658

tidy_out(errout, "\n");

659

#endif

660

661

return -1;

662

}

663

664

return 0;

665

}

666

667

StreamIn *OpenInput(FILE *fp)

668

{

669

StreamIn *in;

670

671

in = (StreamIn *)MemAlloc(sizeof(StreamIn));

672

in->file = fp;

673

in->pushed = no;

674

in->bufpos = 0;

675

in->charbuf[0] = '\0';

676

in->tabs = 0;

677

in->curline = 1;

678

in->curcol = 1;

679

in->encoding = inCharEncoding;

680

in->state = FSM_ASCII;

681

682

return in;

683

}

684

685

686

Read raw bytes from stream, return <= 0 if EOF; or if

687

"unget" is true, Unget the bytes to re-synchronize the input stream

688

Normally UTF-8 successor bytes are read using this routine.

689

690

static void ReadRawBytesFromStream(StreamIn *in, unsigned char *buf, int *count, Bool unget)

691

{

692

int i;

693

694

for (i = 0; i < *count; i++)

695

{

696

if (unget)

697

{

698

/* should never get here; testing for 0xFF, a valid char, is not a good idea */

699

if ((in && feof(in->file)) /* || buf[i] == (unsigned char)EndOfStream */)

700

{

701

/* tidy_out(errout, "Attempt to unget EOF in ReadRawBytesFromStream\n"); */ /* debug */

702

*count = -i;

703

return;

704

}

705

706

rawPushed = yes;

707

708

if (rawBufpos >= CHARBUF_SIZE)

709

{

710

memcpy(rawBytebuf, rawBytebuf + 1, CHARBUF_SIZE - 1);

711

rawBufpos--;

712

}

713

rawBytebuf[rawBufpos++] = buf[i];

714

}

715

else

716

{

717

if (rawPushed)

718

{

719

buf[i] = rawBytebuf[--rawBufpos];

720

if (rawBufpos == 0)

721

rawPushed = no;

722

}

723

else

724

{

725

int c;

726

727

if (feof(in->file))

728

{

729

*count = -i;

730

break;

731

}

732

733

c = getc(in->file);

734

if (c == EOF)

735

{

736

*count = -i;

737

break;

738

}

739

else

740

{

741

buf[i] = c;

742

}

743

}

744

}

745

}

746

}

747

748

/* read char from stream */

749

static int ReadCharFromStream(StreamIn *in)

750

{

751

static Bool lookingForBOM = yes;

752

uint c, n;

753

unsigned char tempchar;

754

int count;

755

756

count = 1;

757

ReadRawBytesFromStream(in, &tempchar, &count, no);

758

if (count <= 0)

759

return EndOfStream;

760

c = (uint)tempchar;

761

762

if (lookingForBOM &&

763

(

764

#if SUPPORT_UTF16_ENCODINGS

765

766

in->encoding == UTF16 ||

767

in->encoding == UTF16LE ||

768

in->encoding == UTF16BE ||

769

770

#endif

771

in->encoding == UTF8))

772

{

773

/* check for a Byte Order Mark */

774

uint c1, bom;

775

776

lookingForBOM = no;

777

778

if (feof(in->file))

779

{

780

lookingForBOM = no;

781

return EndOfStream;

782

}

783

784

count = 1;

785

ReadRawBytesFromStream(in, &tempchar, &count, no);

786

c1 = (uint)tempchar;

787

788

#if SUPPORT_UTF16_ENCODINGS

789

790

bom = (c << 8) + c1;

791

792

if (bom == UNICODE_BOM_BE)

793

{

794

/* big-endian UTF-16 */

795

if (in->encoding != UTF16 && in->encoding != UTF16BE)

796

{

797

/* tidy_out(errout, "Input is encoded as UTF16BE\n"); */ /* debug */

798

ReportEncodingError(in->lexer, ENCODING_MISMATCH, UTF16BE); /* fatal error */

799

}

800

in->encoding = UTF16BE;

801

inCharEncoding = UTF16BE;

802

803

return UNICODE_BOM; /* return decoded BOM */

804

}

805

else if (bom == UNICODE_BOM_LE)

806

{

807

/* little-endian UTF-16 */

808

if (in->encoding != UTF16 && in->encoding != UTF16LE)

809

{

810

/* tidy_out(errout, "Input is encoded as UTF16LE\n"); */ /* debug */

811

ReportEncodingError(in->lexer, ENCODING_MISMATCH, UTF16LE); /* fatal error */

812

}

813

in->encoding = UTF16LE;

814

inCharEncoding = UTF16LE;

815

816

return UNICODE_BOM; /* return decoded BOM */

817

}

818

else

819

820

#endif

821

{

822

uint c2;

823

824

count = 1;

825

ReadRawBytesFromStream(in, &tempchar, &count, no);

826

c2 = (uint)tempchar;

827

828

if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)

829

{

830

/* UTF-8 */

831

if (in->encoding != UTF8)

832

{

833

/* tidy_out(errout, "Input is encoded as UTF8\n"); */ /* debug */

834

ReportEncodingError(in->lexer, ENCODING_MISMATCH, UTF8); /* fatal error */

835

}

836

in->encoding = UTF8;

837

inCharEncoding = UTF8;

838

839

return UNICODE_BOM; /* return decoded BOM */

840

}

841

else

842

{

843

/* the 2nd and/or 3rd bytes weren't what we were */

844

/* expecting, so unget the extra 2 bytes */

845

rawPushed = yes;

846

847

if ((rawBufpos + 1) >= CHARBUF_SIZE)

848

{

849

memcpy(rawBytebuf, rawBytebuf + 2, CHARBUF_SIZE - 2);

850

rawBufpos -= 2;

851

}

852

/* make sure the bytes are pushed in the right order */

853

rawBytebuf[rawBufpos++] = (unsigned char)c2;

854

rawBytebuf[rawBufpos++] = (unsigned char)c1;

855

856

/* drop through to code below, with the original char */

857

}

858

}

859

}

860

861

lookingForBOM = no;

862

863

864

A document in ISO-2022 based encoding uses some ESC sequences

865

called "designator" to switch character sets. The designators

866

defined and used in ISO-2022-JP are:

867

868

"ESC" + "(" + ? for ISO646 variants

869

870

"ESC" + "$" + ? and

871

"ESC" + "$" + "(" + ? for multibyte character sets

872

873

Where ? stands for a single character used to indicate the

874

character set for multibyte characters.

875

876

Tidy handles this by preserving the escape sequence and

877

setting the top bit of each byte for non-ascii chars. This

878

bit is then cleared on output. The input stream keeps track

879

of the state to determine when to set/clear the bit.

880

881

882

if (in->encoding == ISO2022)

883

{

884

if (c == 0x1b) /* ESC */

885

{

886

in->state = FSM_ESC;

887

return c;

888

}

889

890

switch (in->state)

891

{

892

case FSM_ESC:

893

if (c == '$')

894

in->state = FSM_ESCD;

895

else if (c == '(')

896

in->state = FSM_ESCP;

897

else

898

in->state = FSM_ASCII;

899

break;

900

901

case FSM_ESCD:

902

if (c == '(')

903

in->state = FSM_ESCDP;

904

else

905

in->state = FSM_NONASCII;

906

break;

907

908

case FSM_ESCDP:

909

in->state = FSM_NONASCII;

910

break;

911

912

case FSM_ESCP:

913

in->state = FSM_ASCII;

914

break;

915

916

case FSM_NONASCII:

917

c |= 0x80;

918

break;

919

}

920

921

return c;

922

}

923

924

#if SUPPORT_UTF16_ENCODINGS

925

926

if (in->encoding == UTF16LE)

927

{

928

uint c1;

929

930

count = 1;

931

ReadRawBytesFromStream(in, &tempchar, &count, no);

932

if (count <= 0)

933

return EndOfStream;

934

c1 = (uint)tempchar;

935

936

n = (c1 << 8) + c;

937

938

return n;

939

}

940

941

if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */

942

{

943

uint c1;

944

945

count = 1;

946

ReadRawBytesFromStream(in, &tempchar, &count, no);

947

if (count <= 0)

948

return EndOfStream;

949

c1 = (uint)tempchar;

950

951

n = (c << 8) + c1;

952

953

return n;

954

}

955

956

#endif

957

958

if (in->encoding == UTF8)

959

#if 0

960

{

961

/* deal with UTF-8 encoded char */

962

963

uint i, count;

964

965

if ((c & 0xE0) == 0xC0) /* 110X XXXX two bytes */

966

{

967

n = c & 31;

968

count = 1;

969

}

970

else if ((c & 0xF0) == 0xE0) /* 1110 XXXX three bytes */

971

{

972

n = c & 15;

973

count = 2;

974

}

975

else if ((c & 0xF8) == 0xF0) /* 1111 0XXX four bytes */

976

{

977

n = c & 7;

978

count = 3;

979

}

980

else if ((c & 0xFC) == 0xF8) /* 1111 10XX five bytes */

981

{

982

n = c & 3;

983

count = 4;

984

}

985

else if ((c & 0xFE) == 0xFC) /* 1111 110X six bytes */

986

{

987

n = c & 1;

988

count = 5;

989

}

990

else /* 0XXX XXXX one byte */

991

return c;

992

993

/* successor bytes should have the form 10XX XXXX */

994

for (i = 1; i <= count; ++i)

995

{

996

if (feof(in->file))

997

return = EndOfStream;

998

999

c = getc(in->file);

1000

1001

n = (n << 6) | (c & 0x3F);

1002

}

1003

1004

return n;

1005

}

1006

#else

1007

{

1008

/* deal with UTF-8 encoded char */

1009

1010

int err, count = 0;

1011

1012

/* first byte "c" is passed in separately */

1013

err = DecodeUTF8BytesToChar(&n, c, null, in, ReadRawBytesFromStream, &count);

1014

if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */

1015

return EndOfStream;

1016

else if (err)

1017

{

1018

/* set error position just before offending character */

1019

in->lexer->lines = in->curline;

1020

in->lexer->columns = in->curcol;

1021

1022

ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, n);

1023

n = 0xFFFD; /* replacement char */

1024

}

1025

1026

return n;

1027

}

1028

#endif

1029

1030

#if SUPPORT_ASIAN_ENCODINGS

1031

1032

/* #431953 - start RJ */

1033

1034

This section is suitable for any "multibyte" variable-width

1035

character encoding in which a one-byte code is less than

1036

128, and the first byte of a two-byte code is greater or

1037

equal to 128. Note that Big5 and ShiftJIS fit into this

1038

kind, even though their second byte may be less than 128

1039

1040

if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))

1041

{

1042

if (c < 128)

1043

return c;

1044

else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */

1045

{

1046

1047

Rick Cameron pointed out that for Shift_JIS, the values from

1048

0xa1 through 0xdf represent singe-byte characters

1049

(U+FF61 to U+FF9F - half-shift Katakana)

1050

1051

return c;

1052

}

1053

else

1054

{

1055

uint c1;

1056

1057

count = 1;

1058

ReadRawBytesFromStream(in, &tempchar, &count, no);

1059

if (count <= 0)

1060

return EndOfStream;

1061

c1 = (uint)tempchar;

1062

1063

n = (c << 8) + c1;

1064

1065

return n;

1066

}

1067

}

1068

/* #431953 - end RJ */

1069

1070

#endif

1071

1072

else

1073

n = c;

1074

1075

return n;

1076

}

1077

1078

/*
 Return the next character of the input, or EndOfStream.

 Characters pushed back with UngetChar() are handed out first and are
 not processed again. Otherwise values come from ReadCharFromStream()
 and are filtered as follows:

   - a tab is turned into spaces up to the next multiple of tabsize
     (the pending spaces are counted in in->tabs)
   - CR and CR LF are both mapped to a single '\n'
   - control characters are discarded, except Esc and, when not
     parsing XML, Form Feed
   - RAW, ISO2022 and UTF8 input (plus SHIFTJIS and BIG5 when
     SUPPORT_ASIAN_ENCODINGS is set) is returned without further decoding
   - UTF-16 surrogate pairs are recombined (SUPPORT_UTF16_ENCODINGS)
   - MACROMAN input is decoded, and values in 128..159 are replaced or
     discarded with an encoding error reported through in->lexer

 in->curline and in->curcol are updated as characters are consumed.

 Corrections made here:
   - Form Feed is '\014'; the old test used '\015' (CR), which had
     already been handled, so Form Feed was always stripped
   - the second UTF-16 unit is read into a signed int so that end of
     stream is actually detected
   - c1 starts at 0 instead of being read uninitialised when no
     replacement table applies
   - the vendor-specific report is only issued when the character was
     discarded AND the input encoding is WIN1252 or MACROMAN
*/
int ReadChar(StreamIn *in)
{
    int c;

    /* pushed-back characters take priority and are returned as stored */
    if (in->pushed)
    {
        c = in->charbuf[--(in->bufpos)];
        if ((in->bufpos) == 0)
            in->pushed = no;

        if (c == '\n')
        {
            in->curcol = 1;
            in->curline++;
            return c;
        }

        in->curcol++;
        return c;
    }

    /* remembered so that UngetChar() can restore the column */
    in->lastcol = in->curcol;

    /* still paying out the spaces of a previously read tab */
    if (in->tabs > 0)
    {
        in->curcol++;
        in->tabs--;
        return ' ';
    }

    for (;;)
    {
        c = ReadCharFromStream(in);

        if (c < 0)
            return EndOfStream;

        if (c == '\n')
        {
            in->curcol = 1;
            in->curline++;
            break;
        }

        if (c == '\t')
        {
            /* this call returns one space, in->tabs holds the rest */
            in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
            in->curcol++;
            c = ' ';
            break;
        }

        /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
        if (c == '\r')
        {
            c = ReadCharFromStream(in);
            if (c != '\n')
            {
                /* EOF fix by Terry Teague 12 Aug 01: never push back EOF */
                if (c != EndOfStream)
                    UngetChar(c, in);
                c = '\n';
            }
            in->curcol = 1;
            in->curline++;
            break;
        }

        /* strip control characters, except for Esc */

        if (c == '\033')
            break;

        /* Form Feed is allowed in HTML */
        if ((c == '\014') && !XmlTags)
            break;

        if (0 <= c && c < 32)
            continue; /* discard control char */

        /* watch out for chars that have already been decoded such as */
        /* IS02022, UTF-8 etc, that don't require further decoding */

        if (
            in->encoding == RAW
         || in->encoding == ISO2022
         || in->encoding == UTF8

#if SUPPORT_ASIAN_ENCODINGS

         || in->encoding == SHIFTJIS /* #431953 - RJ */
         || in->encoding == BIG5     /* #431953 - RJ */

#endif

           )
        {
            in->curcol++;
            break;
        }

#if SUPPORT_UTF16_ENCODINGS

        /* handle surrogate pairs */
        if ((in->encoding == UTF16LE) || (in->encoding == UTF16) || (in->encoding == UTF16BE))
        {
            if (c > kMaxUTF16FromUCS4)
            {
                /* invalid UTF-16 value */
                ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c);
                c = 0;
            }
            else if (c >= kUTF16LowSurrogateBegin && c <= kUTF16LowSurrogateEnd) /* high surrogate */
            {
                uint n, m;
                int next;

                n = c;

                /* read into a signed int: a uint can never compare < 0 */
                next = ReadCharFromStream(in);
                if (next < 0)
                    return EndOfStream;
                m = (uint)next;

                if (m >= kUTF16HighSurrogateBegin && m <= kUTF16HighSurrogateEnd) /* low surrogate */
                {
                    /* pair found, recombine them */
                    c = (n - kUTF16LowSurrogateBegin) * 0x400 + (m - kUTF16HighSurrogateBegin) + 0x10000;

                    /* check for invalid pairs */
                    if (((c & 0x0000FFFE) == 0x0000FFFE) ||
                        ((c & 0x0000FFFF) == 0x0000FFFF) ||
                        (c < kUTF16SurrogatesBegin))
                    {
                        ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c);
                        c = 0;
                    }
                }
                else
                {
                    /* not a valid pair */
                    ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c);
                    c = 0;
                    /* should we unget the just read char? */
                }
            }
            /* otherwise no recombination is needed */
        }

#endif

        if (in->encoding == MACROMAN)
            c = DecodeMacRoman(c);

        /* produced e.g. as a side-effect of smart quotes in Word */
        /* but can't happen if using MACROMAN encoding */
        if (127 < c && c < 160)
        {
            int c1 = 0, replaceMode;
            int vendorEncoding = (in->encoding == WIN1252) || (in->encoding == MACROMAN);

            /* set error position just before offending character */
            in->lexer->lines = in->curline;
            in->lexer->columns = in->curcol;

            if ((in->encoding == WIN1252) || (ReplacementCharEncoding == WIN1252))
                c1 = DecodeWin1252(c);
            else if (ReplacementCharEncoding == MACROMAN)
                c1 = DecodeMacRoman(c);

            /* c1 == 0 means no replacement exists and the char is dropped */
            replaceMode = c1?REPLACED_CHAR:DISCARDED_CHAR;

            if ((c1 == 0) && vendorEncoding)
                ReportEncodingError(in->lexer, VENDOR_SPECIFIC_CHARS | replaceMode, c);
            else if (!vendorEncoding)
                ReportEncodingError(in->lexer, INVALID_SGML_CHARS | replaceMode, c);

            c = c1;
        }

        if (c == 0)
            continue; /* illegal char is discarded */

        in->curcol++;
        break;
    }

    return c;
}

1270

1271

/*
 Push a character back onto the input so that the next ReadChar()
 returns it. in->charbuf is used as a stack indexed by in->bufpos.
 When the stack is full the entry at index 0 is dropped to make room.

 The shift uses memmove because source and destination overlap
 (memcpy is undefined for overlapping regions), and the length is
 expressed in elements of charbuf rather than in raw bytes.

 Pushing back EndOfStream is not rejected here; the old debug message
 for that case remains disabled.
*/
void UngetChar(int c, StreamIn *in)
{
    /* tidy_out(errout, "Attempt to UngetChar EOF\n"); */ /* debug, when c == EndOfStream */

    in->pushed = yes;

    if (in->bufpos >= CHARBUF_SIZE)
    {
        memmove(in->charbuf, in->charbuf + 1,
                (CHARBUF_SIZE - 1) * sizeof(in->charbuf[0]));
        (in->bufpos)--;
    }
    in->charbuf[(in->bufpos)++] = c;

    /* undo the line/column bookkeeping done when c was read */
    if (c == '\n')
        --(in->curline);

    in->curcol = in->lastcol;
}

1292

1293

/* like strdup but using MemAlloc */

1294

char *wstrdup(char *str)

1295

{

1296

char *s, *p;

1297

int len;

1298

1299

if (str == null)

1300

return null;

1301

1302

for (len = 0; str[len] != '\0'; ++len);

1303

1304

s = (char *)MemAlloc(sizeof(char)*(1+len));

1305

for (p = s; (*p++ = *str++););

1306

return s;

1307

}

1308

1309

/* like strndup but using MemAlloc */

1310

char *wstrndup(char *str, int len)

1311

{

1312

char *s, *p;

1313

1314

if (str == null || len < 0)

1315

return null;

1316

1317

s = (char *)MemAlloc(sizeof(char)*(1+len));

1318

1319

p = s;

1320

1321

while (len-- > 0 && (*p++ = *str++));

1322

1323

*p = '\0';

1324

return s;

1325

}

1326

1327

/*
 Bounded string copy with strncpy semantics: copy characters from s2
 until its terminator or until size characters have been written,
 then pad the remainder of the size-character field with '\0'.
 As with strncpy, s1 is not terminated when s2 has size or more
 characters.

 The previous loop always copied size bytes and therefore read past
 the end of a shorter source string.

 A negative size copies the whole string including its terminator
 (same as wstrcpy). Nothing is done if either pointer is null.
*/
void wstrncpy(char *s1, char *s2, int size)
{
    if (!s1 || !s2)
        return;

    if (size < 0)
    {
        while ((*s1++ = *s2++));
        return;
    }

    /* copy up to the terminator, never beyond it */
    while (size > 0 && *s2 != '\0')
    {
        *s1++ = *s2++;
        --size;
    }

    /* zero-fill what is left of the field */
    while (size-- > 0)
        *s1++ = '\0';
}

1341

1342

/* copy s2, including its terminator, into s1; no bounds checking */
void wstrcpy(char *s1, char *s2)
{
    for (;;)
    {
        char ch = *s2++;

        *s1++ = ch;
        if (ch == '\0')
            break;
    }
}

1346

1347

/* append s2, including its terminator, to the end of s1; no bounds checking */
void wstrcat(char *s1, char *s2)
{
    char *tail = s1;

    /* find the terminator of the destination */
    while (*tail != '\0')
        tail++;

    /* copy, stopping once the terminator has been written */
    do
    {
        *tail++ = *s2;
    }
    while (*s2++ != '\0');
}

1354

1355

/*
 Compare two NUL-terminated strings like strcmp.
 Returns 0 when equal, 1 when s1 sorts after s2 and -1 when it sorts
 before. Bytes are compared as unsigned char, as strcmp does, so the
 ordering of bytes above 127 no longer depends on whether plain char
 is signed on the platform.
*/
int wstrcmp(char *s1, char *s2)
{
    for (;; ++s1, ++s2)
    {
        unsigned char a = (unsigned char)*s1;
        unsigned char b = (unsigned char)*s2;

        if (a != b)
            return (a > b ? 1 : -1);

        if (a == '\0')
            return 0;
    }
}

1371

1372

/* number of bytes (not characters) before the terminating NUL */
int wstrlen(char *str)
{
    char *end = str;

    while (*end != '\0')
        end++;

    return (int)(end - str);
}

1382

1383

1384

MS C 4.2 doesn't include strcasecmp.

1385

Note that tolower and toupper won't

1386

work on chars > 127

1387

1388

int wstrcasecmp(char *s1, char *s2)

1389

{

1390

uint c;

1391

1392

while (c = (uint)(*s1), ToLower(c) == ToLower((uint)(*s2)))

1393

{

1394

if (c == '\0')

1395

return 0;

1396

1397

++s1;

1398

++s2;

1399

}

1400

1401

return (*s1 > *s2 ? 1 : -1);

1402

}

1403

1404

/*
 Compare at most n bytes of two strings, like strncmp.
 Returns 0 when the first n bytes match or both strings end first,
 otherwise 1 or -1 according to the first differing byte.

 The count is now checked before a byte is fetched, so nothing at or
 beyond index n is read (the old loop looked at one byte past the
 range, which matters for buffers that are not NUL-terminated).
 Bytes are compared as unsigned char, as strncmp does.

 A negative n is never reached by counting down and so behaves like
 an unbounded comparison, as before.
*/
int wstrncmp(char *s1, char *s2, int n)
{
    for (; n != 0; --n, ++s1, ++s2)
    {
        unsigned char a = (unsigned char)*s1;
        unsigned char b = (unsigned char)*s2;

        if (a != b)
            return (a > b ? 1 : -1);

        if (a == '\0')
            return 0;
    }

    return 0;
}

1426

1427

/*
 Case-insensitive comparison of at most n bytes, like strncasecmp.
 Returns 0 when the first n bytes match ignoring case or both strings
 end first, otherwise 1 or -1 according to the first position at
 which the lower-cased bytes differ.

 Changes from the previous version:
   - bytes are converted to unsigned char before calling tolower();
     passing a negative char value to tolower() is undefined
   - the count is checked before a byte is fetched, so the byte at
     index n is never read
   - the sign of the result comes from the lower-cased bytes, so "a"
     sorts before "B"

 A negative n behaves like an unbounded comparison, as before.
*/
int wstrncasecmp(char *s1, char *s2, int n)
{
    for (; n != 0; --n, ++s1, ++s2)
    {
        int a = tolower((unsigned char)*s1);
        int b = tolower((unsigned char)*s2);

        if (a != b)
            return (a > b ? 1 : -1);

        if (a == '\0')
            return 0;
    }

    return 0;
}

1449

1450

/* return offset of cc from beginning of s1,
** -1 if not found.
** Exactly len1 bytes are examined; a NUL byte does not stop the search.
*/
int wstrnchr( char *s1, int len1, char cc )
{
    int pos = 0;

    while ( pos < len1 )
    {
        if ( s1[pos] == cc )
            return pos;
        ++pos;
    }

    return -1;
}

1466

1467

Bool wsubstrn( char *s1, int len1, char *s2 )

1468

{

1469

int i, len2 = wstrlen(s2);

1470

1471

for (i = 0; i <= len1 - len2; ++i)

1472

{

1473

if (wstrncmp(s1+i, s2, len2) == 0)

1474

return yes;

1475

}

1476

1477

return no;

1478

}

1479

1480

Bool wsubstrncase(char *s1, int len1, char *s2 )

1481

{

1482

int i, len2 = wstrlen(s2);

1483

1484

for (i = 0; i <= len1 - len2; ++i)

1485

{

1486

if (wstrncasecmp(s1+i, s2, len2) == 0)

1487

return yes;

1488

}

1489

1490

return no;

1491

}

1492

1493

Bool wsubstr(char *s1, char *s2)

1494

{

1495

int i, len1 = wstrlen(s1), len2 = wstrlen(s2);

1496

1497

for (i = 0; i <= len1 - len2; ++i)

1498

{

1499

if (wstrncasecmp(s1+i, s2, len2) == 0)

1500

return yes;

1501

}

1502

1503

return no;

1504

}

1505

1506

/*
 Transform a string to lower case in place using ToLower() and return
 the same pointer.

 The length is measured once up front; the previous loop re-ran
 wstrlen() on every iteration, making the conversion quadratic.
 NOTE(review): this assumes ToLower() never maps a non-zero byte to
 0 (which would have shortened the string mid-loop) - confirm.
*/
char *wstrtolower(char *s)
{
    int i;
    int len = wstrlen(s);

    for (i = 0; i < len; ++i)
        s[i] = ToLower(s[i]);

    return s;
}

1515

1516

/* output UTF-8 bytes to output stream */

1517

static void outcUTF8Bytes(Out *out, unsigned char *buf, int *count)

1518

{

1519

int i;

1520

1521

for (i = 0; i < *count; i++)

1522

{

1523

putc(buf[i], out->fp);

1524

}

1525

}

1526

1527

/* For mac users, should we map Unicode back to MacRoman? */

1528

void outc(uint c, Out *out)

1529

{

1530

uint ch;

1531

1532

#if 1

1533

if (out->encoding == MACROMAN)

1534

{

1535

if (c < 128)

1536

putc(c, out->fp);

1537

else

1538

{

1539

int i;

1540

1541

for (i = 128; i < 256; i++)

1542

if (Mac2Unicode[i - 128] == c)

1543

{

1544

putc(i, out->fp);

1545

break;

1546

}

1547

}

1548

}

1549

else

1550

#endif

1551

1552

#if 1

1553

if (out->encoding == WIN1252)

1554

{

1555

if (c < 128 || (c > 159 && c < 256))

1556

putc(c, out->fp);

1557

else

1558

{

1559

int i;

1560

1561

for (i = 128; i < 160; i++)

1562

if (Win2Unicode[i - 128] == c)

1563

{

1564

putc(i, out->fp);

1565

break;

1566

}

1567

}

1568

}

1569

else

1570

#endif

1571

1572

if (out->encoding == UTF8)

1573

#if 0

1574

{

1575

if (c < 128)

1576

putc(c, out->fp);

1577

else if (c <= 0x7FF)

1578

{

1579

ch = (0xC0 | (c >> 6)); putc(ch, out->fp);

1580

ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);

1581

}

1582

else if (c <= 0xFFFF)

1583

{

1584

ch = (0xE0 | (c >> 12)); putc(ch, out->fp);

1585

ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);

1586

ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);

1587

}

1588

else if (c <= 0x1FFFFF)

1589

{

1590

ch = (0xF0 | (c >> 18)); putc(ch, out->fp);

1591

ch = (0x80 | ((c >> 12) & 0x3F)); putc(ch, out->fp);

1592

ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);

1593

ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);

1594

}

1595

else

1596

{

1597

ch = (0xF8 | (c >> 24)); putc(ch, out->fp);

1598

ch = (0x80 | ((c >> 18) & 0x3F)); putc(ch, out->fp);

1599

ch = (0x80 | ((c >> 12) & 0x3F)); putc(ch, out->fp);

1600

ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);

1601

ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);

1602

}

1603

}

1604

#else

1605

{

1606

int count = 0;

1607

1608

EncodeCharToUTF8Bytes(c, null, out, outcUTF8Bytes, &count);

1609

if (count <= 0)

1610

{

1611

/* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */

1612

/* replacement char 0xFFFD encoded as UTF-8 */

1613

putc(0xEF, out->fp); putc(0xBF, out->fp); putc(0xBF, out->fp);

1614

}

1615

}

1616

#endif

1617

else if (out->encoding == ISO2022)

1618

{

1619

if (c == 0x1b) /* ESC */

1620

out->state = FSM_ESC;

1621

else

1622

{

1623

switch (out->state)

1624

{

1625

case FSM_ESC:

1626

if (c == '$')

1627

out->state = FSM_ESCD;

1628

else if (c == '(')

1629

out->state = FSM_ESCP;

1630

else

1631

out->state = FSM_ASCII;

1632

break;

1633

1634

case FSM_ESCD:

1635

if (c == '(')

1636

out->state = FSM_ESCDP;

1637

else

1638

out->state = FSM_NONASCII;

1639

break;

1640

1641

case FSM_ESCDP:

1642

out->state = FSM_NONASCII;

1643

break;

1644

1645

case FSM_ESCP:

1646

out->state = FSM_ASCII;

1647

break;

1648

1649

case FSM_NONASCII:

1650

c &= 0x7F;

1651

break;

1652

}

1653

}

1654

1655

putc(c, out->fp);

1656

}

1657

1658

#if SUPPORT_UTF16_ENCODINGS

1659

1660

else if (out->encoding == UTF16LE || out->encoding == UTF16BE || out->encoding == UTF16)

1661

{

1662

int i, numChars = 1;

1663

uint theChars[2];

1664

1665

if (c > kMaxUTF16FromUCS4)

1666

{

1667

/* invalid UTF-16 value */

1668

/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */

1669

c = 0;

1670

numChars = 0;

1671

}

1672

else if (c >= kUTF16SurrogatesBegin)

1673

{

1674

/* encode surrogate pairs */

1675

1676

/* check for invalid pairs */

1677

if (((c & 0x0000FFFE) == 0x0000FFFE) ||

1678

((c & 0x0000FFFF) == 0x0000FFFF))

1679

{

1680

/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */

1681

c = 0;

1682

numChars = 0;

1683

}

1684

else

1685

{

1686

theChars[0] = (c - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;

1687

theChars[1] = (c - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;

1688

1689

/* output both */

1690

numChars = 2;

1691

}

1692

}

1693

else

1694

{

1695

/* just put the char out */

1696

theChars[0] = c;

1697

}

1698

1699

for (i = 0; i < numChars; i++)

1700

{

1701

c = theChars[i];

1702

1703

if (out->encoding == UTF16LE)

1704

{

1705

ch = c & 0xFF; putc(ch, out->fp);

1706

ch = (c >> 8) & 0xFF; putc(ch, out->fp);

1707

}

1708

1709

else if (out->encoding == UTF16BE || out->encoding == UTF16)

1710

{

1711

ch = (c >> 8) & 0xFF; putc(ch, out->fp);

1712

ch = c & 0xFF; putc(ch, out->fp);

1713

}

1714

}

1715

}

1716

1717

#endif

1718

1719

#if SUPPORT_ASIAN_ENCODINGS

1720

1721

/* #431953 - start RJ */

1722

else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)

1723

{

1724

if (c < 128)

1725

putc(c, out->fp);

1726

else

1727

{

1728

ch = (c >> 8) & 0xFF; putc(ch, out->fp);

1729

ch = c & 0xFF; putc(ch, out->fp);

1730

}

1731

}

1732

/* #431953 - end RJ */

1733

1734

#endif

1735

1736

else

1737

putc(c, out->fp);

1738

}

1739

1740

/* Output a Byte Order Mark if required */

1741

void outBOM(Out *out)

1742

{

1743

if (

1744

out->encoding == UTF8

1745

1746

#if SUPPORT_UTF16_ENCODINGS

1747

1748

|| out->encoding == UTF16LE

1749

|| out->encoding == UTF16BE

1750

|| out->encoding == UTF16

1751

1752

#endif

1753

)

1754

outc(UNICODE_BOM, out); /* this will take care of encoding the BOM correctly */

1755

}

1756

1757

1758

first time initialization which should

1759

precede reading the command line

1760

1761

void InitTidy(void)

1762

{

1763

InitMap();

1764

InitAttrs();

1765

InitTags();

1766

InitEntities();

1767

InitConfig();

1768

1769

totalerrors = totalwarnings = 0;

1770

XmlTags = XmlOut = HideEndTags = UpperCaseTags =

1771

MakeBare = MakeClean = writeback = OnlyErrors = no;

1772

1773

input = null;

1774

errfile = null;

1775

errout = stderr;

1776

1777

#ifdef CONFIG_FILE

1778

ParseConfigFile(CONFIG_FILE);

1779

#endif

1780

}

1781

1782

1783

/*
 call this when you have finished with tidy
 to free the hash tables and other resources
*/
void DeInitTidy(void)
{
    /* lookup tables */
    FreeTags();
    FreeAttrTable();
    FreeEntities();

    /* configuration values and the pretty printer's buffer */
    FreeConfig();
    FreePrintBuf();
}

1794

1795

/*
 Command line entry point.

 Calls InitTidy(), reads the config file named by the HTML_TIDY
 environment variable (or ~/.tidyrc when SUPPORT_GETPWNAM is defined
 and the variable is unset), then walks argv. Arguments starting with
 '-' are options; any other argument is treated as a file to tidy,
 and stdin is read when no file is given. For each input the document
 is parsed, cleaned up according to the option flags and either
 burst into slides, written back over the input file, or printed to
 stdout.

 Returns 2 if any errors were counted, 1 if only warnings were
 counted, and 0 otherwise.

 NOTE(review): in this copy of the function the bare numeric lines,
 and the free text that is not inside comment delimiters, look like
 residue from the listing this file was taken from (lines holding only
 a comment opener or terminator appear to have been lost). The code
 below is left untouched - confirm against the upstream source.
*/
int main(int argc, char **argv)

1796

{

1797

char *file, *prog;

1798

Node *document, *doctype;

1799

Lexer *lexer;

1800

char *s, c, *arg, *current_errorfile = "stderr";

1801

Out out; /* normal output stream */

1802

Bool InputHadBOM = no;

1803

1804

#if PRESERVE_FILE_TIMES

1805

struct utimbuf filetimes;

1806

struct stat sbuf;

1807

#endif

1808

Bool haveFileTimes;

1809

1810

InitTidy();

1811

1812

/* look for env var "HTML_TIDY" */

1813

/* then for ~/.tidyrc (on Unix) */

1814

1815

if ((file = getenv("HTML_TIDY")))

1816

ParseConfigFile(file);

1817

#ifdef SUPPORT_GETPWNAM

1818

else

1819

ParseConfigFile("~/.tidyrc");

1820

#endif /* SUPPORT_GETPWNAM */

1821

1822

/* read command line */

1823

1824

prog = argv[0];

1825

/* argv[1] is always the argument under consideration; argc/argv are advanced as arguments are consumed */
1826

while (argc > 0)

1827

{

1828

if (argc > 1 && argv[1][0] == '-')

1829

{

1830

/* support -foo and --foo */

1831

arg = argv[1] + 1;

1832

#if 0

1833

if (arg[0] == '-')

1834

++arg;

1835

#endif

1836

/* #427667 - fix by Randy Waki 04 Aug 00 */

1837

/* NOTE(review): the stray comment terminator on the "else" line below suggests this first "indent" test was commented out upstream and its opening delimiter was lost - confirm */
1838

if (wstrcasecmp(arg, "indent") == 0)

1839

IndentContent = yes;

1840

else */ if (wstrcasecmp(arg, "xml") == 0)

1841

XmlTags = yes;

1842

else if (wstrcasecmp(arg, "asxml") == 0 ||

1843

wstrcasecmp(arg, "asxhtml") == 0)

1844

{

1845

xHTML = yes;

1846

}

1847

else if (wstrcasecmp(arg, "ashtml") == 0)

1848

{

1849

HtmlOut = yes;

1850

}

1851

else if (wstrcasecmp(arg, "indent") == 0)

1852

{

1853

IndentContent = yes;

1854

SmartIndent = yes;

1855

}

1856

else if (wstrcasecmp(arg, "omit") == 0)

1857

HideEndTags = yes;

1858

else if (wstrcasecmp(arg, "upper") == 0)

1859

UpperCaseTags = yes;

1860

else if (wstrcasecmp(arg, "clean") == 0)

1861

MakeClean = yes;

1862

else if (wstrcasecmp(arg, "bare") == 0)

1863

MakeBare = yes;

1864

else if (wstrcasecmp(arg, "raw") == 0)

1865

AdjustCharEncoding(RAW);

1866

else if (wstrcasecmp(arg, "ascii") == 0)

1867

AdjustCharEncoding(ASCII);

1868

else if (wstrcasecmp(arg, "latin1") == 0)

1869

AdjustCharEncoding(LATIN1);

1870

else if (wstrcasecmp(arg, "utf8") == 0)

1871

AdjustCharEncoding(UTF8);

1872

else if (wstrcasecmp(arg, "iso2022") == 0)

1873

AdjustCharEncoding(ISO2022);

1874

else if (wstrcasecmp(arg, "mac") == 0)

1875

AdjustCharEncoding(MACROMAN);

1876

1877

#if SUPPORT_UTF16_ENCODINGS

1878

1879

else if (wstrcasecmp(arg, "utf16le") == 0)

1880

AdjustCharEncoding(UTF16LE);

1881

else if (wstrcasecmp(arg, "utf16be") == 0)

1882

AdjustCharEncoding(UTF16BE);

1883

else if (wstrcasecmp(arg, "utf16") == 0)

1884

AdjustCharEncoding(UTF16);

1885

1886

#endif

1887

1888

else if (wstrcasecmp(arg, "win1252") == 0)

1889

AdjustCharEncoding(WIN1252);

1890

1891

#if SUPPORT_ASIAN_ENCODINGS

1892

1893

else if (wstrcasecmp(arg, "shiftjis") == 0) /* #431953 - RJ */

1894

AdjustCharEncoding(SHIFTJIS);

1895

else if (wstrcasecmp(arg, "big5") == 0) /* #431953 - RJ */

1896

AdjustCharEncoding(BIG5);

1897

1898

#endif

1899

1900

else if (wstrcasecmp(arg, "numeric") == 0)

1901

NumEntities = yes;

1902

else if (wstrcasecmp(arg, "modify") == 0)

1903

writeback = yes;

1904

else if (wstrcasecmp(arg, "change") == 0) /* obsolete */

1905

writeback = yes;

1906

else if (wstrcasecmp(arg, "update") == 0) /* obsolete */

1907

writeback = yes;

1908

else if (wstrcasecmp(arg, "errors") == 0)

1909

OnlyErrors = yes;

1910

else if (wstrcasecmp(arg, "quiet") == 0)

1911

Quiet = yes;

1912

else if (wstrcasecmp(arg, "slides") == 0)

1913

BurstSlides = yes;

1914

else if (wstrcasecmp(arg, "help") == 0 ||

1915

wstrcasecmp(arg, "h") == 0 ||

1916

*arg == '?')

1917

{

1918

HelpText(stdout, prog);

1919

1920

DeInitTidy(); /* called to free hash tables etc. */

1921

return 0; /* was return 1 */

1922

}

1923

else if (wstrcasecmp(arg, "help-config") == 0)

1924

{

1925

PrintConfigOptions(stdout, no);

1926

1927

DeInitTidy(); /* called to free hash tables etc. */

1928

return 0;

1929

/* NOTE(review): the statements after this return cannot execute as written; they may have been inside a comment upstream - confirm */
1930

/* break; */

1931

1932

--argc;

1933

++argv;

1934

continue;

1935

1936

}

1937

else if (wstrcasecmp(arg, "show-config") == 0)

1938

{

1939

AdjustConfig(); /* ensure config is self-consistent */

1940

PrintConfigOptions(errout, yes);

1941

1942

DeInitTidy(); /* called to free hash tables etc. */

1943

return 0;

1944

1945

/* break; */

1946

1947

--argc;

1948

++argv;

1949

continue;

1950

1951

}

1952

else if (wstrcasecmp(arg, "config") == 0)

1953

{

1954

if (argc >= 3)

1955

{

1956

ParseConfigFile(argv[2]);

1957

--argc;

1958

++argv;

1959

}

1960

}

1961

1962

#if SUPPORT_ASIAN_ENCODINGS

1963

1964

/* #431953 - start RJ */

1965

else if (wstrcasecmp(arg, "language") == 0 ||

1966

wstrcasecmp(arg, "lang") == 0)

1967

{

1968

if (argc >= 3)

1969

{

1970

Language = argv[2];

1971

--argc;

1972

++argv;

1973

}

1974

}

1975

/* #431953 - end RJ */

1976

1977

#endif

1978

1979

else if (wstrcasecmp(arg, "file") == 0 ||

1980

wstrcasecmp(arg, "-file") == 0 ||

1981

wstrcasecmp(arg, "f") == 0)

1982

{

1983

if (argc >= 3)

1984

{

1985

/* create copy that can be freed by FreeConfig() */

1986

errfile = wstrdup(argv[2]);

1987

--argc;

1988

++argv;

1989

}

1990

}

1991

else if (wstrcasecmp(arg, "wrap") == 0 ||

1992

wstrcasecmp(arg, "-wrap") == 0 ||

1993

wstrcasecmp(arg, "w") == 0)

1994

{

1995

if (argc >= 3)

1996

{

1997

/* NOTE(review): the sscanf result is not checked, so a non-numeric value leaves wraplen unchanged */
sscanf(argv[2], "%d", &wraplen);

1998

--argc;

1999

++argv;

2000

}

2001

}

2002

else if (wstrcasecmp(arg, "version") == 0 ||

2003

wstrcasecmp(arg, "-version") == 0 ||

2004

wstrcasecmp(arg, "v") == 0)

2005

{

2006

ShowVersion(errout);

2007

/* called to free hash tables etc. */

2008

DeInitTidy();

2009

return 0;

2010

2011

}

2012

else if (strncmp(argv[1], "--", 2) == 0)

2013

{

2014

/* NOTE(review): argv[2] is passed without an argc check - confirm ParseConfig copes with a null value */
if (ParseConfig(argv[1] + 2, argv[2]))

2015

{

2016

++argv;

2017

--argc;

2018

}

2019

}

2020

else

2021

{

2022

/* anything else is read as a cluster of single-letter flags */
s = argv[1];

2023

2024

while ((c = *++s))

2025

{

2026

if (c == 'i')

2027

{

2028

IndentContent = yes;

2029

SmartIndent = yes;

2030

}

2031

else if (c == 'o')

2032

HideEndTags = yes;

2033

else if (c == 'u')

2034

UpperCaseTags = yes;

2035

else if (c == 'c')

2036

MakeClean = yes;

2037

else if (c == 'b')

2038

MakeBare = yes;

2039

else if (c == 'n')

2040

NumEntities = yes;

2041

else if (c == 'm')

2042

writeback = yes;

2043

else if (c == 'e')

2044

OnlyErrors = yes;

2045

else if (c == 'q')

2046

Quiet = yes;

2047

else

2048

UnknownOption(stderr, c);

2049

}

2050

}

2051

2052

--argc;

2053

++argv;

2054

continue;

2055

}

2056

2057

/* ensure config is self-consistent */

2058

AdjustConfig();

2059

2060

/* user specified error file */

2061

if (errfile)

2062

{

2063

FILE *fp;

2064

2065

/* is it same as the currently opened file? */

2066

2067

/* this comparison could be an issue on filesystems that are not case-sensitive */

2068

/* e.g. Mac OS HFS; but if we use wstrcasecmp(), we will have the same issue on */

2069

/* file systems that are case-sensitive - e.g. UFS */

2070

if (wstrcmp(errfile, current_errorfile) != 0)

2071

{

2072

/* no so close previous error file */

2073

2074

if (errout != stderr)

2075

fclose(errout);

2076

2077

/* and try to open the new error file */

2078

fp = fopen(errfile, "w");

2079

2080

if (fp != null)

2081

{

2082

errout = fp;

2083

current_errorfile = errfile;

2084

}

2085

else /* can't be opened so fall back to stderr */

2086

{

2087

errout = stderr;

2088

current_errorfile = "stderr";

2089

}

2090

}

2091

}

2092

2093

haveFileTimes = no;

2094

2095

if (argc > 1)

2096

{

2097

file = argv[1];

2098

input = fopen(file, "r");

2099

2100

#if PRESERVE_FILE_TIMES

2101

/* get last modified time */

2102

if (KeepFileTimes && input && fstat(fileno(input), &sbuf) != -1)

2103

{

2104

filetimes.actime = sbuf.st_atime;

2105

filetimes.modtime = sbuf.st_mtime;

2106

haveFileTimes = yes;

2107

}

2108

#endif

2109

}

2110

else

2111

{

2112

input = stdin;

2113

file = "stdin";

2114

}

2115

2116

if (input != null)

2117

{

2118

lexer = NewLexer(OpenInput(input));

2119

lexer->errout = errout;

2120

2121

2122

store pointer to lexer in input stream

2123

to allow character encoding errors to be

2124

reported

2125

2126

lexer->in->lexer = lexer;

2127

2128

SetFilename(file); /* #431895 - fix by Dave Bryan 04 Jan 01 */

2129

2130

if (!Quiet)

2131

HelloMessage(errout, release_date, file);

2132

2133

/* skip byte order mark */

2134

if (lexer->in->encoding == UTF8

2135

2136

#if SUPPORT_UTF16_ENCODINGS

2137

2138

|| lexer->in->encoding == UTF16LE

2139

|| lexer->in->encoding == UTF16BE

2140

|| lexer->in->encoding == UTF16

2141

2142

#endif

2143

2144

)

2145

{

2146

uint c = ReadChar(lexer->in);

2147

2148

if (c == UNICODE_BOM)

2149

InputHadBOM = yes;

2150

else

2151

UngetChar(c, lexer->in);

2152

}

2153

2154

/* Tidy doesn't alter the doctype for generic XML docs */

2155

if (XmlTags)

2156

{

2157

document = ParseXMLDocument(lexer);

2158

2159

if (!CheckNodeIntegrity(document))

2160

{

2161

fprintf(stderr, "\nPanic - tree has lost its integrity\n");

2162

exit(1);

2163

}

2164

}

2165

else

2166

{

2167

lexer->warnings = 0;

2168

2169

document = ParseDocument(lexer);

2170

2171

if (!CheckNodeIntegrity(document))

2172

{

2173

fprintf(stderr, "\nPanic - tree has lost its integrity\n");

2174

exit(1);

2175

}

2176

2177

/* simplifies <b><b> ... </b> ...</b> etc. */

2178

NestedEmphasis(document);

2179

2180

/* cleans up <dir>indented text</dir> etc. */

2181

List2BQ(document);

2182

BQ2Div(document);

2183

2184

/* replaces i by em and b by strong */

2185

if (LogicalEmphasis)

2186

EmFromI(document);

2187

2188

if (Word2000 && IsWord2000(document))

2189

{

2190

/* prune Word2000's <![if ...]> ... <![endif]> */

2191

DropSections(lexer, document);

2192

2193

/* drop style & class attributes and empty p, span elements */

2194

CleanWord2000(lexer, document);

2195

}

2196

2197

/* replaces presentational markup by style rules */

2198

if (MakeClean || DropFontTags)

2199

CleanTree(lexer, document);

2200

2201

if (!CheckNodeIntegrity(document))

2202

{

2203

fprintf(stderr, "\nPanic - tree has lost its integrity\n");

2204

exit(1);

2205

}

2206

2207

if (document->content)

2208

{

2209

if (xHTML)

2210

SetXHTMLDocType(lexer, document);

2211

else

2212

FixDocType(lexer, document);

2213

2214

if (TidyMark)

2215

AddGenerator(lexer, document);

2216

}

2217

2218

/* ensure presence of initial <?XML version="1.0"?> */

2219

if (XmlOut && XmlPi)

2220

FixXmlDecl(lexer, document);

2221

2222

/* NOTE(review): the same two totals are added again after the input is closed below; this pair may have been commented out upstream - confirm, otherwise HTML input is counted twice */
2223

totalwarnings += lexer->warnings;

2224

totalerrors += lexer->errors;

2225

2226

2227

if (!Quiet && document->content)

2228

{

2229

doctype = FindDocType(document);

2230

ReportVersion(errout, lexer, file, doctype);

2231

/* ReportNumWarnings(errout, lexer); */

2232

}

2233

}

2234

2235

if (input != stdin)

2236

{

2237

fclose(input);

2238

}

2239

2240

MemFree(lexer->in);

2241

2242

totalwarnings += lexer->warnings;

2243

totalerrors += lexer->errors;

2244

2245

if (!Quiet)

2246

ReportNumWarnings(errout, lexer);

2247

2248

if (lexer->errors > 0 && !ForceOutput)

2249

NeedsAuthorIntervention(errout);

2250

2251

out.state = FSM_ASCII;

2252

out.encoding = outCharEncoding;

2253

/* output is produced only when not in errors-only mode and the parse was error free (or ForceOutput is set) */
2254

if (!OnlyErrors && (lexer->errors == 0 || ForceOutput))

2255

{

2256

if (BurstSlides)

2257

{

2258

Node *body, *doctype;

2259

2260

2261

remove doctype to avoid potential clash with

2262

markup introduced when bursting into slides

2263

2264

/* discard the document type */

2265

doctype = FindDocType(document);

2266

2267

if (doctype)

2268

DiscardElement(doctype);

2269

2270

/* slides use transitional features */

2271

lexer->versions |= VERS_HTML40_LOOSE;

2272

2273

/* and patch up doctype to match */

2274

if (xHTML)

2275

SetXHTMLDocType(lexer, document);

2276

else

2277

FixDocType(lexer, document);

2278

2279

2280

/* find the body element which may be implicit */

2281

body = FindBody(document);

2282

2283

if (body)

2284

{

2285

ReportNumberOfSlides(errout, CountSlides(body));

2286

CreateSlides(lexer, document);

2287

}

2288

else

2289

MissingBody(errout);

2290

}

/* writeback: the input file is reopened for writing and "input" is reused as the output handle; if the open fails the stdout branch below is taken */
2291

else if (writeback && (input = fopen(file, "w")))

2292

{

2293

out.fp = input;

2294

2295

/* Output a Byte Order Mark if required */

2296

if (OutputBOM || (InputHadBOM && SmartBOM))

2297

outBOM(&out);

2298

2299

if (!FindDocType(document))

2300

NumEntities = yes;

2301

2302

if (XmlOut && !xHTML /*XmlTags*/) /* #427826 - fix by Dave Raggett 01 Sep 00 */

2303

PPrintXMLTree(&out, null, 0, lexer, document);

2304

/* Feature request #434940 - fix by Dave Raggett/Ignacio Vazquez-Abrams 21 Jun 01 */

2305

else if (BodyOnly)

2306

PrintBody(&out, lexer, document);

2307

else

2308

PPrintTree(&out, null, 0, lexer, document);

2309

2310

PFlushLine(&out, 0);

2311

2312

#if PRESERVE_FILE_TIMES

2313

2314

#if UTIME_NEEDS_CLOSED_FILE

2315

/* close the file first */

2316

fclose(input);

2317

#endif

2318

2319

/* set file last accessed/modified times to original values */

2320

if (haveFileTimes)

2321

#if !HAS_FUTIME

2322

utime(file, &filetimes);

2323

#else

2324

futime(fileno(input), &filetimes);

2325

#endif

2326

2327

#if !UTIME_NEEDS_CLOSED_FILE

2328

/* close the file later */

2329

fclose(input);

2330

#endif

2331

2332

#else

2333

2334

fclose(input);

2335

2336

#endif /* PRESERVFILETIMES */

2337

}

2338

else

2339

{

2340

out.fp = stdout;

2341

2342

/* Output a Byte Order Mark if required */

2343

if (OutputBOM || (InputHadBOM && SmartBOM))

2344

outBOM(&out);

2345

2346

if (!FindDocType(document))

2347

NumEntities = yes;

2348

2349

if (XmlOut && !xHTML /*XmlTags*/) /* #427826 - fix by Dave Raggett 01 Sep 00 */

2350

PPrintXMLTree(&out, null, 0, lexer, document);

2351

/* Feature request #434940 - fix by Dave Raggett/Ignacio Vazquez-Abrams 21 Jun 01 */

2352

else if (BodyOnly)

2353

PrintBody(&out, lexer, document);

2354

else

2355

PPrintTree(&out, null, 0, lexer, document);

2356

2357

PFlushLine(&out, 0);

2358

}

2359

2360

}

2361

2362

if (!Quiet)

2363

ErrorSummary(lexer);

2364

2365

FreeNode(document);

2366

FreeLexer(lexer);

2367

}

2368

else

2369

UnknownFile(errout, prog, file);

2370

/* move on to the next file argument, if any */
2371

--argc;

2372

++argv;

2373

2374

if (argc <= 1)

2375

break;

2376

}

2377

2378

if (totalerrors + totalwarnings > 0 && !Quiet)

2379

GeneralInfo(errout);

2380

2381

if (errout != stderr)

2382

fclose(errout);

2383

2384

/* called to free hash tables etc. */

2385

DeInitTidy();

2386

2387

/* return status can be used by scripts */

2388

2389

if (totalerrors > 0)

2390

return 2;

2391

2392

if (totalwarnings > 0)

2393

return 1;

2394

2395

/* 0 signifies all is ok */

2396

return 0;

2397

}

2398

Older »