~ubuntu-branches/ubuntu/precise/tidy/precise-updates

« back to all changes in this revision

Viewing changes to src/lexer.c

Committer: Bazaar Package Importer
Author(s): Jason Thomas
Date: 2005-04-20 11:22:49 UTC
mto: (3.1.1 lenny)
mto: This revision was merged to the branch mainline in revision 2.
Revision ID: james.westby@ubuntu.com-20050420112249-epdnkgi03ubep83z

Tags: upstream-20050415

Import upstream version 20050415

files added:

Makefile.am

Makefile.in

aclocal.m4

build

build/gmake

build/gmake/Makefile

build/gmake/readme.txt

build/gnuauto

build/gnuauto/Makefile.am

build/gnuauto/configure.in

build/gnuauto/console

build/gnuauto/console/Makefile.am

build/gnuauto/include

build/gnuauto/include/Makefile.am

build/gnuauto/readme.txt

build/gnuauto/setup.sh

build/gnuauto/src

build/gnuauto/src/Makefile.am

build/msvc

build/msvc/MakeDLL.vc6

build/msvc/Makefile.vc6

build/msvc/tidy.dsp

build/msvc/tidy.dsw

build/msvc/tidydll.dsp

build/msvc/tidylib.dsp

build/readme.txt

build/rpm

build/rpm/readme.txt

build/rpm/tidy.spec

config.guess

config.sub

configure

configure.in

console

console/Makefile.am

console/Makefile.in

console/tab2space.c

console/tidy.c

depcomp

include

include/Makefile.am

include/Makefile.in

include/buffio.h

include/fileio.h

include/platform.h

include/tidy.h

include/tidyenum.h

install-sh

ltmain.sh

missing

readme.txt

setup.sh

src/Makefile.am

src/Makefile.in

src/access.c

src/access.h

src/alloc.c

src/attrask.c

src/attrdict.c

src/attrdict.h

src/attrget.c

src/attrs.c

src/attrs.h

src/buffio.c

src/charsets.c

src/charsets.h

src/clean.c

src/clean.h

src/config.c

src/config.h

src/entities.c

src/entities.h

src/fileio.c

src/forward.h

src/iconvtc.c

src/iconvtc.h

src/istack.c

src/lexer.c

src/lexer.h

src/localize.c

src/message.h

src/parser.c

src/parser.h

src/pprint.c

src/pprint.h

src/streamio.c

src/streamio.h

src/tagask.c

src/tags.c

src/tags.h

src/tidy-int.h

src/tidylib.c

src/tmbstr.c

src/tmbstr.h

src/utf8.c

src/utf8.h

src/win32tc.c

src/win32tc.h

Show diffs side-by-side

added added

removed removed

src/lexer.c

/* lexer.c -- Lexer for html parser

See tidy.h for the copyright notice.

CVS Info :

$Author: arnaud02 $

$Date: 2005/03/22 17:36:03 $

$Revision: 1.169 $

Given a file stream fp it returns a sequence of tokens.

GetToken(fp) gets the next token

UngetToken(fp) provides one level undo

The tags include an attribute list:

- linked list of attribute/value nodes

- each node has 2 NULL-terminated strings.

- entities are replaced in attribute values

white space is compacted if not in preformatted mode

If not in preformatted mode then leading white space

is discarded and subsequent white space sequences

compacted to single space characters.

If XmlTags is no then Tag names are folded to upper

case and attribute names to lower case.

Not yet done:

- Doctype subset and marked sections
*/

#include "tidy-int.h"

#include "lexer.h"

#include "parser.h"

#include "entities.h"

#include "streamio.h"

#include "message.h"

#include "tmbstr.h"

#include "clean.h"

#include "utf8.h"

#include "streamio.h"

/* Forward references */

/* swallows closing '>' */

static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );

static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,

Node **asp, Node **php );

static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,

Bool *isempty, int *pdelim );

static Node *ParseDocTypeDecl(TidyDocImpl* doc);

static void AddAttrToList( AttVal** list, AttVal* av );

/* Character classification for lexical purposes.
**
** MAP(c) yields the class bits stored in lexmap[] for code points below
** 128 and 0 for everything else.  The argument is parenthesised so that
** the cast applies to the whole expression (MAP(a + b) would otherwise
** cast only `a`).  Note that the argument is evaluated twice, so it must
** not have side effects.
*/
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)

/* one entry of class bits per 7-bit character */
static uint lexmap[128];

/* XML attribute and element names follow the same rule as XML IDs */
#define IsValidXMLAttrName(name) IsValidXMLID(name)
#define IsValidXMLElemName(name) IsValidXMLID(name)

/* Table of the W3C document types known to the lexer.
**
**   score - preference used by HTMLVersion(): among the candidate
**           entries the one with the lowest score is chosen
**   vers  - version bit(s) identifying the doctype
**   name  - human readable name; a NULL name terminates the table
**   fpi   - formal public identifier
**   si    - system identifier, NULL when the table lists none
*/
static struct _doctypes
{
    uint    score;
    uint    vers;
    ctmbstr name;
    ctmbstr fpi;
    ctmbstr si;
} const W3C_Doctypes[] =
{
    /* HTML 2.0 and 3.2: no system identifier */
    {  2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL },
    {  2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL },
    {  2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL },
    {  1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL },
    {  1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL },
    {  1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL },

    /* HTML 4.0 */
    {  6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
    {  8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
    {  7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },

    /* HTML 4.01 */
    {  3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
    {  5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
    {  4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },

    /* XHTML family */
    {  9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
    { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
    { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
    { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
    { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },

    /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
#if 0
    { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
    { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
#endif

    /* final entry */
    {  0, 0, NULL, NULL, NULL }
};

106

107

/* Choose the W3C doctype version that best fits the document.
**
** Walks W3C_Doctypes and, among the entries whose version bit is still
** set in doc->lexer->versions, keeps the one with the lowest score.
** Entries are skipped when XHTML output is wanted and the entry is not
** in VERS_XHTML, or when an HTML 4 doctype is wanted and the entry is
** not in VERS_FROM40.
**
** Returns the version of the chosen entry, or VERS_UNKNOWN when no
** entry qualifies.
*/
int HTMLVersion(TidyDocImpl* doc)
{
    const struct _doctypes *entry;
    const struct _doctypes *best = W3C_Doctypes;
    uint bestScore = 0;
    uint allowed = doc->lexer->versions;
    uint declared = doc->lexer->doctype;
    TidyDoctypeModes mode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    Bool wantXhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
                     !cfgBool(doc, TidyHtmlOut);
    Bool wantHtml4 = mode == TidyDoctypeStrict || mode == TidyDoctypeLoose ||
                     (VERS_FROM40 & declared);

    for (entry = W3C_Doctypes; entry->name; ++entry)
    {
        /* filter out entries of the wrong family */
        if (wantXhtml && !(VERS_XHTML & entry->vers))
            continue;
        if (wantHtml4 && !(VERS_FROM40 & entry->vers))
            continue;

        /* the document must still be valid for this version */
        if (!(allowed & entry->vers))
            continue;

        /* first candidate, or a better (lower) score than the current one */
        if (!bestScore || entry->score < bestScore)
        {
            bestScore = entry->score;
            best = entry;
        }
    }

    if (bestScore)
        return best->vers;

    return VERS_UNKNOWN;
}

138

139

/* Return the formal public identifier of the first W3C_Doctypes entry
** whose version equals vers, or NULL when there is no such entry.
*/
ctmbstr GetFPIFromVers(uint vers)
{
    const struct _doctypes *entry = W3C_Doctypes;

    while (entry->name)
    {
        if (entry->vers == vers)
            return entry->fpi;
        ++entry;
    }

    return NULL;
}

149

150

/* Return the system identifier of the first W3C_Doctypes entry whose
** version equals vers.  The result is NULL when no entry matches, and
** also when the matching entry lists no system identifier.
*/
static ctmbstr GetSIFromVers(uint vers)
{
    const struct _doctypes *entry = W3C_Doctypes;

    while (entry->name)
    {
        if (entry->vers == vers)
            return entry->si;
        ++entry;
    }

    return NULL;
}

160

161

/* Return the human readable name of the first W3C_Doctypes entry whose
** version equals vers, or NULL when there is no such entry.
*/
static ctmbstr GetNameFromVers(uint vers)
{
    const struct _doctypes *entry = W3C_Doctypes;

    while (entry->name)
    {
        if (entry->vers == vers)
            return entry->name;
        ++entry;
    }

    return NULL;
}

171

172

/* Look up a formal public identifier in W3C_Doctypes (compared with
** tmbstrcasecmp) and return the version of the first matching entry,
** or 0 when the identifier is not in the table.
*/
static uint GetVersFromFPI(ctmbstr fpi)
{
    const struct _doctypes *entry = W3C_Doctypes;

    while (entry->name)
    {
        if (tmbstrcasecmp(entry->fpi, fpi) == 0)
            return entry->vers;
        ++entry;
    }

    return 0;
}

182

183

/* everything is allowed in proprietary version of HTML */

184

/* this is handled here rather than in the tag/attr dicts */

185

void ConstrainVersion(TidyDocImpl* doc, uint vers)

186

{

187

doc->lexer->versions &= (vers | VERS_PROPRIETARY);

188

}

189

190

/* Non-zero when the lexmap entry for c carries the `white` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsWhite(uint c)
{
    return (Bool)(MAP(c) & white);
}

196

197

/* Non-zero when the lexmap entry for c carries the `newline` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsNewline(uint c)
{
    return (Bool)(MAP(c) & newline);
}

202

203

/* Non-zero when the lexmap entry for c carries the `digit` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsDigit(uint c)
{
    return (Bool)(MAP(c) & digit);
}

211

212

/* Non-zero when the lexmap entry for c carries the `letter` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsLetter(uint c)
{
    return (Bool)(MAP(c) & letter);
}

220

221

/* Non-zero when the lexmap entry for c carries the `namechar` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsNamechar(uint c)
{
    return (Bool)(MAP(c) & namechar);
}

226

227

Bool IsXMLLetter(uint c)

228

{

229

return ((c >= 0x41 && c <= 0x5a) ||

230

(c >= 0x61 && c <= 0x7a) ||

231

(c >= 0xc0 && c <= 0xd6) ||

232

(c >= 0xd8 && c <= 0xf6) ||

233

(c >= 0xf8 && c <= 0xff) ||

234

(c >= 0x100 && c <= 0x131) ||

235

(c >= 0x134 && c <= 0x13e) ||

236

(c >= 0x141 && c <= 0x148) ||

237

(c >= 0x14a && c <= 0x17e) ||

238

(c >= 0x180 && c <= 0x1c3) ||

239

(c >= 0x1cd && c <= 0x1f0) ||

240

(c >= 0x1f4 && c <= 0x1f5) ||

241

(c >= 0x1fa && c <= 0x217) ||

242

(c >= 0x250 && c <= 0x2a8) ||

243

(c >= 0x2bb && c <= 0x2c1) ||

244

c == 0x386 ||

245

(c >= 0x388 && c <= 0x38a) ||

246

c == 0x38c ||

247

(c >= 0x38e && c <= 0x3a1) ||

248

(c >= 0x3a3 && c <= 0x3ce) ||

249

(c >= 0x3d0 && c <= 0x3d6) ||

250

c == 0x3da ||

251

c == 0x3dc ||

252

c == 0x3de ||

253

c == 0x3e0 ||

254

(c >= 0x3e2 && c <= 0x3f3) ||

255

(c >= 0x401 && c <= 0x40c) ||

256

(c >= 0x40e && c <= 0x44f) ||

257

(c >= 0x451 && c <= 0x45c) ||

258

(c >= 0x45e && c <= 0x481) ||

259

(c >= 0x490 && c <= 0x4c4) ||

260

(c >= 0x4c7 && c <= 0x4c8) ||

261

(c >= 0x4cb && c <= 0x4cc) ||

262

(c >= 0x4d0 && c <= 0x4eb) ||

263

(c >= 0x4ee && c <= 0x4f5) ||

264

(c >= 0x4f8 && c <= 0x4f9) ||

265

(c >= 0x531 && c <= 0x556) ||

266

c == 0x559 ||

267

(c >= 0x561 && c <= 0x586) ||

268

(c >= 0x5d0 && c <= 0x5ea) ||

269

(c >= 0x5f0 && c <= 0x5f2) ||

270

(c >= 0x621 && c <= 0x63a) ||

271

(c >= 0x641 && c <= 0x64a) ||

272

(c >= 0x671 && c <= 0x6b7) ||

273

(c >= 0x6ba && c <= 0x6be) ||

274

(c >= 0x6c0 && c <= 0x6ce) ||

275

(c >= 0x6d0 && c <= 0x6d3) ||

276

c == 0x6d5 ||

277

(c >= 0x6e5 && c <= 0x6e6) ||

278

(c >= 0x905 && c <= 0x939) ||

279

c == 0x93d ||

280

(c >= 0x958 && c <= 0x961) ||

281

(c >= 0x985 && c <= 0x98c) ||

282

(c >= 0x98f && c <= 0x990) ||

283

(c >= 0x993 && c <= 0x9a8) ||

284

(c >= 0x9aa && c <= 0x9b0) ||

285

c == 0x9b2 ||

286

(c >= 0x9b6 && c <= 0x9b9) ||

287

(c >= 0x9dc && c <= 0x9dd) ||

288

(c >= 0x9df && c <= 0x9e1) ||

289

(c >= 0x9f0 && c <= 0x9f1) ||

290

(c >= 0xa05 && c <= 0xa0a) ||

291

(c >= 0xa0f && c <= 0xa10) ||

292

(c >= 0xa13 && c <= 0xa28) ||

293

(c >= 0xa2a && c <= 0xa30) ||

294

(c >= 0xa32 && c <= 0xa33) ||

295

(c >= 0xa35 && c <= 0xa36) ||

296

(c >= 0xa38 && c <= 0xa39) ||

297

(c >= 0xa59 && c <= 0xa5c) ||

298

c == 0xa5e ||

299

(c >= 0xa72 && c <= 0xa74) ||

300

(c >= 0xa85 && c <= 0xa8b) ||

301

c == 0xa8d ||

302

(c >= 0xa8f && c <= 0xa91) ||

303

(c >= 0xa93 && c <= 0xaa8) ||

304

(c >= 0xaaa && c <= 0xab0) ||

305

(c >= 0xab2 && c <= 0xab3) ||

306

(c >= 0xab5 && c <= 0xab9) ||

307

c == 0xabd ||

308

c == 0xae0 ||

309

(c >= 0xb05 && c <= 0xb0c) ||

310

(c >= 0xb0f && c <= 0xb10) ||

311

(c >= 0xb13 && c <= 0xb28) ||

312

(c >= 0xb2a && c <= 0xb30) ||

313

(c >= 0xb32 && c <= 0xb33) ||

314

(c >= 0xb36 && c <= 0xb39) ||

315

c == 0xb3d ||

316

(c >= 0xb5c && c <= 0xb5d) ||

317

(c >= 0xb5f && c <= 0xb61) ||

318

(c >= 0xb85 && c <= 0xb8a) ||

319

(c >= 0xb8e && c <= 0xb90) ||

320

(c >= 0xb92 && c <= 0xb95) ||

321

(c >= 0xb99 && c <= 0xb9a) ||

322

c == 0xb9c ||

323

(c >= 0xb9e && c <= 0xb9f) ||

324

(c >= 0xba3 && c <= 0xba4) ||

325

(c >= 0xba8 && c <= 0xbaa) ||

326

(c >= 0xbae && c <= 0xbb5) ||

327

(c >= 0xbb7 && c <= 0xbb9) ||

328

(c >= 0xc05 && c <= 0xc0c) ||

329

(c >= 0xc0e && c <= 0xc10) ||

330

(c >= 0xc12 && c <= 0xc28) ||

331

(c >= 0xc2a && c <= 0xc33) ||

332

(c >= 0xc35 && c <= 0xc39) ||

333

(c >= 0xc60 && c <= 0xc61) ||

334

(c >= 0xc85 && c <= 0xc8c) ||

335

(c >= 0xc8e && c <= 0xc90) ||

336

(c >= 0xc92 && c <= 0xca8) ||

337

(c >= 0xcaa && c <= 0xcb3) ||

338

(c >= 0xcb5 && c <= 0xcb9) ||

339

c == 0xcde ||

340

(c >= 0xce0 && c <= 0xce1) ||

341

(c >= 0xd05 && c <= 0xd0c) ||

342

(c >= 0xd0e && c <= 0xd10) ||

343

(c >= 0xd12 && c <= 0xd28) ||

344

(c >= 0xd2a && c <= 0xd39) ||

345

(c >= 0xd60 && c <= 0xd61) ||

346

(c >= 0xe01 && c <= 0xe2e) ||

347

c == 0xe30 ||

348

(c >= 0xe32 && c <= 0xe33) ||

349

(c >= 0xe40 && c <= 0xe45) ||

350

(c >= 0xe81 && c <= 0xe82) ||

351

c == 0xe84 ||

352

(c >= 0xe87 && c <= 0xe88) ||

353

c == 0xe8a ||

354

c == 0xe8d ||

355

(c >= 0xe94 && c <= 0xe97) ||

356

(c >= 0xe99 && c <= 0xe9f) ||

357

(c >= 0xea1 && c <= 0xea3) ||

358

c == 0xea5 ||

359

c == 0xea7 ||

360

(c >= 0xeaa && c <= 0xeab) ||

361

(c >= 0xead && c <= 0xeae) ||

362

c == 0xeb0 ||

363

(c >= 0xeb2 && c <= 0xeb3) ||

364

c == 0xebd ||

365

(c >= 0xec0 && c <= 0xec4) ||

366

(c >= 0xf40 && c <= 0xf47) ||

367

(c >= 0xf49 && c <= 0xf69) ||

368

(c >= 0x10a0 && c <= 0x10c5) ||

369

(c >= 0x10d0 && c <= 0x10f6) ||

370

c == 0x1100 ||

371

(c >= 0x1102 && c <= 0x1103) ||

372

(c >= 0x1105 && c <= 0x1107) ||

373

c == 0x1109 ||

374

(c >= 0x110b && c <= 0x110c) ||

375

(c >= 0x110e && c <= 0x1112) ||

376

c == 0x113c ||

377

c == 0x113e ||

378

c == 0x1140 ||

379

c == 0x114c ||

380

c == 0x114e ||

381

c == 0x1150 ||

382

(c >= 0x1154 && c <= 0x1155) ||

383

c == 0x1159 ||

384

(c >= 0x115f && c <= 0x1161) ||

385

c == 0x1163 ||

386

c == 0x1165 ||

387

c == 0x1167 ||

388

c == 0x1169 ||

389

(c >= 0x116d && c <= 0x116e) ||

390

(c >= 0x1172 && c <= 0x1173) ||

391

c == 0x1175 ||

392

c == 0x119e ||

393

c == 0x11a8 ||

394

c == 0x11ab ||

395

(c >= 0x11ae && c <= 0x11af) ||

396

(c >= 0x11b7 && c <= 0x11b8) ||

397

c == 0x11ba ||

398

(c >= 0x11bc && c <= 0x11c2) ||

399

c == 0x11eb ||

400

c == 0x11f0 ||

401

c == 0x11f9 ||

402

(c >= 0x1e00 && c <= 0x1e9b) ||

403

(c >= 0x1ea0 && c <= 0x1ef9) ||

404

(c >= 0x1f00 && c <= 0x1f15) ||

405

(c >= 0x1f18 && c <= 0x1f1d) ||

406

(c >= 0x1f20 && c <= 0x1f45) ||

407

(c >= 0x1f48 && c <= 0x1f4d) ||

408

(c >= 0x1f50 && c <= 0x1f57) ||

409

c == 0x1f59 ||

410

c == 0x1f5b ||

411

c == 0x1f5d ||

412

(c >= 0x1f5f && c <= 0x1f7d) ||

413

(c >= 0x1f80 && c <= 0x1fb4) ||

414

(c >= 0x1fb6 && c <= 0x1fbc) ||

415

c == 0x1fbe ||

416

(c >= 0x1fc2 && c <= 0x1fc4) ||

417

(c >= 0x1fc6 && c <= 0x1fcc) ||

418

(c >= 0x1fd0 && c <= 0x1fd3) ||

419

(c >= 0x1fd6 && c <= 0x1fdb) ||

420

(c >= 0x1fe0 && c <= 0x1fec) ||

421

(c >= 0x1ff2 && c <= 0x1ff4) ||

422

(c >= 0x1ff6 && c <= 0x1ffc) ||

423

c == 0x2126 ||

424

(c >= 0x212a && c <= 0x212b) ||

425

c == 0x212e ||

426

(c >= 0x2180 && c <= 0x2182) ||

427

(c >= 0x3041 && c <= 0x3094) ||

428

(c >= 0x30a1 && c <= 0x30fa) ||

429

(c >= 0x3105 && c <= 0x312c) ||

430

(c >= 0xac00 && c <= 0xd7a3) ||

431

(c >= 0x4e00 && c <= 0x9fa5) ||

432

c == 0x3007 ||

433

(c >= 0x3021 && c <= 0x3029) ||

434

(c >= 0x4e00 && c <= 0x9fa5) ||

435

c == 0x3007 ||

436

(c >= 0x3021 && c <= 0x3029));

437

}

438

439

Bool IsXMLNamechar(uint c)

440

{

441

return (IsXMLLetter(c) ||

442

c == '.' || c == '_' ||

443

c == ':' || c == '-' ||

444

(c >= 0x300 && c <= 0x345) ||

445

(c >= 0x360 && c <= 0x361) ||

446

(c >= 0x483 && c <= 0x486) ||

447

(c >= 0x591 && c <= 0x5a1) ||

448

(c >= 0x5a3 && c <= 0x5b9) ||

449

(c >= 0x5bb && c <= 0x5bd) ||

450

c == 0x5bf ||

451

(c >= 0x5c1 && c <= 0x5c2) ||

452

c == 0x5c4 ||

453

(c >= 0x64b && c <= 0x652) ||

454

c == 0x670 ||

455

(c >= 0x6d6 && c <= 0x6dc) ||

456

(c >= 0x6dd && c <= 0x6df) ||

457

(c >= 0x6e0 && c <= 0x6e4) ||

458

(c >= 0x6e7 && c <= 0x6e8) ||

459

(c >= 0x6ea && c <= 0x6ed) ||

460

(c >= 0x901 && c <= 0x903) ||

461

c == 0x93c ||

462

(c >= 0x93e && c <= 0x94c) ||

463

c == 0x94d ||

464

(c >= 0x951 && c <= 0x954) ||

465

(c >= 0x962 && c <= 0x963) ||

466

(c >= 0x981 && c <= 0x983) ||

467

c == 0x9bc ||

468

c == 0x9be ||

469

c == 0x9bf ||

470

(c >= 0x9c0 && c <= 0x9c4) ||

471

(c >= 0x9c7 && c <= 0x9c8) ||

472

(c >= 0x9cb && c <= 0x9cd) ||

473

c == 0x9d7 ||

474

(c >= 0x9e2 && c <= 0x9e3) ||

475

c == 0xa02 ||

476

c == 0xa3c ||

477

c == 0xa3e ||

478

c == 0xa3f ||

479

(c >= 0xa40 && c <= 0xa42) ||

480

(c >= 0xa47 && c <= 0xa48) ||

481

(c >= 0xa4b && c <= 0xa4d) ||

482

(c >= 0xa70 && c <= 0xa71) ||

483

(c >= 0xa81 && c <= 0xa83) ||

484

c == 0xabc ||

485

(c >= 0xabe && c <= 0xac5) ||

486

(c >= 0xac7 && c <= 0xac9) ||

487

(c >= 0xacb && c <= 0xacd) ||

488

(c >= 0xb01 && c <= 0xb03) ||

489

c == 0xb3c ||

490

(c >= 0xb3e && c <= 0xb43) ||

491

(c >= 0xb47 && c <= 0xb48) ||

492

(c >= 0xb4b && c <= 0xb4d) ||

493

(c >= 0xb56 && c <= 0xb57) ||

494

(c >= 0xb82 && c <= 0xb83) ||

495

(c >= 0xbbe && c <= 0xbc2) ||

496

(c >= 0xbc6 && c <= 0xbc8) ||

497

(c >= 0xbca && c <= 0xbcd) ||

498

c == 0xbd7 ||

499

(c >= 0xc01 && c <= 0xc03) ||

500

(c >= 0xc3e && c <= 0xc44) ||

501

(c >= 0xc46 && c <= 0xc48) ||

502

(c >= 0xc4a && c <= 0xc4d) ||

503

(c >= 0xc55 && c <= 0xc56) ||

504

(c >= 0xc82 && c <= 0xc83) ||

505

(c >= 0xcbe && c <= 0xcc4) ||

506

(c >= 0xcc6 && c <= 0xcc8) ||

507

(c >= 0xcca && c <= 0xccd) ||

508

(c >= 0xcd5 && c <= 0xcd6) ||

509

(c >= 0xd02 && c <= 0xd03) ||

510

(c >= 0xd3e && c <= 0xd43) ||

511

(c >= 0xd46 && c <= 0xd48) ||

512

(c >= 0xd4a && c <= 0xd4d) ||

513

c == 0xd57 ||

514

c == 0xe31 ||

515

(c >= 0xe34 && c <= 0xe3a) ||

516

(c >= 0xe47 && c <= 0xe4e) ||

517

c == 0xeb1 ||

518

(c >= 0xeb4 && c <= 0xeb9) ||

519

(c >= 0xebb && c <= 0xebc) ||

520

(c >= 0xec8 && c <= 0xecd) ||

521

(c >= 0xf18 && c <= 0xf19) ||

522

c == 0xf35 ||

523

c == 0xf37 ||

524

c == 0xf39 ||

525

c == 0xf3e ||

526

c == 0xf3f ||

527

(c >= 0xf71 && c <= 0xf84) ||

528

(c >= 0xf86 && c <= 0xf8b) ||

529

(c >= 0xf90 && c <= 0xf95) ||

530

c == 0xf97 ||

531

(c >= 0xf99 && c <= 0xfad) ||

532

(c >= 0xfb1 && c <= 0xfb7) ||

533

c == 0xfb9 ||

534

(c >= 0x20d0 && c <= 0x20dc) ||

535

c == 0x20e1 ||

536

(c >= 0x302a && c <= 0x302f) ||

537

c == 0x3099 ||

538

c == 0x309a ||

539

(c >= 0x30 && c <= 0x39) ||

540

(c >= 0x660 && c <= 0x669) ||

541

(c >= 0x6f0 && c <= 0x6f9) ||

542

(c >= 0x966 && c <= 0x96f) ||

543

(c >= 0x9e6 && c <= 0x9ef) ||

544

(c >= 0xa66 && c <= 0xa6f) ||

545

(c >= 0xae6 && c <= 0xaef) ||

546

(c >= 0xb66 && c <= 0xb6f) ||

547

(c >= 0xbe7 && c <= 0xbef) ||

548

(c >= 0xc66 && c <= 0xc6f) ||

549

(c >= 0xce6 && c <= 0xcef) ||

550

(c >= 0xd66 && c <= 0xd6f) ||

551

(c >= 0xe50 && c <= 0xe59) ||

552

(c >= 0xed0 && c <= 0xed9) ||

553

(c >= 0xf20 && c <= 0xf29) ||

554

c == 0xb7 ||

555

c == 0x2d0 ||

556

c == 0x2d1 ||

557

c == 0x387 ||

558

c == 0x640 ||

559

c == 0xe46 ||

560

c == 0xec6 ||

561

c == 0x3005 ||

562

(c >= 0x3031 && c <= 0x3035) ||

563

(c >= 0x309d && c <= 0x309e) ||

564

(c >= 0x30fc && c <= 0x30fe));

565

}

566

567

/* Non-zero when the lexmap entry for c carries the `lowercase` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsLower(uint c)
{
    return (Bool)(MAP(c) & lowercase);
}

573

574

/* Non-zero when the lexmap entry for c carries the `uppercase` bit.
** Characters >= 128 have no lexmap entry and always yield 0.
*/
Bool IsUpper(uint c)
{
    return (Bool)(MAP(c) & uppercase);
}

580

581

/* Map a character classified as `uppercase` in lexmap to its lower
** case counterpart by adding 'a' - 'A'; every other value is returned
** unchanged.
*/
uint ToLower(uint c)
{
    if ( !(MAP(c) & uppercase) )
        return c;

    return c + ('a' - 'A');
}

590

591

/* Map a character classified as `lowercase` in lexmap to its upper
** case counterpart by adding 'A' - 'a' (as an unsigned offset); every
** other value is returned unchanged.
*/
uint ToUpper(uint c)
{
    if ( !(MAP(c) & lowercase) )
        return c;

    return c + (uint) ('A' - 'a');
}

600

601

/* Fold the case of c unless the TidyXmlTags option is set, in which
** case c is returned untouched.  tocaps selects upper case; otherwise
** the character is forced to lower case.
*/
char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
{
    if ( cfgBool(doc, TidyXmlTags) )
        return c;

    if ( tocaps )
        return (tmbchar) ToUpper(c);

    /* force to lower case */
    return (tmbchar) ToLower(c);
}

616

617

618

619

return last character in string

620

this is useful when trailing quotemark

621

is missing on an attribute

622

623

static tmbchar LastChar( tmbstr str )

624

{

625

if ( str && *str )

626

{

627

int n = tmbstrlen(str);

628

return str[n-1];

629

}

630

return 0;

631

}

632

633

634

/*
  node->type is one of these:

  #define TextNode    1
  #define StartTag    2
  #define EndTag      3
  #define StartEndTag 4
*/

640

641

642

/* Allocate and initialise a lexer for doc.
**
** The structure is zeroed, the position is set to line 1 / column 1,
** the state to LEX_CONTENT, every version (including proprietary) is
** allowed, the doctype is unknown and root points at doc->root.
** Returns NULL when MemAlloc returns NULL.
*/
Lexer* NewLexer( TidyDocImpl* doc )
{
    Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) );

    if ( lexer == NULL )
        return NULL;

    ClearMemory( lexer, sizeof(Lexer) );

    lexer->state    = LEX_CONTENT;
    lexer->lines    = 1;
    lexer->columns  = 1;

    lexer->doctype  = VERS_UNKNOWN;
    lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
    lexer->root     = &doc->root;

    return lexer;
}

660

661

/* True when the input stream of doc has no pushed-back data
** (docIn->pushed is zero) and IsEOF() reports end of file.
** doc->docIn must not be NULL (asserted).
*/
Bool EndOfInput( TidyDocImpl* doc )
{
    assert( doc->docIn != NULL );

    /* De Morgan form of: !pushed && IsEOF */
    return !( doc->docIn->pushed || !IsEOF(doc->docIn) );
}

666

667

/* Release the lexer owned by doc, if any, and reset doc->lexer to NULL.
**
** Frees the styles, the pushed-back token (only when lexer->pushed is
** set), every entry left on the inline stack, the inline stack itself,
** the lexer buffer and finally the lexer structure.
*/
void FreeLexer( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;

    if ( lexer == NULL )
        return;

    FreeStyles( doc );

    if ( lexer->pushed )
        FreeNode( doc, lexer->token );

    /* drain the inline stack before releasing its storage */
    while ( lexer->istacksize > 0 )
        PopInline( doc, NULL );

    MemFree( lexer->istack );
    MemFree( lexer->lexbuf );
    MemFree( lexer );

    doc->lexer = NULL;
}

686

687

/* Lexer uses bigger memory chunks than pprint as

688

** it must hold the entire input document. not just

689

** the last line or three.

690

691

void AddByte( Lexer *lexer, tmbchar ch )

692

{

693

if ( lexer->lexsize + 2 >= lexer->lexlength )

694

{

695

tmbstr buf = NULL;

696

uint allocAmt = lexer->lexlength;

697

while ( lexer->lexsize + 2 >= allocAmt )

698

{

699

if ( allocAmt == 0 )

700

allocAmt = 8192;

701

else

702

allocAmt *= 2;

703

}

704

buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt );

705

if ( buf )

706

{

707

ClearMemory( buf + lexer->lexlength,

708

allocAmt - lexer->lexlength );

709

lexer->lexbuf = buf;

710

lexer->lexlength = allocAmt;

711

}

712

}

713

714

lexer->lexbuf[ lexer->lexsize++ ] = ch;

715

lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */

716

}

717

718

/* Overwrite the most recently stored byte of the lexer buffer with c.
** Does nothing while the buffer is empty.
*/
static void ChangeChar( Lexer *lexer, tmbchar c )
{
    if ( lexer->lexsize == 0 )
        return;

    lexer->lexbuf[ lexer->lexsize - 1 ] = c;
}

725

726

/* store character c as UTF-8 encoded byte stream */

727

void AddCharToLexer( Lexer *lexer, uint c )

728

{

729

int i, err, count = 0;

730

tmbchar buf[10] = {0};

731

732

err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );

733

if (err)

734

{

735

#if 0 && defined(_DEBUG)

736

fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );

737

#endif

738

/* replacement character 0xFFFD encoded as UTF-8 */

739

buf[0] = (byte) 0xEF;

740

buf[1] = (byte) 0xBF;

741

buf[2] = (byte) 0xBD;

742

count = 3;

743

}

744

745

for ( i = 0; i < count; ++i )

746

AddByte( lexer, buf[i] );

747

}

748

749

/* Append every byte of the NUL terminated string str to the lexer via
** AddCharToLexer(); the terminator itself is not stored.
*/
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
{
    ctmbstr p;

    /* Many (all?) compilers will sign-extend signed chars (the default) when
    ** converting them to unsigned integer values.  We must cast our char to
    ** unsigned char before widening it to prevent this from happening.
    */
    for ( p = str; *p != '\0'; ++p )
        AddCharToLexer( lexer, (uint)(unsigned char) *p );
}

760

761

762

No longer attempts to insert missing ';' for unknown

763

enitities unless one was present already, since this

764

gives unexpected results.

765

766

For example: <a href="something.htm?foo&bar&fred">

767

was tidied to: <a href="something.htm?foo&bar;&fred;">

768

rather than: <a href="something.htm?foo&bar&fred">

769

770

My thanks for Maurice Buxton for spotting this.

771

772

Also Randy Waki pointed out the following case for the

773

04 Aug 00 version (bug #433012):

774

775

For example: <a href="something.htm?id=1&lang=en">

776

was tidied to: <a href="something.htm?id=1&lang;=en">

777

rather than: <a href="something.htm?id=1&lang=en">

778

779

where "lang" is a known entity (#9001), but browsers would

780

misinterpret "&lang;" because it had a value > 256.

781

782

So the case of an apparently known entity with a value > 256 and

783

missing a semicolon is handled specially.

784

785

"ParseEntity" is also a bit of a misnomer - it handles entities and

786

numeric character references. Invalid NCR's are now reported.

787

788

static void ParseEntity( TidyDocImpl* doc, int mode )

789

{

790

uint start;

791

Bool first = yes, semicolon = no, found = no;

792

Bool isXml = cfgBool( doc, TidyXmlTags );

793

uint c, ch, startcol, entver = 0;

794

Lexer* lexer = doc->lexer;

795

796

start = lexer->lexsize - 1; /* to start at "&" */

797

startcol = doc->docIn->curcol - 1;

798

799

while ( (c = ReadChar(doc->docIn)) != EndOfStream )

800

{

801

if ( c == ';' )

802

{

803

semicolon = yes;

804

break;

805

}

806

807

if (first && c == '#')

808

{

809

#if SUPPORT_ASIAN_ENCODINGS

810

if ( !cfgBool(doc, TidyNCR) ||

811

cfg(doc, TidyInCharEncoding) == BIG5 ||

812

cfg(doc, TidyInCharEncoding) == SHIFTJIS )

813

{

814

UngetChar('#', doc->docIn);

815

return;

816

}

817

#endif

818

AddCharToLexer( lexer, c );

819

first = no;

820

continue;

821

}

822

823

first = no;

824

825

if ( IsNamechar(c) )

826

{

827

AddCharToLexer( lexer, c );

828

continue;

829

}

830

831

/* otherwise put it back */

832

833

UngetChar( c, doc->docIn );

834

break;

835

}

836

837

/* make sure entity is NULL terminated */

838

lexer->lexbuf[lexer->lexsize] = '\0';

839

840

/* Should contrain version to XML/XHTML if '

841

** is encountered. But this is not possible with

842

** Tidy's content model bit mask.

843

844

if ( tmbstrcmp(lexer->lexbuf+start, "&apos") == 0

845

&& !cfgBool(doc, TidyXmlOut)

846

&& !lexer->isvoyager

847

&& !cfgBool(doc, TidyXhtmlOut) )

848

ReportEntityError( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );

849

850

/* Lookup entity code and version

851

852

found = EntityInfo( lexer->lexbuf+start, isXml, &ch, &entver );

853

854

/* deal with unrecognized or invalid entities */

855

/* #433012 - fix by Randy Waki 17 Feb 01 */

856

/* report invalid NCR's - Terry Teague 01 Sep 01 */

857

if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )

858

{

859

/* set error position just before offending character */

860

lexer->lines = doc->docIn->curline;

861

lexer->columns = startcol;

862

863

if (lexer->lexsize > start + 1)

864

{

865

if (ch >= 128 && ch <= 159)

866

{

867

/* invalid numeric character reference */

868

869

uint c1 = 0;

870

int replaceMode = DISCARDED_CHAR;

871

872

if ( ReplacementCharEncoding == WIN1252 )

873

c1 = DecodeWin1252( ch );

874

else if ( ReplacementCharEncoding == MACROMAN )

875

c1 = DecodeMacRoman( ch );

876

877

if ( c1 )

878

replaceMode = REPLACED_CHAR;

879

880

if ( c != ';' ) /* issue warning if not terminated by ';' */

881

ReportEntityError( doc, MISSING_SEMICOLON_NCR,

882

lexer->lexbuf+start, c );

883

884

ReportEncodingError(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);

885

886

if ( c1 )

887

{

888

/* make the replacement */

889

lexer->lexsize = start;

890

AddCharToLexer( lexer, c1 );

891

semicolon = no;

892

}

893

else

894

{

895

/* discard */

896

lexer->lexsize = start;

897

semicolon = no;

898

}

899

900

}

901

else

902

ReportEntityError( doc, UNKNOWN_ENTITY,

903

lexer->lexbuf+start, ch );

904

905

if (semicolon)

906

AddCharToLexer( lexer, ';' );

907

}

908

else /* naked & */

909

ReportEntityError( doc, UNESCAPED_AMPERSAND,

910

lexer->lexbuf+start, ch );

911

}

912

else

913

{

914

if ( c != ';' ) /* issue warning if not terminated by ';' */

915

{

916

/* set error position just before offending chararcter */

917

lexer->lines = doc->docIn->curline;

918

lexer->columns = startcol;

919

ReportEntityError( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );

920

}

921

922

lexer->lexsize = start;

923

if ( ch == 160 && (mode & Preformatted) )

924

ch = ' ';

925

AddCharToLexer( lexer, ch );

926

927

if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )

928

AddStringToLexer( lexer, "amp;" );

929

930

/* Detect extended vs. basic entities */

931

ConstrainVersion( doc, entver );

932

}

933

}

934

935

/* Read the remainder of a tag name from the input into the lexer buffer.
**
** The first character of the name is already in the buffer at
** lexer->txtstart.  Unless TidyXmlTags is set, upper case characters
** are folded to lower case.  Reading stops at the first character that
** is not a name character (XML name character when TidyXmlTags is set)
** or at the end of the stream; that character is not stored.
** lexer->txtend is set to the new buffer size.
**
** Returns the character that ended the name, truncated to tmbchar.
*/
static tmbchar ParseTagName( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Bool xml = cfgBool(doc, TidyXmlTags);
    uint c = lexer->lexbuf[ lexer->txtstart ];

    /* fold case of first character in buffer */
    if (!xml && IsUpper(c))
        lexer->lexbuf[lexer->txtstart] = (tmbchar) ToLower(c);

    for (;;)
    {
        c = ReadChar(doc->docIn);
        if (c == EndOfStream)
            break;

        if (xml)
        {
            if (!IsXMLNamechar(c))
                break;
        }
        else
        {
            if (!IsNamechar(c))
                break;

            /* fold case of subsequent characters */
            if (IsUpper(c))
                c = ToLower(c);
        }

        AddCharToLexer(lexer, c);
    }

    lexer->txtend = lexer->lexsize;
    return (tmbchar) c;
}

961

962

963

Used for elements and text nodes

964

element name is NULL for text nodes

965

start and end are offsets into lexbuf

966

which contains the textual content of

967

all elements in the parse tree.

968

969

parent and content allow traversal

970

of the parse tree in any direction.

971

attributes are represented as a linked

972

list of AttVal nodes which hold the

973

strings for attribute/value pairs.

974

975

976

977

Node *NewNode(Lexer *lexer)

978

{

979

Node* node = (Node*) MemAlloc( sizeof(Node) );

980

ClearMemory( node, sizeof(Node) );

981

if ( lexer )

982

{

983

node->line = lexer->lines;

984

node->column = lexer->columns;

985

}

986

node->type = TextNode;

987

return node;

988

}

989

990

/* used to clone heading nodes when split by an <HR> */

991

Node *CloneNode( TidyDocImpl* doc, Node *element )

992

{

993

Lexer* lexer = doc->lexer;

994

Node *node = NewNode( lexer );

995

996

node->start = lexer->lexsize;

997

node->end = lexer->lexsize;

998

999

if ( element )

1000

{

1001

node->parent = element->parent;

1002

node->type = element->type;

1003

node->closed = element->closed;

1004

node->implicit = element->implicit;

1005

node->tag = element->tag;

1006

node->element = tmbstrdup( element->element );

1007

node->attributes = DupAttrs( doc, element->attributes );

1008

}

1009

return node;

1010

}

1011

1012

/* free node's attributes */

1013

void FreeAttrs( TidyDocImpl* doc, Node *node )

1014

{

1015

1016

while ( node->attributes )

1017

{

1018

AttVal *av = node->attributes;

1019

1020

if ( av->attribute )

1021

{

1022

if ( (attrIsID(av) || attrIsNAME(av)) &&

1023

IsAnchorElement(doc, node) )

1024

{

1025

RemoveAnchorByNode( doc, node );

1026

}

1027

}

1028

1029

node->attributes = av->next;

1030

FreeAttribute( doc, av );

1031

}

1032

}

1033

1034

/* doesn't repair attribute list linkage */

1035

void FreeAttribute( TidyDocImpl* doc, AttVal *av )

1036

{

1037

FreeNode( doc, av->asp );

1038

FreeNode( doc, av->php );

1039

MemFree( av->attribute );

1040

MemFree( av->value );

1041

MemFree( av );

1042

}

1043

1044

/* detach attribute from node

1045

1046

/*
  Unlink attr from node's attribute list without freeing it.
  Nothing happens when attr is not on the list. attr->next is
  left untouched.
*/
void DetachAttribute( Node *node, AttVal *attr )
{
    /* link points at the pointer that refers to the current entry */
    AttVal **link = &node->attributes;

    while ( *link != NULL )
    {
        if ( *link == attr )
        {
            *link = attr->next;
            return;
        }
        link = &(*link)->next;
    }
}

1063

1064

/* detach attribute from node then free it */

1065

1066

/*
  Convenience wrapper: unlink attr from node's attribute list with
  DetachAttribute(), then release it with FreeAttribute().
*/
void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr )
{
    /* step 1: take it off the list */
    DetachAttribute( node, attr );

    /* step 2: release its storage */
    FreeAttribute( doc, attr );
}

1071

1072

1073

/*
  Free document nodes by iterating through peers and recursing
  through children. Set next to NULL before calling FreeNode()
  to avoid freeing peer nodes. Doesn't patch up prev/next links.
*/

1076

1077

/*
  Release node and all of its following peers. For each node the
  attributes, the content subtree and the element name are freed
  (plus the stored original text when TIDY_STORE_ORIGINAL_TEXT is
  defined). A node of type RootNode is not freed itself; only its
  content pointer is reset to NULL.
*/
void FreeNode( TidyDocImpl* doc, Node *node )
{
    Node* peer;

    for ( ; node != NULL; node = peer )
    {
        /* fetch the successor before this node can be released */
        peer = node->next;

        FreeAttrs( doc, node );
        FreeNode( doc, node->content );
        MemFree( node->element );
#ifdef TIDY_STORE_ORIGINAL_TEXT
        if (node->otext)
            MemFree(node->otext);
#endif
        if ( node->type == RootNode )
            node->content = NULL;
        else
            MemFree( node );
    }
}

1098

1099

#ifdef TIDY_STORE_ORIGINAL_TEXT

1100

/*
  Move the original input text collected in doc->docIn->otextbuf
  into node->otext, leaving the last `count` characters behind in
  the input buffer.

  Does nothing when text storing is disabled, when count is not
  smaller than the collected length, or when the buffer size is 0.
  With count == 0 the whole buffer is handed to the node and the
  input buffer is reset; otherwise the buffer is split into two
  freshly allocated, NUL-terminated strings.
*/
void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
{
    if (!doc->storeText
        || count >= doc->docIn->otextlen
        || !doc->docIn->otextsize)
        return;

    if (count == 0)
    {
        /* nothing to keep back: the node takes over the buffer */
        node->otext = doc->docIn->otextbuf;
        doc->docIn->otextbuf = NULL;
        doc->docIn->otextlen = 0;
        doc->docIn->otextsize = 0;
        return;
    }

    {
        uint total = doc->docIn->otextlen;
        uint keep = total - count;            /* characters going to the node */
        tmbstr head = (tmbstr)MemAlloc(keep + 1);
        tmbstr tail = (tmbstr)MemAlloc(count + 1);
        uint k;

        for (k = 0; k < keep; ++k)
            head[k] = doc->docIn->otextbuf[k];
        head[keep] = 0;

        for (k = 0; k < count; ++k)
            tail[k] = doc->docIn->otextbuf[keep + k];
        tail[count] = 0;

        MemFree(doc->docIn->otextbuf);
        node->otext = head;
        doc->docIn->otextbuf = tail;
        doc->docIn->otextlen = count;
        doc->docIn->otextsize = count + 1;
    }
}

1144

#endif

1145

1146

/*
  Create a node (type TextNode, as set by NewNode) that spans the
  lexer's current text range [txtstart, txtend).
*/
Node* TextToken( Lexer *lexer )
{
    Node *text = NewNode( lexer );

    text->end = lexer->txtend;
    text->start = lexer->txtstart;

    return text;
}

1153

1154

/* used for creating preformatted text from Word2000 */

1155

/*
  Append a single '\n' to the lexer buffer and return a new node
  whose start/end offsets bracket that character.
*/
Node *NewLineNode( Lexer *lexer )
{
    Node *nl = NewNode( lexer );

    /* offset before the append marks the start ... */
    nl->start = lexer->lexsize;
    AddCharToLexer( lexer, (uint)'\n' );
    /* ... and the offset after it marks the end */
    nl->end = lexer->lexsize;

    return nl;
}

1163

1164

/* used for adding a &nbsp; for Word2000 */

1165

/*
  Append txt to the lexer buffer via AddStringToLexer() and return
  a new node whose start/end offsets bracket the appended text.
*/
Node* NewLiteralTextNode( Lexer *lexer, ctmbstr txt )
{
    Node *lit = NewNode( lexer );

    /* record the buffer offsets on either side of the append */
    lit->start = lexer->lexsize;
    AddStringToLexer( lexer, txt );
    lit->end = lexer->lexsize;

    return lit;
}

1173

1174

/*
  Create a tag node of the given type. The element name is copied
  from the lexer's current text range [txtstart, txtend). Both
  start and end are set to txtstart. For start, start-end and end
  tags, FindTag() is called on the new node.
*/
static Node* TagToken( TidyDocImpl* doc, NodeType type )
{
    Lexer* lex = doc->lexer;
    Node* tok = NewNode( lex );

    tok->type = type;
    tok->element = tmbstrndup( lex->lexbuf + lex->txtstart,
                               lex->txtend - lex->txtstart );
    tok->start = lex->txtstart;
    tok->end = lex->txtstart;

    switch ( type )
    {
    case StartTag:
    case StartEndTag:
    case EndTag:
        FindTag(doc, tok);
        break;
    default:
        break;
    }

    return tok;
}

1189

1190

/*
  Create a node of the given type spanning the lexer's current
  text range [txtstart, txtend). When TIDY_STORE_ORIGINAL_TEXT is
  defined, the collected original text is attached to the node
  (count 0).
*/
static Node* NewToken(TidyDocImpl* doc, NodeType type)
{
    Lexer* lex = doc->lexer;
    Node* tok = NewNode(lex);

    tok->type = type;
    tok->end = lex->txtend;
    tok->start = lex->txtstart;
#ifdef TIDY_STORE_ORIGINAL_TEXT
    StoreOriginalTextInToken(doc, tok, 0);
#endif

    return tok;
}

1202

1203

/* Shorthands that create a token of a fixed node type via NewToken(). */
#define CommentToken(doc) NewToken(doc, CommentTag)

1204

#define DocTypeToken(doc) NewToken(doc, DocTypeTag)

1205

#define PIToken(doc) NewToken(doc, ProcInsTag)

1206

#define AspToken(doc) NewToken(doc, AspTag)

1207

#define JsteToken(doc) NewToken(doc, JsteTag)

1208

#define PhpToken(doc) NewToken(doc, PhpTag)

1209

#define XmlDeclToken(doc) NewToken(doc, XmlDecl)

1210

#define SectionToken(doc) NewToken(doc, SectionTag)

1211

#define CDATAToken(doc) NewToken(doc, CDATATag)

1212

1213

/*
  Append every byte of the NUL-terminated string str to the lexer
  buffer, one AddCharToLexer() call per byte. The terminator is
  not appended.
*/
void AddStringLiteral( Lexer* lexer, ctmbstr str )
{
    byte ch;

    for ( ch = *str; ch != 0; ch = *++str )
        AddCharToLexer( lexer, ch );
}

1219

1220

/*
  Append at most len bytes of str to the lexer buffer, stopping
  early at a NUL byte. Nothing is appended when len <= 0.
*/
void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
{
    int remaining = len;
    byte ch;

    while ( remaining > 0 && (ch = *str++) != 0 )
    {
        AddCharToLexer(lexer, ch);
        --remaining;
    }
}

1228

1229

/* find doctype element */

1230

/*
  Return the first child of the document root whose type is
  DocTypeTag, or NULL when there is none (or doc is NULL).
*/
Node *FindDocType( TidyDocImpl* doc )
{
    Node* cur = doc ? doc->root.content : NULL;

    while ( cur != NULL && cur->type != DocTypeTag )
        cur = cur->next;

    return cur;
}

1239

1240

/* find parent container element */

1241

/*
  Walk up from node's parent and return the first ancestor that
  does not have the CM_INLINE content model. Returns NULL when
  node is NULL or every ancestor is inline.
*/
Node* FindContainer( Node* node )
{
    Node* up = node ? node->parent : NULL;

    while ( up && nodeHasCM(up, CM_INLINE) )
        up = up->parent;

    return up;
}

1250

1251

1252

/* find html element */

1253

/*
  Return the first child of the document root that is an html
  element, or NULL when there is none (or doc is NULL).
*/
Node *FindHTML( TidyDocImpl* doc )
{
    Node *cur = doc ? doc->root.content : NULL;

    while ( cur && !nodeIsHTML(cur) )
        cur = cur->next;

    return cur;
}

1263

1264

/* find XML Declaration */

1265

/*
  Return the first child of the document root whose type is
  XmlDecl, or NULL when there is none (or doc is NULL).
*/
Node *FindXmlDecl(TidyDocImpl* doc)
{
    Node *cur = doc ? doc->root.content : NULL;

    while ( cur != NULL && cur->type != XmlDecl )
        cur = cur->next;

    return cur;
}

1275

1276

1277

/*
  Return the first head element among the children of the html
  element, or NULL when either element is missing.
*/
Node *FindHEAD( TidyDocImpl* doc )
{
    Node *html = FindHTML( doc );
    Node *child;

    if ( html == NULL )
        return NULL;

    child = html->content;
    while ( child && !nodeIsHEAD(child) )
        child = child->next;

    return child;
}

1291

1292

/*
  Return the first title element among the children of the head
  element, or NULL when either element is missing.
*/
Node *FindTITLE(TidyDocImpl* doc)
{
    Node *head = FindHEAD(doc);
    Node *child;

    if (head == NULL)
        return NULL;

    child = head->content;
    while (child && !nodeIsTITLE(child))
        child = child->next;

    return child;
}

1303

1304

/*
  Locate the body element.

  The html element is searched for among the root's children; its
  children are then scanned for the first body or frameset. For a
  frameset, the search continues into its first noframes child and
  returns the first body found there. Returns NULL whenever a step
  of that chain is missing.
*/
Node *FindBody( TidyDocImpl* doc )
{
    Node *cur = ( doc ? doc->root.content : NULL );

    /* locate <html> */
    for ( ; cur && !nodeIsHTML(cur); cur = cur->next )
        /**/;

    if ( cur == NULL )
        return NULL;

    /* first <body> or <frameset> below <html> */
    for ( cur = cur->content;
          cur && !nodeIsBODY(cur) && !nodeIsFRAMESET(cur);
          cur = cur->next )
        /**/;

    if ( cur == NULL || !nodeIsFRAMESET(cur) )
        return cur;

    /* frameset document: descend into <noframes> */
    for ( cur = cur->content; cur && !nodeIsNOFRAMES(cur); cur = cur->next )
        /**/;

    if ( cur == NULL )
        return NULL;

    for ( cur = cur->content; cur && !nodeIsBODY(cur); cur = cur->next )
        /**/;

    return cur;
}

1334

1335

/* add meta element for Tidy */

1336

/*
  Maintain the <meta name="generator"> element in the head.

  If a generator meta already exists whose content starts with
  "HTML Tidy" (case-insensitive), its content is replaced with the
  current version string and `no` is returned. Otherwise, when the
  accessibility check level is 0, a new meta element is inserted at
  the start of the head and `yes` is returned. Returns `no` when
  there is no head element or nothing was inserted.
*/
Bool AddGenerator( TidyDocImpl* doc )
{
    Node *head = FindHEAD( doc );
    Node *meta;
    AttVal *attr;
    tmbchar text[256];

    if ( head == NULL )
        return no;

#ifdef PLATFORM_NAME
    tmbsnprintf(text, sizeof(text), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
                tidyReleaseDate());
#else
    tmbsnprintf(text, sizeof(text), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
#endif

    for ( meta = head->content; meta != NULL; meta = meta->next )
    {
        if ( !nodeIsMETA(meta) )
            continue;

        attr = AttrGetById(meta, TidyAttr_NAME);

        if (AttrValueIs(attr, "generator"))
        {
            attr = AttrGetById(meta, TidyAttr_CONTENT);

            if (AttrHasValue(attr) &&
                tmbstrncasecmp(attr->value, "HTML Tidy", 9) == 0)
            {
                /* refresh the existing content so that it names the */
                /* version of Tidy that is actually running           */
                MemFree(attr->value);
                attr->value = tmbstrdup(text);
                return no;
            }
        }
    }

    if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
    {
        meta = InferredTag(doc, TidyTag_META);
        AddAttribute( doc, meta, "name", "generator" );
        AddAttribute( doc, meta, "content", text );
        InsertNodeAtStart( head, meta );
        return yes;
    }

    return no;
}

1388

1389

/* examine <!DOCTYPE> to identify version */

1390

/*
  Determine the version announced by the doctype's PUBLIC
  identifier.

  Returns VERS_UNKNOWN when the doctype has no PUBLIC attribute or
  that attribute has no value. For an XHTML version, the XmlOut and
  XhtmlOut options are switched on and the lexer is flagged as
  "voyager". The stored identifier string is then replaced by a
  copy of GetFPIFromVers() for the detected version.
*/
uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
{
    AttVal * publicId = GetAttrByName(doctype, "PUBLIC");
    uint found;

    if (publicId == NULL || publicId->value == NULL)
        return VERS_UNKNOWN;

    found = GetVersFromFPI(publicId->value);

    if (VERS_XHTML & found)
    {
        SetOptionBool(doc, TidyXmlOut, yes);
        SetOptionBool(doc, TidyXhtmlOut, yes);
        doc->lexer->isvoyager = yes;
    }

    /* todo: add a warning if case does not match? */
    MemFree(publicId->value);
    publicId->value = tmbstrdup(GetFPIFromVers(found));

    return found;
}

1413

1414

/* return guessed version */

1415

/*
  Return the version the document appears to be: the declared
  doctype when it is XH11 or XB10 and still among the versions the
  lexer allows, otherwise whatever HTMLVersion() reports.
*/
uint ApparentVersion( TidyDocImpl* doc )
{
    Lexer* lex = doc->lexer;

    if ( (lex->doctype == XH11 || lex->doctype == XB10)
         && (lex->versions & lex->doctype) )
    {
        return lex->doctype;
    }

    return HTMLVersion(doc);
}

1424

1425

/*
  Map a version code to its display name via GetNameFromVers(),
  falling back to "HTML Proprietary" when no name is known. The
  isXhtml argument is unused (the related test lives in
  ReportMarkupVersion() in localize.c, for localization reasons).
*/
ctmbstr HTMLVersionNameFromCode( uint vers, Bool ARG_UNUSED(isXhtml) )
{
    ctmbstr label = GetNameFromVers(vers);

    return label ? label : "HTML Proprietary";
}

1437

1438

/* Put DOCTYPE declaration between the
** <?xml version "1.0" ... ?> declaration, if any,
** and the <html> tag.  Should also work for any comments,
** etc. that may precede the <html> tag.
*/

1442

1443

1444

/*
  Create an empty DocTypeTag node and link it into the root's
  child list immediately before the html element. Returns NULL
  (and creates nothing) when the document has no html element.
*/
static Node* NewDocTypeNode( TidyDocImpl* doc )
{
    Node* root = &doc->root;
    Node* html = FindHTML( doc );
    Node* dt;

    if ( html == NULL )
        return NULL;

    dt = NewNode( NULL );
    dt->type = DocTypeTag;
    dt->parent = root;
    dt->next = html;

    if ( root->content == html )
    {
        /* html was the first child: the doctype becomes the new head */
        root->content = dt;
        dt->prev = NULL;
    }
    else
    {
        /* something (e.g. an <?xml ... ?> declaration) precedes html */
        dt->prev = html->prev;
        dt->prev->next = dt;
    }

    html->prev = dt;
    return dt;
}

1473

1474

/*
  Install or repair the DOCTYPE for XHTML output according to the
  TidyDoctypeMode option, and record the emitted version in
  lexer->versionEmitted.

  - Omit:   any existing doctype is discarded; returns yes.
  - User:   returns no when no doctype string is configured;
            otherwise PUBLIC is set to that string and SYSTEM to "".
  - Strict / Loose: PUBLIC and SYSTEM are set for XHTML 1.0 Strict /
            Transitional.
  - Auto:   the identifiers are chosen from the versions the lexer
            still allows; when none fits the doctype is discarded.

  A missing doctype node is created in front of the html element.
  If that is impossible because the document has no html element,
  the function returns no instead of dereferencing a NULL node.

  Returns yes for Omit and for the Auto cases that keep a declared
  XHTML 1.1 / XHTML Basic doctype; no in every other case.
*/
Bool SetXHTMLDocType( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Node *doctype = FindDocType( doc );
    TidyDoctypeModes dtmode = cfg(doc, TidyDoctypeMode);
    ctmbstr pub = "PUBLIC";
    ctmbstr sys = "SYSTEM";

    lexer->versionEmitted = ApparentVersion( doc );

    if (dtmode == TidyDoctypeOmit)
    {
        if (doctype)
            DiscardElement(doc, doctype);
        return yes;
    }

    if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
        return no;

    if (!doctype)
    {
        doctype = NewDocTypeNode(doc);

        /* NewDocTypeNode() yields NULL when there is no <html> element */
        if (!doctype)
            return no;

        doctype->element = tmbstrdup("html");
    }
    else
    {
        doctype->element = tmbstrtolower(doctype->element);
    }

    switch(dtmode)
    {
    case TidyDoctypeStrict:
        /* XHTML 1.0 Strict */
        RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10S));
        RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10S));
        lexer->versionEmitted = X10S;
        break;
    case TidyDoctypeLoose:
        /* XHTML 1.0 Transitional */
        RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10T));
        RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10T));
        lexer->versionEmitted = X10T;
        break;
    case TidyDoctypeUser:
        /* user defined document type declaration */
        RepairAttrValue(doc, doctype, pub, cfgStr(doc, TidyDoctype));
        RepairAttrValue(doc, doctype, sys, "");
        break;
    case TidyDoctypeAuto:
        if (lexer->versions & XH11 && lexer->doctype == XH11)
        {
            /* declared XHTML 1.1 is still valid: only add a missing SYSTEM id */
            if (!GetAttrByName(doctype, sys))
                RepairAttrValue(doc, doctype, sys, GetSIFromVers(XH11));
            lexer->versionEmitted = XH11;
            return yes;
        }
        else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
        {
            RepairAttrValue(doc, doctype, pub, GetFPIFromVers(XH11));
            RepairAttrValue(doc, doctype, sys, GetSIFromVers(XH11));
            lexer->versionEmitted = XH11;
        }
        else if (lexer->versions & XB10 && lexer->doctype == XB10)
        {
            /* declared XHTML Basic is still valid: only add a missing SYSTEM id */
            if (!GetAttrByName(doctype, sys))
                RepairAttrValue(doc, doctype, sys, GetSIFromVers(XB10));
            lexer->versionEmitted = XB10;
            return yes;
        }
        else if (lexer->versions & VERS_HTML40_STRICT)
        {
            RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10S));
            RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10S));
            lexer->versionEmitted = X10S;
        }
        else if (lexer->versions & VERS_FRAMESET)
        {
            RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10F));
            RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10F));
            lexer->versionEmitted = X10F;
        }
        else if (lexer->versions & VERS_LOOSE)
        {
            RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10T));
            RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10T));
            lexer->versionEmitted = X10T;
        }
        else
        {
            /* no suitable version: drop the doctype (non-NULL at this point) */
            DiscardElement(doc, doctype);
            return no;
        }
        break;
    default:
        /* TidyDoctypeOmit was handled above */
        break;
    }

    return no;
}

1573

1574

/* fixup doctype if missing */

1575

/*
  Fix up the DOCTYPE for HTML output according to TidyDoctypeMode,
  and record the emitted version in lexer->versionEmitted.

  - Auto with an existing doctype that is among the allowed
    versions (and is not an XHTML doctype on a non-voyager
    document): the doctype is kept as is; returns yes.
  - Omit: any existing doctype is discarded; returns yes.
  - XML output enabled: nothing further is done; returns yes.
  - Strict / Loose: an existing doctype is discarded and a new one
    for H41S / H41T is created.
  - Auto: the version is taken from HTMLVersion().

  Returns no when no version could be determined (this includes
  User mode, for which no version is guessed here), or when a new
  doctype node is needed but the document has no html element to
  attach it to. The SYSTEM identifier is only written when the
  original doctype had one.
*/
Bool FixDocType( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    Node* doctype = FindDocType( doc );
    uint dtmode = cfg( doc, TidyDoctypeMode );
    uint guessed = VERS_UNKNOWN;
    Bool hadSI = no;

    /* doctype was looked up just above; no need to search again */
    if (dtmode == TidyDoctypeAuto &&
        lexer->versions & lexer->doctype &&
        !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
        && doctype)
    {
        lexer->versionEmitted = lexer->doctype;
        return yes;
    }

    if (dtmode == TidyDoctypeOmit)
    {
        if (doctype)
            DiscardElement( doc, doctype );
        lexer->versionEmitted = ApparentVersion( doc );
        return yes;
    }

    if (cfgBool(doc, TidyXmlOut))
        return yes;

    if (doctype)
        hadSI = GetAttrByName(doctype, "SYSTEM") != NULL;

    if ((dtmode == TidyDoctypeStrict ||
         dtmode == TidyDoctypeLoose) && doctype)
    {
        DiscardElement(doc, doctype);
        doctype = NULL;
    }

    switch (dtmode)
    {
    case TidyDoctypeStrict:
        guessed = H41S;
        break;
    case TidyDoctypeLoose:
        guessed = H41T;
        break;
    case TidyDoctypeAuto:
        guessed = HTMLVersion(doc);
        break;
    default:
        /* other modes: guessed stays VERS_UNKNOWN */
        break;
    }

    lexer->versionEmitted = guessed;
    if (guessed == VERS_UNKNOWN)
        return no;

    if (doctype)
    {
        doctype->element = tmbstrtolower(doctype->element);
    }
    else
    {
        doctype = NewDocTypeNode(doc);

        /* NewDocTypeNode() yields NULL when there is no <html> element */
        if (!doctype)
            return no;

        doctype->element = tmbstrdup("html");
    }

    RepairAttrValue(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));

    if (hadSI)
        RepairAttrValue(doc, doctype, "SYSTEM", GetSIFromVers(guessed));

    return yes;
}

1647

1648

/* ensure XML document starts with <?xml version="1.0"?> */

1649

/* add encoding attribute if not using ASCII or UTF-8 output */

1650

/*
  Make sure the document starts with an XML declaration.

  If the root's first child is not already an XmlDecl node, a new
  one is created and linked in as the first child of the root
  (with its parent pointer set to the root, like the other root
  children built in this file). A missing "encoding" attribute is
  added when the output encoding is not UTF8 and a name for it is
  known; a missing "version" attribute is added as "1.0".

  Always returns yes.
*/
Bool FixXmlDecl( TidyDocImpl* doc )
{
    Node* xml;
    AttVal *version, *encoding;
    Lexer* lexer = doc->lexer;
    Node* root = &doc->root;

    if ( root->content && root->content->type == XmlDecl )
    {
        xml = root->content;
    }
    else
    {
        xml = NewNode(lexer);
        xml->type = XmlDecl;
        xml->parent = root;
        xml->next = root->content;

        if ( root->content )
            root->content->prev = xml;

        root->content = xml;
    }

    version = GetAttrByName(xml, "version");
    encoding = GetAttrByName(xml, "encoding");

    /*
      We need to insert a check if declared encoding
      and output encoding mismatch and fix the XML
      declaration accordingly!!!
    */

    if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
    {
        ctmbstr enc = GetEncodingNameFromTidyId(cfg(doc, TidyOutCharEncoding));
        if ( enc )
            AddAttribute( doc, xml, "encoding", enc );
    }

    if ( version == NULL )
        AddAttribute( doc, xml, "version", "1.0" );

    return yes;
}

1696

1697

/*
  Create an implicit start-tag node for the tag with the given id.
  The tag definition must exist (asserted). The node's element
  name is a copy of the definition's name, and its start/end are
  taken from the lexer's current text range.
*/
Node* InferredTag(TidyDocImpl* doc, TidyTagId id)
{
    Lexer *lex = doc->lexer;
    Node *inferred = NewNode( lex );
    const Dict* def = LookupTagDef(id);

    assert( def != NULL );

    inferred->tag = def;
    inferred->element = tmbstrdup(def->name);
    inferred->type = StartTag;
    inferred->implicit = yes;
    inferred->start = lex->txtstart;
    inferred->end = lex->txtend;

    return inferred;
}

1714

1715

/*
  Tell whether node is a start tag that may have content: yes for
  start tags of unknown elements (no tag definition) and of
  elements whose content model lacks CM_EMPTY; no otherwise.
*/
Bool ExpectsContent(Node *node)
{
    if (node->type != StartTag)
        return no;

    /* unknown element: assume it takes content */
    if (node->tag == NULL)
        return yes;

    return (node->tag->model & CM_EMPTY) ? no : yes;
}

1729

1730

1731

/*
  create a text node for the contents of
  a CDATA element like style or script
  which ends with </foo> for some foo.
*/

1734

1735

1736

/* Scanner states used by GetCDATA(): plain content, inside the name
   after '<' + letter, and inside the name after '<' '/' + letter. */
#define CDATA_INTERMEDIATE 1

1737

#define CDATA_STARTTAG 2

1738

#define CDATA_ENDTAG 3

1739

1740

/*
  Read the raw content of a CDATA element (container) up to its
  matching end tag and return it as a text token.

  The input is scanned with a three-state machine:
    CDATA_INTERMEDIATE - ordinary content, watching for '<'
    CDATA_STARTTAG     - reading a name after '<' + letter
    CDATA_ENDTAG       - reading a name after '</' (or '<\/') + letter

  Start tags whose name begins with the container's element name
  increase the nesting depth; a matching end tag at depth 0 ends
  the content, and the "</name" characters are pushed back onto the
  input. Any other end tag not preceded by a backslash is reported
  as BAD_CDATA_CONTENT and, for JavaScript containers, gets a
  backslash inserted before its '/'.

  Returns NULL (with the input pushed back) when a script element
  that has a src attribute and only white space so far is followed
  by '<' + letter. Reports MISSING_ENDTAG_FOR when the input ends
  before the end tag is found.
*/
Node *GetCDATA( TidyDocImpl* doc, Node *container )
{
    Lexer* lexer = doc->lexer;
    uint start = 0;
    int nested = 0;
    int state = CDATA_INTERMEDIATE;
    uint i;
    Bool isEmpty = yes;
    Bool matches = no;
    uint c;
    Bool hasSrc = AttrGetById(container, TidyAttr_SRC) != NULL;

    lexer->lines = doc->docIn->curline;
    lexer->columns = doc->docIn->curcol;
    lexer->waswhite = no;
    lexer->txtstart = lexer->txtend = lexer->lexsize;

    /* seen start tag, look for matching end tag */
    while ((c = ReadChar(doc->docIn)) != EndOfStream)
    {
        AddCharToLexer(lexer, c);
        lexer->txtend = lexer->lexsize;

        if (state == CDATA_INTERMEDIATE)
        {
            if (c != '<')
            {
                if (isEmpty && !IsWhite(c))
                    isEmpty = no;
                continue;
            }

            c = ReadChar(doc->docIn);

            if (IsLetter(c))
            {
                /* <head><script src=foo><meta name=foo content=bar>*/
                if (hasSrc && isEmpty && nodeIsSCRIPT(container))
                {
                    /* a missing end tag is deliberately not reported here */
                    lexer->lexsize = lexer->txtstart;
                    UngetChar(c, doc->docIn);
                    UngetChar('<', doc->docIn);
                    return NULL;
                }
                AddCharToLexer(lexer, c);
                start = lexer->lexsize - 1;
                state = CDATA_STARTTAG;
            }
            else if (c == '/')
            {
                AddCharToLexer(lexer, c);

                c = ReadChar(doc->docIn);

                if (!IsLetter(c))
                {
                    UngetChar(c, doc->docIn);
                    continue;
                }
                UngetChar(c, doc->docIn);

                start = lexer->lexsize;
                state = CDATA_ENDTAG;
            }
            else if (c == '\\')
            {
                /* recognize document.write("<script><\/script>") */
                AddCharToLexer(lexer, c);

                c = ReadChar(doc->docIn);

                if (c != '/')
                {
                    UngetChar(c, doc->docIn);
                    continue;
                }

                AddCharToLexer(lexer, c);
                c = ReadChar(doc->docIn);

                if (!IsLetter(c))
                {
                    UngetChar(c, doc->docIn);
                    continue;
                }
                UngetChar(c, doc->docIn);

                start = lexer->lexsize;
                state = CDATA_ENDTAG;
            }
            else
            {
                UngetChar(c, doc->docIn);
            }
        }
        /* '<' + Letter found */
        else if (state == CDATA_STARTTAG)
        {
            if (IsLetter(c))
                continue;

            matches = tmbstrncasecmp(container->element, lexer->lexbuf + start,
                                     tmbstrlen(container->element)) == 0;
            if (matches)
                nested++;

            state = CDATA_INTERMEDIATE;
        }
        /* '<' + '/' + Letter found */
        else if (state == CDATA_ENDTAG)
        {
            if (IsLetter(c))
                continue;

            matches = tmbstrncasecmp(container->element, lexer->lexbuf + start,
                                     tmbstrlen(container->element)) == 0;

            if (isEmpty && !matches)
            {
                /* a missing end tag is deliberately not reported here */

                /* push back lexbuf[lexsize-1 .. start]; counting with
                   i > start avoids unsigned wrap-around at start == 0 */
                for (i = lexer->lexsize; i > start; --i)
                    UngetChar((uint)lexer->lexbuf[i - 1], doc->docIn);
                UngetChar('/', doc->docIn);
                UngetChar('<', doc->docIn);
                break;
            }

            if (matches && nested-- <= 0)
            {
                for (i = lexer->lexsize; i > start; --i)
                    UngetChar((uint)lexer->lexbuf[i - 1], doc->docIn);
                UngetChar('/', doc->docIn);
                UngetChar('<', doc->docIn);
                /* drop the name and the "</" in front of it from the buffer */
                lexer->lexsize -= (lexer->lexsize - start) + 2;
                break;
            }
            else if (lexer->lexbuf[start - 2] != '\\')
            {
                /* if the end tag is not already escaped using backslash */
                lexer->lines = doc->docIn->curline;
                lexer->columns = doc->docIn->curcol - 3;
                ReportError(doc, NULL, NULL, BAD_CDATA_CONTENT);

                /* if javascript insert backslash before / */
                if (IsJavaScript(container))
                {
                    for (i = lexer->lexsize; i > start-1; --i)
                        lexer->lexbuf[i] = lexer->lexbuf[i-1];

                    lexer->lexbuf[start-1] = '\\';
                    lexer->lexsize++;
                }
            }
            state = CDATA_INTERMEDIATE;
        }
    }

    if (isEmpty)
        lexer->lexsize = lexer->txtstart = lexer->txtend;
    else
        lexer->txtend = lexer->lexsize;

    if (c == EndOfStream)
        ReportError(doc, container, NULL, MISSING_ENDTAG_FOR );

    /* a text token is returned even when the range is empty */
    return TextToken(lexer);
}

1911

1912

/*
  Mark the lexer's current token as pushed back, so that
  GetToken() can hand it out again.
*/
void UngetToken( TidyDocImpl* doc )
{
    Lexer* lex = doc->lexer;

    lex->pushed = yes;
}

1916

1917

/* CondReturnTextNode(doc, skip): when the lexer's text range is
   non-empty (txtend > txtstart), build a text token, store it in
   lexer->token and return it from the enclosing function. It relies
   on a variable named `lexer` in the caller and is invoked without a
   trailing semicolon. With TIDY_STORE_ORIGINAL_TEXT defined, the
   original text is attached to the token first, passing `skip` as the
   count; in the other variant `doc` and `skip` are unused. */
#ifdef TIDY_STORE_ORIGINAL_TEXT

1918

#define CondReturnTextNode(doc, skip) \

1919

if (lexer->txtend > lexer->txtstart) \

1920

{ \

1921

lexer->token = TextToken(lexer); \

1922

StoreOriginalTextInToken(doc, lexer->token, skip); \

1923

return lexer->token; \

1924

}

1925

#else

1926

#define CondReturnTextNode(doc, skip) \

1927

if (lexer->txtend > lexer->txtstart) \

1928

{ \

1929

lexer->token = TextToken(lexer); \

1930

return lexer->token; \

1931

}

1932

#endif

1933

1934

1935

/*
  modes for GetToken()

  MixedContent   -- for elements which don't accept PCDATA
  Preformatted   -- white space preserved as is
  IgnoreMarkup   -- for CDATA elements such as script, style
*/

1940

1941

1942

Node* GetToken( TidyDocImpl* doc, uint mode )

1943

{

1944

Lexer* lexer = doc->lexer;

1945

uint c, badcomment = 0;

1946

Bool isempty = no;

1947

AttVal *attributes = NULL;

1948

1949

if (lexer->pushed)

1950

{

1951

/* duplicate inlines in preference to pushed text nodes when appropriate */

1952

if (lexer->token->type != TextNode || (!lexer->insert && !lexer->inode))

1953

{

1954

lexer->pushed = no;

1955

return lexer->token;

1956

}

1957

}

1958

1959

/* at start of block elements, unclosed inline

1960

elements are inserted into the token stream */

1961

1962

if (lexer->insert || lexer->inode)

1963

{

1964

if (lexer->pushed)

1965

{

1966

lexer->pushed = no;

1967

FreeNode( doc, lexer->token );

1968

}

1969

return lexer->token = InsertedToken( doc );

1970

}

1971

1972

if (mode == CdataContent)

1973

{

1974

assert( lexer->parent != NULL );

1975

if (lexer->pushed)

1976

{

1977

lexer->pushed = no;

1978

FreeNode( doc, lexer->token );

1979

}

1980

return lexer->token = GetCDATA(doc, lexer->parent);

1981

}

1982

1983

lexer->lines = doc->docIn->curline;

1984

lexer->columns = doc->docIn->curcol;

1985

lexer->waswhite = no;

1986

1987

lexer->txtstart = lexer->txtend = lexer->lexsize;

1988

1989

while ((c = ReadChar(doc->docIn)) != EndOfStream)

1990

{

1991

if (lexer->insertspace && !(mode & IgnoreWhitespace))

1992

{

1993

AddCharToLexer(lexer, ' ');

1994

lexer->waswhite = yes;

1995

lexer->insertspace = no;

1996

}

1997

1998

if (c == 160 && (mode & Preformatted))

1999

c = ' ';

2000

2001

AddCharToLexer(lexer, c);

2002

2003

switch (lexer->state)

2004

{

2005

case LEX_CONTENT: /* element content */

2006

2007

2008

Discard white space if appropriate. Its cheaper

2009

to do this here rather than in parser methods

2010

for elements that don't have mixed content.

2011

2012

if (IsWhite(c) && (mode == IgnoreWhitespace)

2013

&& lexer->lexsize == lexer->txtstart + 1)

2014

{

2015

--(lexer->lexsize);

2016

lexer->waswhite = no;

2017

lexer->lines = doc->docIn->curline;

2018

lexer->columns = doc->docIn->curcol;

2019

continue;

2020

}

2021

2022

if (c == '<')

2023

{

2024

lexer->state = LEX_GT;

2025

continue;

2026

}

2027

2028

if (IsWhite(c))

2029

{

2030

/* was previous character white? */

2031

if (lexer->waswhite)

2032

{

2033

if (mode != Preformatted && mode != IgnoreMarkup)

2034

{

2035

--(lexer->lexsize);

2036

lexer->lines = doc->docIn->curline;

2037

lexer->columns = doc->docIn->curcol;

2038

}

2039

}

2040

else /* prev character wasn't white */

2041

{

2042

lexer->waswhite = yes;

2043

2044

if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')

2045

ChangeChar(lexer, ' ');

2046

}

2047

2048

continue;

2049

}

2050

else if (c == '&' && mode != IgnoreMarkup)

2051

ParseEntity( doc, mode );

2052

2053

/* this is needed to avoid trimming trailing whitespace */

2054

if (mode == IgnoreWhitespace)

2055

mode = MixedContent;

2056

2057

lexer->waswhite = no;

2058

continue;

2059

2060

case LEX_GT: /* < */

2061

2062

/* check for endtag */

2063

if (c == '/')

2064

{

2065

if ((c = ReadChar(doc->docIn)) == EndOfStream)

2066

{

2067

UngetChar(c, doc->docIn);

2068

continue;

2069

}

2070

2071

AddCharToLexer(lexer, c);

2072

2073

if (IsLetter(c))

2074

{

2075

lexer->lexsize -= 3;

2076

lexer->txtend = lexer->lexsize;

2077

UngetChar(c, doc->docIn);

2078

lexer->state = LEX_ENDTAG;

2079

lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */

2080

doc->docIn->curcol -= 2;

2081

2082

/* if some text before the </ return it now */

2083

if (lexer->txtend > lexer->txtstart)

2084

{

2085

/* trim space character before end tag */

2086

if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')

2087

{

2088

lexer->lexsize -= 1;

2089

lexer->txtend = lexer->lexsize;

2090

}

2091

lexer->token = TextToken(lexer);

2092

#ifdef TIDY_STORE_ORIGINAL_TEXT

2093

StoreOriginalTextInToken(doc, lexer->token, 3);

2094

#endif

2095

return lexer->token;

2096

}

2097

2098

continue; /* no text so keep going */

2099

}

2100

2101

/* otherwise treat as CDATA */

2102

lexer->waswhite = no;

2103

lexer->state = LEX_CONTENT;

2104

continue;

2105

}

2106

2107

if (mode == IgnoreMarkup)

2108

{

2109

/* otherwise treat as CDATA */

2110

lexer->waswhite = no;

2111

lexer->state = LEX_CONTENT;

2112

continue;

2113

}

2114

2115

2116

look out for comments, doctype or marked sections

2117

this isn't quite right, but its getting there ...

2118

2119

if (c == '!')

2120

{

2121

c = ReadChar(doc->docIn);

2122

2123

if (c == '-')

2124

{

2125

c = ReadChar(doc->docIn);

2126

2127

if (c == '-')

2128

{

2129

lexer->state = LEX_COMMENT; /* comment */

2130

lexer->lexsize -= 2;

2131

lexer->txtend = lexer->lexsize;

2132

2133

CondReturnTextNode(doc, 4)

2134

2135

lexer->txtstart = lexer->lexsize;

2136

continue;

2137

}

2138

2139

ReportError(doc, NULL, NULL, MALFORMED_COMMENT );

2140

}

2141

else if (c == 'd' || c == 'D')

2142

{

2143

/* todo: check for complete "<!DOCTYPE" not just <!D */

2144

2145

uint skip = 0;

2146

2147

lexer->state = LEX_DOCTYPE; /* doctype */

2148

lexer->lexsize -= 2;

2149

lexer->txtend = lexer->lexsize;

2150

mode = IgnoreWhitespace;

2151

2152

/* skip until white space or '>' */

2153

2154

for (;;)

2155

{

2156

c = ReadChar(doc->docIn);

2157

++skip;

2158

2159

if (c == EndOfStream || c == '>')

2160

{

2161

UngetChar(c, doc->docIn);

2162

break;

2163

}

2164

2165

2166

if (!IsWhite(c))

2167

continue;

2168

2169

/* and skip to end of whitespace */

2170

2171

for (;;)

2172

{

2173

c = ReadChar(doc->docIn);

2174

++skip;

2175

2176

if (c == EndOfStream || c == '>')

2177

{

2178

UngetChar(c, doc->docIn);

2179

break;

2180

}

2181

2182

2183

if (IsWhite(c))

2184

continue;

2185

2186

UngetChar(c, doc->docIn);

2187

break;

2188

}

2189

2190

break;

2191

}

2192

2193

CondReturnTextNode(doc, (skip + 3))

2194

2195

lexer->txtstart = lexer->lexsize;

2196

continue;

2197

}

2198

else if (c == '[')

2199

{

2200

/* Word 2000 embeds <![if ...]> ... <![endif]> sequences */

2201

lexer->lexsize -= 2;

2202

lexer->state = LEX_SECTION;

2203

lexer->txtend = lexer->lexsize;

2204

2205

CondReturnTextNode(doc, 2)

2206

2207

lexer->txtstart = lexer->lexsize;

2208

continue;

2209

}

2210

2211

2212

2213

/* else swallow characters up to and including next '>' */

2214

while ((c = ReadChar(doc->docIn)) != '>')

2215

{

2216

if (c == EndOfStream)

2217

{

2218

UngetChar(c, doc->docIn);

2219

break;

2220

}

2221

}

2222

2223

lexer->lexsize -= 2;

2224

lexer->lexbuf[lexer->lexsize] = '\0';

2225

lexer->state = LEX_CONTENT;

2226

continue;

2227

}

2228

2229

2230

processing instructions

2231

2232

2233

if (c == '?')

2234

{

2235

lexer->lexsize -= 2;

2236

lexer->state = LEX_PROCINSTR;

2237

lexer->txtend = lexer->lexsize;

2238

2239

CondReturnTextNode(doc, 2)

2240

2241

lexer->txtstart = lexer->lexsize;

2242

continue;

2243

}

2244

2245

/* Microsoft ASP's e.g. <% ... server-code ... %> */

2246

if (c == '%')

2247

{

2248

lexer->lexsize -= 2;

2249

lexer->state = LEX_ASP;

2250

lexer->txtend = lexer->lexsize;

2251

2252

CondReturnTextNode(doc, 2)

2253

2254

lexer->txtstart = lexer->lexsize;

2255

continue;

2256

}

2257

2258

/* Netscapes JSTE e.g. <# ... server-code ... #> */

2259

if (c == '#')

2260

{

2261

lexer->lexsize -= 2;

2262

lexer->state = LEX_JSTE;

2263

lexer->txtend = lexer->lexsize;

2264

2265

CondReturnTextNode(doc, 2)

2266

2267

lexer->txtstart = lexer->lexsize;

2268

continue;

2269

}

2270

2271

/* check for start tag */

2272

if (IsLetter(c))

2273

{

2274

UngetChar(c, doc->docIn); /* push back letter */

2275

UngetChar('<', doc->docIn);

2276

--(doc->docIn->curcol);

2277

lexer->lexsize -= 2; /* discard "<" + letter */

2278

lexer->txtend = lexer->lexsize;

2279

lexer->state = LEX_STARTTAG; /* ready to read tag name */

2280

2281

CondReturnTextNode(doc, 2)

2282

2283

/* lexer->txtstart = lexer->lexsize; missing here? */

2284

continue; /* no text so keep going */

2285

}

2286

2287

/* fix for bug 762102 */

2288

if (c == '&')

2289

{

2290

UngetChar(c, doc->docIn);

2291

--(lexer->lexsize);

2292

}

2293

2294

/* otherwise treat as CDATA */

2295

lexer->state = LEX_CONTENT;

2296

lexer->waswhite = no;

2297

continue;

2298

2299

case LEX_ENDTAG: /* </letter */

2300

lexer->txtstart = lexer->lexsize - 1;

2301

doc->docIn->curcol += 2;

2302

c = ParseTagName( doc );

2303

lexer->token = TagToken( doc, EndTag ); /* create endtag token */

2304

lexer->lexsize = lexer->txtend = lexer->txtstart;

2305

2306

/* skip to '>' */

2307

while ( c != '>' && c != EndOfStream )

2308

{

2309

c = ReadChar(doc->docIn);

2310

}

2311

2312

if (c == EndOfStream)

2313

{

2314

FreeNode( doc, lexer->token );

2315

continue;

2316

}

2317

2318

lexer->state = LEX_CONTENT;

2319

lexer->waswhite = no;

2320

#ifdef TIDY_STORE_ORIGINAL_TEXT

2321

StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */

2322

#endif

2323

return lexer->token; /* the endtag token */

2324

2325

case LEX_STARTTAG: /* first letter of tagname */

2326

c = ReadChar(doc->docIn);

2327

ChangeChar(lexer, (tmbchar)c);

2328

lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */

2329

c = ParseTagName( doc );

2330

isempty = no;

2331

attributes = NULL;

2332

lexer->token = TagToken( doc, (isempty ? StartEndTag : StartTag) );

2333

2334

/* parse attributes, consuming closing ">" */

2335

if (c != '>')

2336

{

2337

if (c == '/')

2338

UngetChar(c, doc->docIn);

2339

2340

attributes = ParseAttrs( doc, &isempty );

2341

}

2342

2343

if (isempty)

2344

lexer->token->type = StartEndTag;

2345

2346

lexer->token->attributes = attributes;

2347

lexer->lexsize = lexer->txtend = lexer->txtstart;

2348

2349

/* swallow newline following start tag */

2350

/* special check needed for CRLF sequence */

2351

/* this doesn't apply to empty elements */

2352

/* nor to preformatted content that needs escaping */

2353

2354

if ((mode != Preformatted && ExpectsContent(lexer->token))

2355

|| nodeIsBR(lexer->token) || nodeIsHR(lexer->token))

2356

{

2357

c = ReadChar(doc->docIn);

2358

2359

if (c != '\n' && c != '\f')

2360

UngetChar(c, doc->docIn);

2361

2362

lexer->waswhite = yes; /* to swallow leading whitespace */

2363

}

2364

else

2365

lexer->waswhite = no;

2366

2367

lexer->state = LEX_CONTENT;

2368

if (lexer->token->tag == NULL)

2369

ReportFatal( doc, NULL, lexer->token, UNKNOWN_ELEMENT );

2370

else if ( !cfgBool(doc, TidyXmlTags) )

2371

{

2372

Node* curr = lexer->token;

2373

ConstrainVersion( doc, curr->tag->versions );

2374

2375

if ( curr->tag->versions & VERS_PROPRIETARY )

2376

{

2377

if ( !cfgBool(doc, TidyMakeClean) ||

2378

( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) )

2379

{

2380

ReportError(doc, NULL, curr, PROPRIETARY_ELEMENT );

2381

2382

if ( nodeIsLAYER(curr) )

2383

doc->badLayout |= USING_LAYER;

2384

else if ( nodeIsSPACER(curr) )

2385

doc->badLayout |= USING_SPACER;

2386

else if ( nodeIsNOBR(curr) )

2387

doc->badLayout |= USING_NOBR;

2388

}

2389

}

2390

2391

RepairDuplicateAttributes( doc, curr );

2392

}

2393

#ifdef TIDY_STORE_ORIGINAL_TEXT

2394

StoreOriginalTextInToken(doc, lexer->token, 0);

2395

#endif

2396

return lexer->token; /* return start tag */

2397

2398

case LEX_COMMENT: /* seen  */

2399

2400

if (c != '-')

2401

continue;

2402

2403

c = ReadChar(doc->docIn);

2404

AddCharToLexer(lexer, c);

2405

2406

if (c != '-')

2407

continue;

2408

2409

end_comment:

2410

c = ReadChar(doc->docIn);

2411

2412

if (c == '>')

2413

{

2414

if (badcomment)

2415

ReportError(doc, NULL, NULL, MALFORMED_COMMENT );

2416

2417

/* do not store closing -- in lexbuf */

2418

lexer->lexsize -= 2;

2419

lexer->txtend = lexer->lexsize;

2420

lexer->lexbuf[lexer->lexsize] = '\0';

2421

lexer->state = LEX_CONTENT;

2422

lexer->waswhite = no;

2423

lexer->token = CommentToken(doc);

2424

2425

/* now look for a line break */

2426

2427

c = ReadChar(doc->docIn);

2428

2429

if (c == '\n')

2430

lexer->token->linebreak = yes;

2431

else

2432

UngetChar(c, doc->docIn);

2433

2434

return lexer->token;

2435

}

2436

2437

/* note position of first such error in the comment */

2438

if (!badcomment)

2439

{

2440

lexer->lines = doc->docIn->curline;

2441

lexer->columns = doc->docIn->curcol - 3;

2442

}

2443

2444

badcomment++;

2445

2446

if ( cfgBool(doc, TidyFixComments) )

2447

lexer->lexbuf[lexer->lexsize - 2] = '=';

2448

2449

AddCharToLexer(lexer, c);

2450

2451

/* if '-' then look for '>' to end the comment */

2452

if (c == '-')

2453

goto end_comment;

2454

2455

/* otherwise continue to look for --> */

2456

lexer->lexbuf[lexer->lexsize - 2] = '=';

2457

continue;

2458

2459

case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */

2460

2461

/* use ParseDocTypeDecl() to tokenize doctype declaration */

2462

UngetChar(c, doc->docIn);

2463

lexer->lexsize -= 1;

2464

lexer->token = ParseDocTypeDecl(doc);

2465

2466

lexer->txtend = lexer->lexsize;

2467

lexer->lexbuf[lexer->lexsize] = '\0';

2468

lexer->state = LEX_CONTENT;

2469

lexer->waswhite = no;

2470

2471

/* make a note of the version named by the 1st doctype */

2472

if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))

2473

lexer->doctype = FindGivenVersion(doc, lexer->token);

2474

return lexer->token;

2475

2476

case LEX_PROCINSTR: /* seen <? so look for '>' */

2477

/* check for PHP preprocessor instructions <?php ... ?> */

2478

2479

if (lexer->lexsize - lexer->txtstart == 3)

2480

{

2481

if (tmbstrncmp(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)

2482

{

2483

lexer->state = LEX_PHP;

2484

continue;

2485

}

2486

}

2487

2488

if (lexer->lexsize - lexer->txtstart == 4)

2489

{

2490

if (tmbstrncmp(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&

2491

IsWhite(lexer->lexbuf[lexer->txtstart + 3]))

2492

{

2493

lexer->state = LEX_XMLDECL;

2494

attributes = NULL;

2495

continue;

2496

}

2497

}

2498

2499

if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */

2500

{

2501

if (c != '?')

2502

continue;

2503

2504

/* now look for '>' */

2505

c = ReadChar(doc->docIn);

2506

2507

if (c == EndOfStream)

2508

{

2509

ReportError(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );

2510

UngetChar(c, doc->docIn);

2511

continue;

2512

}

2513

2514

AddCharToLexer(lexer, c);

2515

}

2516

2517

2518

if (c != '>')

2519

continue;

2520

2521

lexer->lexsize -= 1;

2522

2523

if (lexer->lexsize)

2524

{

2525

uint i;

2526

Bool closed;

2527

2528

for (i = 0; i < lexer->lexsize - lexer->txtstart &&

2529

!IsWhite(lexer->lexbuf[i + lexer->txtstart]); ++i)

2530

/**/;

2531

2532

closed = lexer->lexbuf[lexer->lexsize - 1] == '?';

2533

2534

if (closed)

2535

lexer->lexsize -= 1;

2536

2537

lexer->txtstart += i;

2538

lexer->txtend = lexer->lexsize;

2539

lexer->lexbuf[lexer->lexsize] = '\0';

2540

2541

lexer->token = PIToken(doc);

2542

lexer->token->closed = closed;

2543

lexer->token->element = tmbstrndup(lexer->lexbuf +

2544

lexer->txtstart - i, i);

2545

}

2546

else

2547

{

2548

lexer->txtend = lexer->lexsize;

2549

lexer->lexbuf[lexer->lexsize] = '\0';

2550

lexer->token = PIToken(doc);

2551

}

2552

2553

lexer->state = LEX_CONTENT;

2554

lexer->waswhite = no;

2555

return lexer->token;

2556

2557

case LEX_ASP: /* seen <% so look for "%>" */

2558

if (c != '%')

2559

continue;

2560

2561

/* now look for '>' */

2562

c = ReadChar(doc->docIn);

2563

2564

2565

if (c != '>')

2566

{

2567

UngetChar(c, doc->docIn);

2568

continue;

2569

}

2570

2571

lexer->lexsize -= 1;

2572

lexer->txtend = lexer->lexsize;

2573

lexer->lexbuf[lexer->lexsize] = '\0';

2574

lexer->state = LEX_CONTENT;

2575

lexer->waswhite = no;

2576

return lexer->token = AspToken(doc);

2577

2578

case LEX_JSTE: /* seen <# so look for "#>" */

2579

if (c != '#')

2580

continue;

2581

2582

/* now look for '>' */

2583

c = ReadChar(doc->docIn);

2584

2585

2586

if (c != '>')

2587

{

2588

UngetChar(c, doc->docIn);

2589

continue;

2590

}

2591

2592

lexer->lexsize -= 1;

2593

lexer->txtend = lexer->lexsize;

2594

lexer->lexbuf[lexer->lexsize] = '\0';

2595

lexer->state = LEX_CONTENT;

2596

lexer->waswhite = no;

2597

return lexer->token = JsteToken(doc);

2598

2599

case LEX_PHP: /* seen "<?php" so look for "?>" */

2600

if (c != '?')

2601

continue;

2602

2603

/* now look for '>' */

2604

c = ReadChar(doc->docIn);

2605

2606

if (c != '>')

2607

{

2608

UngetChar(c, doc->docIn);

2609

continue;

2610

}

2611

2612

lexer->lexsize -= 1;

2613

lexer->txtend = lexer->lexsize;

2614

lexer->lexbuf[lexer->lexsize] = '\0';

2615

lexer->state = LEX_CONTENT;

2616

lexer->waswhite = no;

2617

return lexer->token = PhpToken(doc);

2618

2619

case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */

2620

2621

if (IsWhite(c) && c != '?')

2622

continue;

2623

2624

/* get pseudo-attribute */

2625

if (c != '?')

2626

{

2627

tmbstr name;

2628

Node *asp, *php;

2629

AttVal *av = NULL;

2630

int pdelim = 0;

2631

isempty = no;

2632

2633

UngetChar(c, doc->docIn);

2634

2635

name = ParseAttribute( doc, &isempty, &asp, &php );

2636

2637

if (!name)

2638

{

2639

/* fix for http://tidy.sf.net/bug/788031 */

2640

lexer->lexsize -= 1;

2641

lexer->txtend = lexer->txtstart;

2642

lexer->lexbuf[lexer->txtend] = '\0';

2643

lexer->state = LEX_CONTENT;

2644

lexer->waswhite = no;

2645

lexer->token = XmlDeclToken(doc);

2646

lexer->token->attributes = attributes;

2647

return lexer->token;

2648

}

2649

2650

av = NewAttribute();

2651

av->attribute = name;

2652

av->value = ParseValue( doc, name, yes, &isempty, &pdelim );

2653

av->delim = pdelim;

2654

av->dict = FindAttribute( doc, av );

2655

2656

AddAttrToList( &attributes, av );

2657

/* continue; */

2658

}

2659

2660

/* now look for '>' */

2661

c = ReadChar(doc->docIn);

2662

2663

if (c != '>')

2664

{

2665

UngetChar(c, doc->docIn);

2666

continue;

2667

}

2668

lexer->lexsize -= 1;

2669

lexer->txtend = lexer->txtstart;

2670

lexer->lexbuf[lexer->txtend] = '\0';

2671

lexer->state = LEX_CONTENT;

2672

lexer->waswhite = no;

2673

lexer->token = XmlDeclToken(doc);

2674

lexer->token->attributes = attributes;

2675

return lexer->token;

2676

2677

case LEX_SECTION: /* seen "<![" so look for "]>" */

2678

if (c == '[')

2679

{

2680

if (lexer->lexsize == (lexer->txtstart + 6) &&

2681

tmbstrncmp(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)

2682

{

2683

lexer->state = LEX_CDATA;

2684

lexer->lexsize -= 6;

2685

continue;

2686

}

2687

}

2688

2689

if (c != ']')

2690

continue;

2691

2692

/* now look for '>' */

2693

c = ReadChar(doc->docIn);

2694

2695

if (c != '>')

2696

{

2697

UngetChar(c, doc->docIn);

2698

continue;

2699

}

2700

2701

lexer->lexsize -= 1;

2702

lexer->txtend = lexer->lexsize;

2703

lexer->lexbuf[lexer->lexsize] = '\0';

2704

lexer->state = LEX_CONTENT;

2705

lexer->waswhite = no;

2706

return lexer->token = SectionToken(doc);

2707

2708

case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */

2709

if (c != ']')

2710

continue;

2711

2712

/* now look for ']' */

2713

c = ReadChar(doc->docIn);

2714

2715

if (c != ']')

2716

{

2717

UngetChar(c, doc->docIn);

2718

continue;

2719

}

2720

2721

/* now look for '>' */

2722

c = ReadChar(doc->docIn);

2723

2724

if (c != '>')

2725

{

2726

UngetChar(c, doc->docIn);

2727

continue;

2728

}

2729

2730

lexer->lexsize -= 1;

2731

lexer->txtend = lexer->lexsize;

2732

lexer->lexbuf[lexer->lexsize] = '\0';

2733

lexer->state = LEX_CONTENT;

2734

lexer->waswhite = no;

2735

return lexer->token = CDATAToken(doc);

2736

}

2737

}

2738

2739

if (lexer->state == LEX_CONTENT) /* text string */

2740

{

2741

lexer->txtend = lexer->lexsize;

2742

2743

if (lexer->txtend > lexer->txtstart)

2744

{

2745

UngetChar(c, doc->docIn);

2746

2747

if (lexer->lexbuf[lexer->lexsize - 1] == ' ')

2748

{

2749

lexer->lexsize -= 1;

2750

lexer->txtend = lexer->lexsize;

2751

}

2752

lexer->token = TextToken(lexer);

2753

#ifdef TIDY_STORE_ORIGINAL_TEXT

2754

StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */

2755

#endif

2756

return lexer->token;

2757

}

2758

}

2759

else if (lexer->state == LEX_COMMENT) /* comment */

2760

{

2761

if (c == EndOfStream)

2762

ReportError(doc, NULL, NULL, MALFORMED_COMMENT );

2763

2764

lexer->txtend = lexer->lexsize;

2765

lexer->lexbuf[lexer->lexsize] = '\0';

2766

lexer->state = LEX_CONTENT;

2767

lexer->waswhite = no;

2768

return lexer->token = CommentToken(doc);

2769

}

2770

2771

return 0;

2772

}

2773

2774

/*
 * OR the character-class bits given in `code` into the lexmap entry
 * of every character of the NUL-terminated string `str`.
 */
static void MapStr( ctmbstr str, uint code )
{
    ctmbstr p;

    for ( p = str; *p; ++p )
    {
        /* index through a byte cast, exactly as the table is keyed */
        uint slot = (byte) *p;
        lexmap[slot] |= code;
    }
}

2782

2783

void InitMap(void)

2784

{

2785

MapStr("\r\n\f", newline|white);

2786

MapStr(" \t", white);

2787

MapStr("-.:_", namechar);

2788

MapStr("0123456789", digit|namechar);

2789

MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);

2790

MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);

2791

}

2792

2793

2794

/*
  parser for ASP within start tags

  Some people use ASP to customize attributes.
  Tidy isn't really well suited to dealing with ASP.
  This is a workaround for attributes, but it won't
  deal with the case where the ASP is used to tailor
  the attribute value. Here is an example of a
  workaround for using ASP in attribute values:

   href='<%=rsSchool.Fields("ID").Value%>'

  where the ASP that generates the attribute value
  is masked from Tidy by the quotemarks.
*/

2807

2808

2809

2810

/*
 * Collect the body of an ASP block found inside a start tag.
 *
 * Called after "<%" has been read.  Characters are copied into the
 * lexer buffer until the closing "%>" or end of input is reached.
 * The terminator itself is not kept in the buffer.
 *
 * Returns the node built by AspToken() for the collected text, or
 * NULL when nothing was collected.  On return lexer->txtstart and
 * lexer->txtend both mark the end of the collected text.
 */
static Node *ParseAsp( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    uint c;
    Node *asp = NULL;

    lexer->txtstart = lexer->lexsize;

    for (;;)
    {
        if ((c = ReadChar(doc->docIn)) == EndOfStream)
            break;

        AddCharToLexer(lexer, c);

        if (c != '%')
            continue;

        /* saw '%': look ahead for the '>' that completes "%>" */
        if ((c = ReadChar(doc->docIn)) == EndOfStream)
            break;

        if (c == '>')
        {
            lexer->lexsize -= 1;  /* drop the '%' of the terminator */
            break;
        }

        /*
          Not the terminator.  Push the character back so that it is
          examined again from the top of the loop; otherwise a '%'
          directly followed by "%>" would hide the terminator.  This
          mirrors what the LEX_ASP state of GetToken() does.
        */
        UngetChar(c, doc->docIn);
    }

    lexer->txtend = lexer->lexsize;
    if (lexer->txtend > lexer->txtstart)
        asp = AspToken(doc);

    lexer->txtstart = lexer->txtend;
    return asp;
}

2848

2849

2850

2851

/*
 PHP is like ASP but is based upon XML
 processing instructions, e.g. <?php ... ?>
*/

2853

2854

/*
 * Collect the body of a PHP block found inside a start tag.
 *
 * Called after "<?" has been read.  Characters are copied into the
 * lexer buffer until the closing "?>" or end of input is reached.
 * The terminator itself is not kept in the buffer.
 *
 * Returns the node built by PhpToken() for the collected text, or
 * NULL when nothing was collected.  On return lexer->txtstart and
 * lexer->txtend both mark the end of the collected text.
 */
static Node *ParsePhp( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    uint c;
    Node *php = NULL;

    lexer->txtstart = lexer->lexsize;

    for (;;)
    {
        if ((c = ReadChar(doc->docIn)) == EndOfStream)
            break;

        AddCharToLexer(lexer, c);

        if (c != '?')
            continue;

        /* saw '?': look ahead for the '>' that completes "?>" */
        if ((c = ReadChar(doc->docIn)) == EndOfStream)
            break;

        if (c == '>')
        {
            lexer->lexsize -= 1;  /* drop the '?' of the terminator */
            break;
        }

        /*
          Not the terminator.  Push the character back so that it is
          examined again from the top of the loop; otherwise a '?'
          directly followed by "?>" would hide the terminator.  This
          mirrors what the LEX_PHP state of GetToken() does.
        */
        UngetChar(c, doc->docIn);
    }

    lexer->txtend = lexer->lexsize;
    if (lexer->txtend > lexer->txtstart)
        php = PhpToken(doc);

    lexer->txtstart = lexer->txtend;
    return php;
}

2892

2893

/* consumes the '>' terminating start tags */

2894

static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty,

2895

Node **asp, Node **php)

2896

{

2897

Lexer* lexer = doc->lexer;

2898

int start, len = 0;

2899

tmbstr attr = NULL;

2900

uint c, lastc;

2901

2902

*asp = NULL; /* clear asp pointer */

2903

*php = NULL; /* clear php pointer */

2904

2905

/* skip white space before the attribute */

2906

2907

for (;;)

2908

{

2909

c = ReadChar( doc->docIn );

2910

2911

2912

if (c == '/')

2913

{

2914

c = ReadChar( doc->docIn );

2915

2916

if (c == '>')

2917

{

2918

*isempty = yes;

2919

return NULL;

2920

}

2921

2922

UngetChar(c, doc->docIn);

2923

c = '/';

2924

break;

2925

}

2926

2927

if (c == '>')

2928

return NULL;

2929

2930

if (c =='<')

2931

{

2932

c = ReadChar(doc->docIn);

2933

2934

if (c == '%')

2935

{

2936

*asp = ParseAsp( doc );

2937

return NULL;

2938

}

2939

else if (c == '?')

2940

{

2941

*php = ParsePhp( doc );

2942

return NULL;

2943

}

2944

2945

UngetChar(c, doc->docIn);

2946

UngetChar('<', doc->docIn);

2947

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );

2948

return NULL;

2949

}

2950

2951

if (c == '=')

2952

{

2953

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );

2954

continue;

2955

}

2956

2957

if (c == '"' || c == '\'')

2958

{

2959

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );

2960

continue;

2961

}

2962

2963

if (c == EndOfStream)

2964

{

2965

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );

2966

UngetChar(c, doc->docIn);

2967

return NULL;

2968

}

2969

2970

2971

if (!IsWhite(c))

2972

break;

2973

}

2974

2975

start = lexer->lexsize;

2976

lastc = c;

2977

2978

for (;;)

2979

{

2980

/* but push back '=' for parseValue() */

2981

if (c == '=' || c == '>')

2982

{

2983

UngetChar(c, doc->docIn);

2984

break;

2985

}

2986

2987

if (c == '<' || c == EndOfStream)

2988

{

2989

UngetChar(c, doc->docIn);

2990

break;

2991

}

2992

2993

if (lastc == '-' && (c == '"' || c == '\''))

2994

{

2995

lexer->lexsize--;

2996

--len;

2997

UngetChar(c, doc->docIn);

2998

break;

2999

}

3000

3001

if (IsWhite(c))

3002

break;

3003

3004

/* what should be done about non-namechar characters? */

3005

/* currently these are incorporated into the attr name */

3006

3007

if ( !cfgBool(doc, TidyXmlTags) && IsUpper(c) )

3008

c = ToLower(c);

3009

3010

AddCharToLexer( lexer, c );

3011

lastc = c;

3012

c = ReadChar(doc->docIn);

3013

}

3014

3015

/* handle attribute names with multibyte chars */

3016

len = lexer->lexsize - start;

3017

attr = (len > 0 ? tmbstrndup(lexer->lexbuf+start, len) : NULL);

3018

lexer->lexsize = start;

3019

return attr;

3020

}

3021

3022

3023

/*
 invoked when < is seen in place of attribute value
 but terminates on whitespace if not ASP, PHP or Tango
 this routine recognizes ' and " quoted strings
*/

3026

3027

/*
 * Copy a server-side instruction used as an attribute value into the
 * lexer buffer.  The caller has already stored the leading '<'.
 *
 * If the character after '<' is '%', '?' or '@' (ASP, PHP or Tango)
 * the text is copied up to and including the closing '>'.
 * Otherwise copying also stops at white space, and a closing '>' is
 * pushed back instead of being copied.  Quoted strings inside the
 * instruction are copied whole, so a '>' or white space inside quotes
 * does not end the value.
 *
 * Returns the quote character the value should be written with:
 * '"' by default, '\'' once a double-quoted string was copied, or 0
 * when end of input or '>' is met inside a quoted string (an error
 * has been reported in that case).
 */
static int ParseServerInstruction( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    uint c;
    int delim = '"';
    Bool isrule = no;

    c = ReadChar(doc->docIn);
    AddCharToLexer(lexer, c);

    /* check for ASP, PHP or Tango */
    if (c == '%' || c == '?' || c == '@')
        isrule = yes;

    for (;;)
    {
        c = ReadChar(doc->docIn);

        if (c == EndOfStream)
            break;

        if (c == '>')
        {
            if (isrule)
                AddCharToLexer(lexer, c);
            else
                UngetChar(c, doc->docIn);

            break;
        }

        /* if not recognized as ASP, PHP or Tango */
        /* then also finish value on whitespace */
        if (!isrule)
        {
            if (IsWhite(c))
                break;
        }

        AddCharToLexer(lexer, c);

        if (c == '"')
        {
            /* copy through to the matching double quote */
            do
            {
                c = ReadChar(doc->docIn);
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
                    UngetChar(c, doc->docIn);
                    return 0;
                }
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    UngetChar(c, doc->docIn);
                    ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );
                    return 0;
                }
                AddCharToLexer(lexer, c);
            }
            while (c != '"');

            /* value contains double quotes, so delimit it with single ones */
            delim = '\'';
            continue;
        }

        if (c == '\'')
        {
            /* copy through to the matching single quote */
            do
            {
                c = ReadChar(doc->docIn);
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
                    UngetChar(c, doc->docIn);
                    return 0;
                }
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    UngetChar(c, doc->docIn);
                    ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );
                    return 0;
                }
                AddCharToLexer(lexer, c);
            }
            while (c != '\'');
        }
    }

    return delim;
}

3117

3118

/* values start with "=" or " = " etc. */

3119

/* doesn't consume the ">" at end of start tag */

3120

3121

static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,

3122

Bool foldCase, Bool *isempty, int *pdelim)

3123

{

3124

Lexer* lexer = doc->lexer;

3125

int len = 0, start;

3126

Bool seen_gt = no;

3127

Bool munge = yes;

3128

uint c, lastc, delim, quotewarning;

3129

tmbstr value;

3130

3131

delim = (tmbchar) 0;

3132

*pdelim = '"';

3133

3134

3135

Henry Zrepa reports that some folk are using the

3136

embed element with script attributes where newlines

3137

are significant and must be preserved

3138

3139

if ( cfgBool(doc, TidyLiteralAttribs) )

3140

munge = no;

3141

3142

/* skip white space before the '=' */

3143

3144

for (;;)

3145

{

3146

c = ReadChar(doc->docIn);

3147

3148

if (c == EndOfStream)

3149

{

3150

UngetChar(c, doc->docIn);

3151

break;

3152

}

3153

3154

if (!IsWhite(c))

3155

break;

3156

}

3157

3158

3159

c should be '=' if there is a value

3160

other legal possibilities are white

3161

space, '/' and '>'

3162

3163

3164

if (c != '=' && c != '"' && c != '\'')

3165

{

3166

UngetChar(c, doc->docIn);

3167

return NULL;

3168

}

3169

3170

/* skip white space after '=' */

3171

3172

for (;;)

3173

{

3174

c = ReadChar(doc->docIn);

3175

3176

if (c == EndOfStream)

3177

{

3178

UngetChar(c, doc->docIn);

3179

break;

3180

}

3181

3182

if (!IsWhite(c))

3183

break;

3184

}

3185

3186

/* check for quote marks */

3187

3188

if (c == '"' || c == '\'')

3189

delim = c;

3190

else if (c == '<')

3191

{

3192

start = lexer->lexsize;

3193

AddCharToLexer(lexer, c);

3194

*pdelim = ParseServerInstruction( doc );

3195

len = lexer->lexsize - start;

3196

lexer->lexsize = start;

3197

return (len > 0 ? tmbstrndup(lexer->lexbuf+start, len) : NULL);

3198

}

3199

else

3200

UngetChar(c, doc->docIn);

3201

3202

3203

and read the value string

3204

check for quote mark if needed

3205

3206

3207

quotewarning = 0;

3208

start = lexer->lexsize;

3209

c = '\0';

3210

3211

for (;;)

3212

{

3213

lastc = c; /* track last character */

3214

c = ReadChar(doc->docIn);

3215

3216

if (c == EndOfStream)

3217

{

3218

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );

3219

UngetChar(c, doc->docIn);

3220

break;

3221

}

3222

3223

if (delim == (tmbchar)0)

3224

{

3225

if (c == '>')

3226

{

3227

UngetChar(c, doc->docIn);

3228

break;

3229

}

3230

3231

if (c == '"' || c == '\'')

3232

{

3233

uint q = c;

3234

3235

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );

3236

3237

/* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */

3238

/* this doesn't handle <a title=foo"/> which browsers treat as */

3239

/* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */

3240

3241

c = ReadChar(doc->docIn);

3242

if (c == '>')

3243

{

3244

AddCharToLexer(lexer, q);

3245

UngetChar(c, doc->docIn);

3246

break;

3247

}

3248

else

3249

{

3250

UngetChar(c, doc->docIn);

3251

c = q;

3252

}

3253

}

3254

3255

if (c == '<')

3256

{

3257

UngetChar(c, doc->docIn);

3258

c = '>';

3259

UngetChar(c, doc->docIn);

3260

ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );

3261

break;

3262

}

3263

3264

3265

For cases like <br clear=all/> need to avoid treating /> as

3266

part of the attribute value, however care is needed to avoid

3267

so treating <a href=http://www.acme.com/> in this way, which

3268

would map the <a> tag to <a href="http://www.acme.com"/>

3269

3270

if (c == '/')

3271

{

3272

/* peek ahead in case of /> */

3273

c = ReadChar(doc->docIn);

3274

3275

if ( c == '>' && !IsUrl(doc, name) )

3276

{

3277

*isempty = yes;

3278

UngetChar(c, doc->docIn);

3279

break;

3280

}

3281

3282

/* unget peeked character */

3283

UngetChar(c, doc->docIn);

3284

c = '/';

3285

}

3286

}

3287

else /* delim is '\'' or '"' */

3288

{

3289

if (c == delim)

3290

break;

3291

3292

if (c == '\n' || c == '<' || c == '>')

3293

++quotewarning;

3294

3295

if (c == '>')

3296

seen_gt = yes;

3297

}

3298

3299

if (c == '&')

3300

{

3301

AddCharToLexer(lexer, c);

3302

ParseEntity( doc, 0 );

3303

if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)

3304

ChangeChar(lexer, ' ');

3305

continue;

3306

}

3307

3308

3309

kludge for JavaScript attribute values

3310

with line continuations in string literals

3311

3312

if (c == '\\')

3313

{

3314

c = ReadChar(doc->docIn);

3315

3316

if (c != '\n')

3317

{

3318

UngetChar(c, doc->docIn);

3319

c = '\\';

3320

}

3321

}

3322

3323

if (IsWhite(c))

3324

{

3325

if ( delim == 0 )

3326

break;

3327

3328

if (munge)

3329

{

3330

/* discard line breaks in quoted URLs */

3331

/* #438650 - fix by Randy Waki */

3332

if ( c == '\n' && IsUrl(doc, name) )

3333

{

3334

/* warn that we discard this newline */

3335

ReportAttrError( doc, lexer->token, NULL, NEWLINE_IN_URI);

3336

continue;

3337

}

3338

3339

c = ' ';

3340

3341

if (lastc == ' ')

3342

continue;

3343

}

3344

}

3345

else if (foldCase && IsUpper(c))

3346

c = ToLower(c);

3347

3348

AddCharToLexer(lexer, c);

3349

}

3350

3351

if (quotewarning > 10 && seen_gt && munge)

3352

{

3353

3354

there is almost certainly a missing trailing quote mark

3355

as we have see too many newlines, < or > characters.

3356

3357

an exception is made for Javascript attributes and the

3358

javascript URL scheme which may legitimately include < and >,

3359

and for attributes starting with "<xml " as generated by

3360

Microsoft Office.

3361

3362

if ( !IsScript(doc, name) &&

3363

!(IsUrl(doc, name) && tmbstrncmp(lexer->lexbuf+start, "javascript:", 11) == 0) &&

3364

!(tmbstrncmp(lexer->lexbuf+start, "<xml ", 5) == 0)

3365

)

3366

ReportFatal( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE );

3367

}

3368

3369

len = lexer->lexsize - start;

3370

lexer->lexsize = start;

3371

3372

3373

if (len > 0 || delim)

3374

{

3375

/* ignore leading and trailing white space for all but title, alt, value */

3376

/* and prompts attributes unless --literal-attributes is set to yes */

3377

/* #994841 - Whitespace is removed from value attributes */

3378

3379

if (munge &&

3380

tmbstrcasecmp(name, "alt") &&

3381

tmbstrcasecmp(name, "title") &&

3382

tmbstrcasecmp(name, "value") &&

3383

tmbstrcasecmp(name, "prompt"))

3384

{

3385

while (IsWhite(lexer->lexbuf[start+len-1]))

3386

--len;

3387

3388

while (IsWhite(lexer->lexbuf[start]) && start < len)

3389

{

3390

++start;

3391

--len;

3392

}

3393

}

3394

3395

value = tmbstrndup(lexer->lexbuf + start, len);

3396

}

3397

else

3398

value = NULL;

3399

3400

/* note delimiter if given */

3401

*pdelim = (delim ? delim : '"');

3402

3403

return value;

3404

}

3405

3406

/* attr must be non-NULL */

3407

Bool IsValidAttrName( ctmbstr attr )

3408

{

3409

uint i, c = attr[0];

3410

3411

/* first character should be a letter */

3412

if (!IsLetter(c))

3413

return no;

3414

3415

/* remaining characters should be namechars */

3416

for( i = 1; i < tmbstrlen(attr); i++)

3417

{

3418

c = attr[i];

3419

3420

if (IsNamechar(c))

3421

continue;

3422

3423

return no;

3424

}

3425

3426

return yes;

3427

}

3428

3429

/* create a new attribute */

3430

AttVal *NewAttribute(void)

3431

{

3432

AttVal *av = (AttVal*) MemAlloc( sizeof(AttVal) );

3433

ClearMemory( av, sizeof(AttVal) );

3434

return av;

3435

}

3436

3437

/* create a new attribute with given name and value */

3438

AttVal* NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value,

3439

int delim )

3440

{

3441

AttVal *av = NewAttribute();

3442

av->attribute = tmbstrdup(name);

3443

av->value = tmbstrdup(value);

3444

av->delim = delim;

3445

av->dict = FindAttribute( doc, av );

3446

return av;

3447

}

3448

3449

/*
 * Append `av` to the singly linked attribute list whose head pointer
 * is `*list`.  An empty list simply becomes `av`.
 */
static void AddAttrToList( AttVal** list, AttVal* av )
{
    AttVal** link = list;

    /* advance to the NULL link that ends the list */
    while ( *link != NULL )
        link = &(*link)->next;

    *link = av;
}

3461

3462

/* Append `av` after the last attribute of `node`. */
void InsertAttributeAtEnd( Node *node, AttVal *av )
{
    AddAttrToList( &node->attributes, av );
}

3466

3467

/*
 * Make `av` the first attribute of `node`.  The previous list is
 * linked behind it, overwriting whatever av->next held before.
 */
void InsertAttributeAtStart( Node *node, AttVal *av )
{
    AttVal *formerHead = node->attributes;

    av->next = formerHead;
    node->attributes = av;
}

3472

3473

/* swallows closing '>' */

3474

3475

static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )

3476

{

3477

Lexer* lexer = doc->lexer;

3478

AttVal *av, *list;

3479

tmbstr value;

3480

int delim;

3481

Node *asp, *php;

3482

3483

list = NULL;

3484

3485

while ( !EndOfInput(doc) )

3486

{

3487

tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );

3488

3489

if (attribute == NULL)

3490

{

3491

/* check if attributes are created by ASP markup */

3492

if (asp)

3493

{

3494

av = NewAttribute();

3495

av->asp = asp;

3496

AddAttrToList( &list, av );

3497

continue;

3498

}

3499

3500

/* check if attributes are created by PHP markup */

3501

if (php)

3502

{

3503

av = NewAttribute();

3504

av->php = php;

3505

AddAttrToList( &list, av );

3506

continue;

3507

}

3508

3509

break;

3510

}

3511

3512

value = ParseValue( doc, attribute, no, isempty, &delim );

3513

3514

if (attribute && (IsValidAttrName(attribute) ||

3515

(cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))

3516

{

3517

av = NewAttribute();

3518

av->delim = delim;

3519

av->attribute = attribute;

3520

av->value = value;

3521

av->dict = FindAttribute( doc, av );

3522

AddAttrToList( &list, av );

3523

}

3524

else

3525

{

3526

av = NewAttribute();

3527

av->attribute = attribute;

3528

av->value = value;

3529

3530

if (LastChar(attribute) == '"')

3531

ReportAttrError( doc, lexer->token, av, MISSING_QUOTEMARK);

3532

else if (value == NULL)

3533

ReportAttrError(doc, lexer->token, av, MISSING_ATTR_VALUE);

3534

else

3535

ReportAttrError(doc, lexer->token, av, INVALID_ATTRIBUTE);

3536

3537

FreeAttribute( doc, av );

3538

}

3539

}

3540

3541

return list;

3542

}

3543

3544

3545

/*
  Returns document type declarations like

  <!DOCTYPE foo PUBLIC "fpi" "sysid">
  <!DOCTYPE bar SYSTEM "sysid">
  <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>
*/

3550

3551

3552

3553

3554

3555

3556

3557

/*
 * Parse a document type declaration read from doc->docIn and build a
 * DocTypeTag node for it.
 *
 * The characters are consumed one at a time and driven through a small
 * state machine:
 *
 *   DT_DOCTYPENAME   - collecting the document type name
 *   DT_INTERMEDIATE  - between items, deciding what comes next
 *   DT_PUBLICSYSTEM  - collecting a keyword that began with P/p or S/s
 *   DT_QUOTEDSTRING  - collecting a quoted literal up to its delimiter
 *   DT_INTSUBSET     - collecting the internal subset up to ']'
 *
 * Outside the internal subset, newlines are turned into spaces and runs
 * of white-space are reduced to a single character before being stored
 * in the lexer buffer.
 *
 * Each quoted literal becomes a "PUBLIC" or "SYSTEM" attribute on the
 * node; each internal subset becomes a text token appended to the node.
 *
 * Returns the new node when '>' is reached and the name is present and
 * passes IsValidXMLElemName().  Otherwise (invalid/missing name, or the
 * stream ends before '>') MALFORMED_DOCTYPE is reported, the node is
 * freed and NULL is returned.
 */
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
{
    Lexer *lexer = doc->lexer;
    int mark = lexer->lexsize;      /* lexbuf offset where the current item starts */
    ParseDocTypeDeclState phase = DT_DOCTYPENAME;
    uint ch;
    uint quote = 0;                 /* delimiter of the literal being read, 0 if none */
    Bool expectfpi = yes;           /* next literal is stored as PUBLIC rather than SYSTEM */

    Node *doctype = NewNode(lexer);
    doctype->type = DocTypeTag;
    doctype->start = lexer->txtstart;
    doctype->end = lexer->txtend;

    lexer->waswhite = no;

    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */

    for (;;)
    {
        ch = ReadChar(doc->docIn);
        if (ch == EndOfStream)
            break;

        /* outside the internal subset a newline counts as a space */
        if (phase != DT_INTSUBSET && ch == '\n')
            ch = ' ';

        /* outside the internal subset keep only the first of a white-space run */
        if (IsWhite(ch) && phase != DT_INTSUBSET)
        {
            if (lexer->waswhite)
                continue;

            AddCharToLexer(lexer, ch);
            lexer->waswhite = yes;
        }
        else
        {
            AddCharToLexer(lexer, ch);
            lexer->waswhite = no;
        }

        /*
         * Nothing follows this switch inside the loop, so leaving a case
         * simply moves on to the next input character.
         */
        switch (phase)
        {
        case DT_DOCTYPENAME:
            /* the name runs until white-space, '>' or '[' */
            if (!IsWhite(ch) && ch != '>' && ch != '[')
                break;

            doctype->element = tmbstrndup(lexer->lexbuf + mark,
                                          lexer->lexsize - mark - 1);

            /* '>' and '[' are pushed back so DT_INTERMEDIATE handles them */
            if (ch == '>' || ch == '[')
            {
                lexer->lexsize--;
                UngetChar(ch, doc->docIn);
            }

            phase = DT_INTERMEDIATE;
            break;

        case DT_INTERMEDIATE:
            /* work out which kind of item starts here */
            if (ToUpper(ch) == 'P' || ToUpper(ch) == 'S')
            {
                /* presumably -1 so the keyword includes this first letter */
                mark = lexer->lexsize - 1;
                phase = DT_PUBLICSYSTEM;
            }
            else if (ch == '[')
            {
                mark = lexer->lexsize;
                phase = DT_INTSUBSET;
            }
            else if (ch == '\'' || ch == '"')
            {
                mark = lexer->lexsize;
                quote = ch;
                phase = DT_QUOTEDSTRING;
            }
            else if (ch == '>')
            {
                AttVal *sysid;

                /* step lexsize back by one and end the node there */
                lexer->lexsize--;
                doctype->end = lexer->lexsize;

                sysid = GetAttrByName(doctype, "SYSTEM");
                if (sysid)
                    CheckUrl(doc, doctype, sysid);

                if (!(doctype->element && IsValidXMLElemName(doctype->element)))
                    goto malformed;

#ifdef TIDY_STORE_ORIGINAL_TEXT
                StoreOriginalTextInToken(doc, doctype, 0);
#endif
                return doctype;
            }
            /* anything else is an error, currently ignored */
            break;

        case DT_PUBLICSYSTEM:
            /* the keyword runs until white-space or '>' */
            if (IsWhite(ch) || ch == '>')
            {
                char *keyword = tmbstrndup(lexer->lexbuf + mark,
                                           lexer->lexsize - mark - 1);

                /* only SYSTEM (any case) means no public identifier follows */
                expectfpi = tmbstrcasecmp(keyword, "SYSTEM") != 0;

                MemFree(keyword);

                /* todo: report an error if SYSTEM/PUBLIC not uppercase */

                if (ch == '>')
                {
                    lexer->lexsize--;
                    UngetChar(ch, doc->docIn);
                }

                phase = DT_INTERMEDIATE;
            }
            break;

        case DT_QUOTEDSTRING:
            /* the literal ends at the delimiter that opened it */
            if (ch == quote)
            {
                char *literal = tmbstrndup(lexer->lexbuf + mark,
                                           lexer->lexsize - mark - 1);
                AttVal *attr = AddAttribute(doc, doctype,
                                            expectfpi ? "PUBLIC" : "SYSTEM",
                                            literal);
                MemFree(literal);
                attr->delim = quote;

                /* any further literal is recorded as SYSTEM */
                expectfpi = no;
                quote = 0;
                phase = DT_INTERMEDIATE;
            }
            break;

        case DT_INTSUBSET:
            /* the internal subset ends at ']' */
            if (ch == ']')
            {
                lexer->txtstart = mark;
                lexer->txtend = lexer->lexsize - 1;
                InsertNodeAtEnd(doctype, TextToken(lexer));
                phase = DT_INTERMEDIATE;
            }
            break;
        }
    }

malformed:
    /* reached when the input ends before '>' or the name is missing/invalid */
    ReportError(doc, NULL, NULL, MALFORMED_DOCTYPE);
    FreeNode(doc, doctype);
    return NULL;
}