~ubuntu-branches/ubuntu/gutsy/tidy/gutsy : revision 2

1

/*

2

clean.c -- clean up misuse of presentation markup

3

4

5

See tidy.c for the copyright notice.

4

5

See tidy.h for the copyright notice.

6

7

CVS Info :

8

9

$Author: krusch $

10

$Date: 2002/01/27 19:04:57 $

11

$Revision: 1.15 $

9

$Author: arnaud02 $

10

$Date: 2005/03/23 12:52:09 $

11

$Revision: 1.95 $

12

13

Filters from other formats such as Microsoft Word

14

often make excessive use of presentation markup such

43

A naive approach is to rely on string matching to test

44

when two property lists are the same. A better approach

45

would be to first sort the properties before matching.

46

47

*/

47

48

49

#include <stdio.h>

49

50

#include <stdlib.h>

50

51

#include <string.h>

51

#include "platform.h"

52

#include "html.h"

53

54

Node *CleanNode(Lexer *lexer, Node *node);

52

53

#include "tidy-int.h"

54

#include "clean.h"

55

#include "lexer.h"

56

#include "parser.h"

57

#include "attrs.h"

58

#include "message.h"

59

#include "tmbstr.h"

60

#include "utf8.h"

61

62

/*
  Rename "node" so that it becomes the element identified by "tid".
  The previous element name is released, a fresh copy of the tag
  definition's name is stored in node->element, and node->tag is
  pointed at that definition.

  NOTE(review): assumes LookupTagDef() returns a non-NULL definition
  for every id passed in -- confirm against callers.
*/
void RenameElem( Node* node, TidyTagId tid )
{
    const Dict* dict = LookupTagDef( tid );
    MemFree( node->element );
    node->element = tmbstrdup( dict->name );
    node->tag = dict;
}

55

69

56

70

static void FreeStyleProps(StyleProp *props)

57

71

{

67

81

}

68

82

}

69

83

70

static StyleProp *InsertProperty(StyleProp *props, char *name, char *value)

84

static StyleProp *InsertProperty( StyleProp* props, ctmbstr name, ctmbstr value )

71

85

{

72

86

StyleProp *first, *prev, *prop;

73

87

int cmp;

74

88

75

prev = null;

89

prev = NULL;

76

90

first = props;

77

91

78

92

while (props)

79

93

{

80

cmp = wstrcmp(props->name, name);

94

cmp = tmbstrcmp(props->name, name);

81

95

82

96

if (cmp == 0)

83

97

{

90

104

/* insert before this */

91

105

92

106

prop = (StyleProp *)MemAlloc(sizeof(StyleProp));

93

prop->name = wstrdup(name);

94

prop->value = wstrdup(value);

107

prop->name = tmbstrdup(name);

108

prop->value = tmbstrdup(value);

95

109

prop->next = props;

96

110

97

111

if (prev)

107

121

}

108

122

109

123

prop = (StyleProp *)MemAlloc(sizeof(StyleProp));

110

prop->name = wstrdup(name);

111

prop->value = wstrdup(value);

112

prop->next = null;

124

prop->name = tmbstrdup(name);

125

prop->value = tmbstrdup(value);

126

prop->next = NULL;

113

127

114

128

if (prev)

115

129

prev->next = prop;

123

137

Create sorted linked list of properties from style string

124

138

It temporarily places nulls in place of ':' and ';' to

125

139

delimit the strings for the property name and value.

126

Some systems don't allow you to null literal strings,

140

Some systems don't allow you to write nulls into literal strings,

127

141

so to avoid this, a copy is made first.

128

142

*/

129

static StyleProp *CreateProps(StyleProp *prop, char *style)

143

static StyleProp* CreateProps( StyleProp* prop, ctmbstr style )

130

144

{

131

char *name, *value, *name_end, *value_end;

145

tmbstr name, value = NULL, name_end, value_end, line;

132

146

Bool more;

133

147

134

style = wstrdup(style);

135

name = style;

148

line = tmbstrdup(style);

149

name = line;

136

150

137

151

while (*name)

138

152

{

155

169

if (*name_end != ':')

156

170

break;

157

171

158

while (*value == ' ')

172

while ( value && *value == ' ')

159

173

++value;

160

174

161

175

value_end = value;

188

202

break;

189

203

}

190

204

191

MemFree(style); /* free temporary copy */

205

MemFree(line); /* free temporary copy */

192

206

return prop;

193

207

}

194

208

195

static char *CreatePropString(StyleProp *props)

209

static tmbstr CreatePropString(StyleProp *props)

196

210

{

197

char *style, *p, *s;

198

int len;

211

tmbstr style, p, s;

212

uint len;

199

213

StyleProp *prop;

200

214

201

215

/* compute length */

202

216

203

217

for (len = 0, prop = props; prop; prop = prop->next)

204

218

{

205

len += wstrlen(prop->name) + 2;

206

len += wstrlen(prop->value) + 2;

219

len += tmbstrlen(prop->name) + 2;

220

if (prop->value)

221

len += tmbstrlen(prop->value) + 2;

207

222

}

208

223

209

style = (char *)MemAlloc(len+1);

224

style = (tmbstr) MemAlloc(len+1);

225

style[0] = '\0';

210

226

211

227

for (p = style, prop = props; prop; prop = prop->next)

212

228

{

213

229

s = prop->name;

214

230

215

while((*p++ = *s++));

216

217

*--p = ':';

218

*++p = ' ';

219

++p;

220

221

s = prop->value;

222

while((*p++ = *s++));

223

224

if (prop->next == null)

231

while((*p++ = *s++))

232

continue;

233

234

if (prop->value)

235

{

236

*--p = ':';

237

*++p = ' ';

238

++p;

239

240

s = prop->value;

241

while((*p++ = *s++))

242

continue;

243

}

244

if (prop->next == NULL)

225

245

break;

226

246

227

247

*--p = ';';

234

254

235

255

/*

236

256

create string with merged properties

237

*/

238

static char *AddProperty(char *style, char *property)

257

static tmbstr AddProperty( ctmbstr style, ctmbstr property )

239

258

{

259

tmbstr line;

240

260

StyleProp *prop;

241

261

242

prop = CreateProps(null, style);

262

prop = CreateProps(NULL, style);

243

263

prop = CreateProps(prop, property);

244

style = CreatePropString(prop);

264

line = CreatePropString(prop);

245

265

FreeStyleProps(prop);

246

return style;

266

return line;

247

267

}

268

*/

248

269

249

void FreeStyles(Lexer *lexer)

270

void FreeStyles( TidyDocImpl* doc )

250

271

{

251

Style *style, *next;

252

253

for (style = lexer->styles; style; style = next)

272

Lexer* lexer = doc->lexer;

273

if ( lexer )

254

274

{

255

next = style->next;

256

257

MemFree(style->tag);

258

MemFree(style->tag_class);

259

MemFree(style->properties);

260

MemFree(style);

275

TagStyle *style, *next;

276

for ( style = lexer->styles; style; style = next )

277

{

278

next = style->next;

279

MemFree( style->tag );

280

MemFree( style->tag_class );

281

MemFree( style->properties );

282

MemFree( style );

283

}

261

284

}

262

285

}

263

286

264

static char *GensymClass(char *tag)

287

static tmbstr GensymClass( TidyDocImpl* doc )

265

288

{

266

static int n = 1;

267

char buf[512]; /* CSSPrefix is limited to 256 characters */

289

tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */

290

ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);

291

if ( pfx == NULL || *pfx == 0 )

292

pfx = "c";

268

293

269

sprintf(buf, "%s%d", (CSSPrefix ? CSSPrefix : "c"), n++); /* #508936 - CSS class naming for -clean option */

270

return wstrdup(buf);

294

tmbsnprintf(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );

295

return tmbstrdup(buf);

271

296

}

272

297

273

static char *FindStyle(Lexer *lexer, char *tag, char *properties)

298

static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )

274

299

{

275

Style *style;

300

Lexer* lexer = doc->lexer;

301

TagStyle* style;

276

302

277

303

for (style = lexer->styles; style; style=style->next)

278

304

{

279

if (wstrcmp(style->tag, tag) == 0 &&

280

wstrcmp(style->properties, properties) == 0)

305

if (tmbstrcmp(style->tag, tag) == 0 &&

306

tmbstrcmp(style->properties, properties) == 0)

281

307

return style->tag_class;

282

308

}

283

309

284

style = (Style *)MemAlloc(sizeof(Style));

285

style->tag = wstrdup(tag);

286

style->tag_class = GensymClass(tag);

287

style->properties = wstrdup(properties);

310

style = (TagStyle *)MemAlloc( sizeof(TagStyle) );

311

style->tag = tmbstrdup(tag);

312

style->tag_class = GensymClass( doc );

313

style->properties = tmbstrdup( properties );

288

314

style->next = lexer->styles;

289

315

lexer->styles = style;

290

316

return style->tag_class;

293

319

/*

294

320

Add class="foo" to node

295

321

*/

296

void AddClass(Node *node, char *classname)

322

void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )

297

323

{

298

AttVal *classattr = GetAttrByName(node, "class");

324

AttVal *classattr = AttrGetById(node, TidyAttr_CLASS);;

299

325

300

326

/*

301

327

if there already is a class attribute

302

then append class name after a space

328

then append class name after a space.

303

329

*/

304

330

if (classattr)

305

331

{

306

int len = wstrlen(classattr->value) +

307

wstrlen(classname) + 2;

308

char *s = (char *)MemAlloc(len *sizeof(char)); /* #427668 - was malloc() - fix by Arnaud BERCEGEAY 05 Aug 00 */

309

wstrcpy(s, classattr->value);

310

wstrcat(s, " ");

311

wstrcat(s, classname);

312

MemFree(classattr->value);

332

uint len = tmbstrlen(classattr->value) +

333

tmbstrlen(classname) + 2;

334

tmbstr s = (tmbstr) MemAlloc( len );

335

tmbstrcpy( s, classattr->value );

336

tmbstrcat( s, " " );

337

tmbstrcat( s, classname );

338

MemFree( classattr->value );

313

339

classattr->value = s;

314

340

}

315

341

else /* create new class attribute */

316

AddAttribute(node, "class", classname);

342

AddAttribute( doc, node, "class", classname );

317

343

}

318

344

319

345

325

351

326

352

Assumes that node doesn't have a class attribute

327

353

*/

328

static void Style2Rule(Lexer *lexer, Node *node)

354

static void Style2Rule( TidyDocImpl* doc, Node *node)

329

355

{

330

356

AttVal *styleattr, *classattr;

331

char *classname;

357

ctmbstr classname;

332

358

333

styleattr = GetAttrByName(node, "style");

359

styleattr = AttrGetById(node, TidyAttr_STYLE);

334

360

335

361

if (styleattr)

336

362

{

337

classname = FindStyle(lexer, node->element, styleattr->value);

338

classattr = GetAttrByName(node, "class");

363

/* fix for http://tidy.sf.net/bug/850215 */

364

if (!styleattr->value)

365

{

366

RemoveAttribute(doc, node, styleattr);

367

return;

368

}

369

370

classname = FindStyle( doc, node->element, styleattr->value );

371

classattr = AttrGetById(node, TidyAttr_CLASS);

339

372

340

373

/*

341

374

if there already is a class attribute

342

then append class name after a space

375

then append class name after an underscore

343

376

*/

344

377

if (classattr)

345

378

{

346

int len = wstrlen(classattr->value) +

347

wstrlen(classname) + 2;

348

char *s = (char *)MemAlloc(len *sizeof(char)); /* #427668 - was malloc() - fix by Arnaud BERCEGEAY 05 Aug 00 */

349

wstrcpy(s, classattr->value);

350

wstrcat(s, " ");

351

wstrcat(s, classname);

352

MemFree(classattr->value);

379

uint len = tmbstrlen(classattr->value) +

380

tmbstrlen(classname) + 2;

381

tmbstr s = (tmbstr) MemAlloc( len );

382

s[0] = '\0';

383

if (classattr->value)

384

{

385

tmbstrcpy(s, classattr->value);

386

tmbstrcat(s, " ");

387

}

388

tmbstrcat(s, classname);

389

if (classattr->value)

390

MemFree(classattr->value);

353

391

classattr->value = s;

354

RemoveAttribute(node, styleattr);

392

RemoveAttribute( doc, node, styleattr );

355

393

}

356

394

else /* reuse style attribute for class attribute */

357

395

{

358

396

MemFree(styleattr->attribute);

359

397

MemFree(styleattr->value);

360

styleattr->attribute = wstrdup("class");

361

styleattr->value = wstrdup(classname);

398

styleattr->attribute = tmbstrdup("class");

399

styleattr->value = tmbstrdup(classname);

362

400

}

363

401

}

364

402

}

365

403

366

static void AddColorRule(Lexer *lexer, char *selector, char *color)

404

static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )

367

405

{

368

if (color)

406

if ( selector && color )

369

407

{

370

408

AddStringLiteral(lexer, selector);

371

409

AddStringLiteral(lexer, " { color: ");

384

422

vlink="foo" -> :visited { color: foo }

385

423

alink="foo" -> :active { color: foo }

386

424

*/

387

static void CleanBodyAttrs(Lexer *lexer, Node *body)

425

static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )

388

426

{

389

AttVal *attr;

390

char *bgurl = null;

391

char *bgcolor = null;

392

char *color = null;

427

Lexer* lexer = doc->lexer;

428

tmbstr bgurl = NULL;

429

tmbstr bgcolor = NULL;

430

tmbstr color = NULL;

431

AttVal* attr;

393

432

394

attr = GetAttrByName(body, "background");

395

396

if (attr)

433

if (NULL != (attr = AttrGetById(body, TidyAttr_BACKGROUND)))

397

434

{

398

435

bgurl = attr->value;

399

attr->value = null;

400

RemoveAttribute(body, attr);

436

attr->value = NULL;

437

RemoveAttribute( doc, body, attr );

401

438

}

402

439

403

attr = GetAttrByName(body, "bgcolor");

404

405

if (attr)

440

if (NULL != (attr = AttrGetById(body, TidyAttr_BGCOLOR)))

406

441

{

407

442

bgcolor = attr->value;

408

attr->value = null;

409

RemoveAttribute(body, attr);

443

attr->value = NULL;

444

RemoveAttribute( doc, body, attr );

410

445

}

411

446

412

attr = GetAttrByName(body, "text");

413

414

if (attr)

447

if (NULL != (attr = AttrGetById(body, TidyAttr_TEXT)))

415

448

{

416

449

color = attr->value;

417

attr->value = null;

418

RemoveAttribute(body, attr);

450

attr->value = NULL;

451

RemoveAttribute( doc, body, attr );

419

452

}

420

453

421

if (bgurl || bgcolor || color)

454

if ( bgurl || bgcolor || color )

422

455

{

423

456

AddStringLiteral(lexer, " body {\n");

424

425

457

if (bgurl)

426

458

{

427

459

AddStringLiteral(lexer, " background-image: url(");

429

461

AddStringLiteral(lexer, ");\n");

430

462

MemFree(bgurl);

431

463

}

432

433

464

if (bgcolor)

434

465

{

435

466

AddStringLiteral(lexer, " background-color: ");

437

468

AddStringLiteral(lexer, ";\n");

438

469

MemFree(bgcolor);

439

470

}

440

441

471

if (color)

442

472

{

443

473

AddStringLiteral(lexer, " color: ");

449

479

AddStringLiteral(lexer, " }\n");

450

480

}

451

481

452

attr = GetAttrByName(body, "link");

453

454

if (attr)

482

if (NULL != (attr = AttrGetById(body, TidyAttr_LINK)))

455

483

{

456

484

AddColorRule(lexer, " :link", attr->value);

457

RemoveAttribute(body, attr);

485

RemoveAttribute( doc, body, attr );

458

486

}

459

487

460

attr = GetAttrByName(body, "vlink");

461

462

if (attr)

488

if (NULL != (attr = AttrGetById(body, TidyAttr_VLINK)))

463

489

{

464

490

AddColorRule(lexer, " :visited", attr->value);

465

RemoveAttribute(body, attr);

491

RemoveAttribute( doc, body, attr );

466

492

}

467

493

468

attr = GetAttrByName(body, "alink");

469

470

if (attr)

494

if (NULL != (attr = AttrGetById(body, TidyAttr_ALINK)))

471

495

{

472

496

AddColorRule(lexer, " :active", attr->value);

473

RemoveAttribute(body, attr);

497

RemoveAttribute( doc, body, attr );

474

498

}

475

499

}

476

500

477

static Bool NiceBody(Lexer *lexer, Node *doc)

501

static Bool NiceBody( TidyDocImpl* doc )

478

502

{

479

Node *body = FindBody(doc);

480

481

if (body)

503

Node* node = FindBody(doc);

504

if (node)

482

505

{

483

if (

484

GetAttrByName(body, "background") ||

485

GetAttrByName(body, "bgcolor") ||

486

GetAttrByName(body, "text") ||

487

GetAttrByName(body, "link") ||

488

GetAttrByName(body, "vlink") ||

489

GetAttrByName(body, "alink")

490

)

506

if (AttrGetById(node, TidyAttr_BACKGROUND) ||

507

AttrGetById(node, TidyAttr_BGCOLOR) ||

508

AttrGetById(node, TidyAttr_TEXT) ||

509

AttrGetById(node, TidyAttr_LINK) ||

510

AttrGetById(node, TidyAttr_VLINK) ||

511

AttrGetById(node, TidyAttr_ALINK))

491

512

{

492

lexer->badLayout |= USING_BODY;

513

doc->badLayout |= USING_BODY;

493

514

return no;

494

515

}

495

516

}

498

519

}

499

520

500

521

/* create style element using rules from dictionary */

501

static void CreateStyleElement(Lexer *lexer, Node *doc)

522

static void CreateStyleElement( TidyDocImpl* doc )

502

523

{

524

Lexer* lexer = doc->lexer;

503

525

Node *node, *head, *body;

504

Style *style;

526

TagStyle *style;

505

527

AttVal *av;

506

528

507

if (lexer->styles == null && NiceBody(lexer, doc))

529

if ( lexer->styles == NULL && NiceBody(doc) )

508

530

return;

509

531

510

node = NewNode();

532

node = NewNode( lexer );

511

533

node->type = StartTag;

512

534

node->implicit = yes;

513

node->element = wstrdup("style");

514

FindTag(node);

535

node->element = tmbstrdup("style");

536

FindTag( doc, node );

515

537

516

538

/* insert type attribute */

517

av = NewAttribute();

518

av->attribute = wstrdup("type");

519

av->value = wstrdup("text/css");

520

av->delim = '"';

521

av->dict = FindAttribute(av);

522

node->attributes = av;

523

524

body = FindBody(doc);

525

539

av = NewAttributeEx( doc, "type", "text/css", '"' );

540

InsertAttributeAtStart( node, av );

541

542

body = FindBody( doc );

526

543

lexer->txtstart = lexer->lexsize;

527

528

if (body)

529

CleanBodyAttrs(lexer, body);

544

if ( body )

545

CleanBodyAttrs( doc, body );

530

546

531

547

for (style = lexer->styles; style; style = style->next)

532

548

{

543

559

544

560

lexer->txtend = lexer->lexsize;

545

561

546

InsertNodeAtEnd(node, TextToken(lexer));

562

InsertNodeAtEnd( node, TextToken(lexer) );

547

563

548

564

/*

549

565

now insert style element into document head

551

567

doc is root node. search its children for html node

552

568

the head node should be first child of html node

553

569

*/

554

555

head = FindHEAD(doc);

556

557

if (head)

558

InsertNodeAtEnd(head, node);

570

if ( NULL != (head = FindHEAD( doc )) )

571

InsertNodeAtEnd( head, node );

559

572

}

560

573

561

574

562

575

/* ensure bidirectional links are consistent */

563

static void FixNodeLinks(Node *node)

576

void FixNodeLinks(Node *node)

564

577

{

565

578

Node *child;

566

579

582

595

used to strip child of node when

583

596

the node has one and only one child

584

597

*/

585

static void StripOnlyChild(Node *node)

598

static void StripOnlyChild(TidyDocImpl* doc, Node *node)

586

599

{

587

600

Node *child;

588

601

589

602

child = node->content;

590

603

node->content = child->content;

591

604

node->last = child->last;

592

child->content = null;

593

FreeNode(child);

605

child->content = NULL;

606

FreeNode(doc, child);

594

607

595

608

for (child = node->content; child; child = child->next)

596

609

child->parent = node;

597

610

}

598

611

599

/* used to strip font start and end tags */

600

static void DiscardContainer(Node *element, Node **pnode)

612

/*

613

used to strip font start and end tags.

614

Extricate "element", replace it by its content and delete it.

615

*/

616

static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)

601

617

{

602

Node *node, *parent = element->parent;

603

604

618

if (element->content)

605

619

{

620

Node *node, *parent = element->parent;

621

606

622

element->last->next = element->next;

607

623

608

624

if (element->next)

609

625

{

610

626

element->next->prev = element->last;

611

element->last->next = element->next;

612

627

}

613

628

else

614

629

parent->last = element->last;

625

640

node->parent = parent;

626

641

627

642

*pnode = element->content;

643

644

element->next = element->content = NULL;

645

FreeNode(doc, element);

628

646

}

629

647

else

630

648

{

631

if (element->next)

632

element->next->prev = element->prev;

633

else

634

parent->last = element->prev;

635

636

if (element->prev)

637

element->prev->next = element->next;

638

else

639

parent->content = element->next;

640

641

*pnode = element->next;

642

}

643

644

element->next = element->content = null;

645

FreeNode(element);

646

}

647

648

/*

649

Add style property to element, creating style

650

attribute as needed and adding ; delimiter

651

*/

652

static void AddStyleProperty(Node *node, char *property)

653

{

654

AttVal *av;

655

656

for (av = node->attributes; av; av = av->next)

657

{

658

if (wstrcmp(av->attribute, "style") == 0)

659

break;

660

}

661

662

/* if style attribute already exists then insert property */

663

664

if (av)

665

{

666

char *s;

667

668

s = AddProperty(av->value, property);

669

MemFree(av->value);

670

av->value = s;

671

}

672

else /* else create new style attribute */

673

{

674

av = NewAttribute();

675

av->attribute = wstrdup("style");

676

av->value = wstrdup(property);

677

av->delim = '"';

678

av->dict = FindAttribute(av);

679

av->next = node->attributes;

680

node->attributes = av;

649

*pnode = DiscardElement(doc, element);

681

650

}

682

651

}

683

652

690

659

into the list in order, merging values for

691

660

the same property name.

692

661

*/

693

static char *MergeProperties(char *s1, char *s2)

662

static tmbstr MergeProperties( ctmbstr s1, ctmbstr s2 )

694

663

{

695

char *s;

664

tmbstr s;

696

665

StyleProp *prop;

697

666

698

prop = CreateProps(null, s1);

667

prop = CreateProps(NULL, s1);

699

668

prop = CreateProps(prop, s2);

700

669

s = CreatePropString(prop);

701

670

FreeStyleProps(prop);

702

671

return s;

703

672

}

704

673

705

static void MergeClasses(Node *node, Node *child)

674

/*

675

Add style property to element, creating style

676

attribute as needed and adding ; delimiter

677

*/

678

static void AddStyleProperty(TidyDocImpl* doc, Node *node, ctmbstr property )

679

{

680

AttVal *av = AttrGetById(node, TidyAttr_STYLE);

681

682

/* if style attribute already exists then insert property */

683

684

if ( av )

685

{

686

if (av->value != NULL)

687

{

688

tmbstr s = MergeProperties( av->value, property );

689

MemFree( av->value );

690

av->value = s;

691

}

692

else

693

{

694

av->value = tmbstrdup( property );

695

}

696

}

697

else /* else create new style attribute */

698

{

699

av = NewAttributeEx( doc, "style", property, '"' );

700

InsertAttributeAtStart( node, av );

701

}

702

}

703

704

static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)

706

705

{

707

706

AttVal *av;

708

char *s1, *s2, *names;

707

tmbstr s1, s2, names;

709

708

710

for (s2 = null, av = child->attributes; av; av = av->next)

709

for (s2 = NULL, av = child->attributes; av; av = av->next)

711

710

{

712

if (wstrcmp(av->attribute, "class") == 0)

711

if (attrIsCLASS(av))

713

712

{

714

713

s2 = av->value;

715

714

break;

716

715

}

717

716

}

718

717

719

for (s1 = null, av = node->attributes; av; av = av->next)

718

for (s1 = NULL, av = node->attributes; av; av = av->next)

720

719

{

721

if (wstrcmp(av->attribute, "class") == 0)

720

if (attrIsCLASS(av))

722

721

{

723

722

s1 = av->value;

724

723

break;

729

728

{

730

729

if (s2) /* merge class names from both */

731

730

{

732

int l1, l2;

733

l1 = wstrlen(s1);

734

l2 = wstrlen(s2);

735

names = (char *)MemAlloc(l1 + l2 + 2);

736

wstrcpy(names, s1);

731

uint l1, l2;

732

l1 = tmbstrlen(s1);

733

l2 = tmbstrlen(s2);

734

names = (tmbstr) MemAlloc(l1 + l2 + 2);

735

tmbstrcpy(names, s1);

737

736

names[l1] = ' ';

738

wstrcpy(names+l1+1, s2);

737

tmbstrcpy(names+l1+1, s2);

739

738

MemFree(av->value);

740

739

av->value = names;

741

740

}

742

741

}

743

742

else if (s2) /* copy class names from child */

744

743

{

745

av = NewAttribute();

746

av->attribute = wstrdup("class");

747

av->value = wstrdup(s2);

748

av->delim = '"';

749

av->dict = FindAttribute(av);

750

av->next = node->attributes;

751

node->attributes = av;

744

av = NewAttributeEx( doc, "class", s2, '"' );

745

InsertAttributeAtStart( node, av );

752

746

}

753

747

}

754

748

755

static void MergeStyles(Node *node, Node *child)

749

static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)

756

750

{

757

751

AttVal *av;

758

char *s1, *s2, *style;

752

tmbstr s1, s2, style;

759

753

760

754

/*

761

755

the child may have a class attribute used

762

756

for attaching styles, if so the class name

763

757

needs to be copied to node's class

764

758

*/

765

MergeClasses(node, child);

759

MergeClasses(doc, node, child);

766

760

767

for (s2 = null, av = child->attributes; av; av = av->next)

761

for (s2 = NULL, av = child->attributes; av; av = av->next)

768

762

{

769

if (wstrcmp(av->attribute, "style") == 0)

763

if (attrIsSTYLE(av))

770

764

{

771

765

s2 = av->value;

772

766

break;

773

767

}

774

768

}

775

769

776

for (s1 = null, av = node->attributes; av; av = av->next)

770

for (s1 = NULL, av = node->attributes; av; av = av->next)

777

771

{

778

if (wstrcmp(av->attribute, "style") == 0)

772

if (attrIsSTYLE(av))

779

773

{

780

774

s1 = av->value;

781

775

break;

793

787

}

794

788

else if (s2) /* copy style of child */

795

789

{

796

av = NewAttribute();

797

av->attribute = wstrdup("style");

798

av->value = wstrdup(s2);

799

av->delim = '"';

800

av->dict = FindAttribute(av);

801

av->next = node->attributes;

802

node->attributes = av;

790

av = NewAttributeEx( doc, "style", s2, '"' );

791

InsertAttributeAtStart( node, av );

803

792

}

804

793

}

805

794

806

static char *FontSize2Name(char *size)

795

static ctmbstr FontSize2Name(ctmbstr size, tmbstr buf, size_t count)

807

796

{

808

#if 0

809

static char *sizes[7] =

810

{

811

"50%", "60%", "80%", null,

812

"120%", "150%", "200%"

813

};

814

#else

815

static char *sizes[7] =

816

{

817

"60%", "70%", "80%", null,

818

"120%", "150%", "200%"

819

};

820

#endif

821

static char buf[16];

797

static const ctmbstr sizes[7] =

798

{

799

"60%", "70%", "80%", NULL,

800

"120%", "150%", "200%"

801

};

802

803

if (size[0] == '\0')

804

return NULL;

822

805

823

806

if ('0' <= size[0] && size[0] <= '6')

824

807

{

837

820

x *= 0.8;

838

821

839

822

x *= 100;

840

sprintf(buf, "%d%%", (int)(x));

841

823

tmbsnprintf(buf, count, "%d%%", (int)(x));

842

824

return buf;

843

825

}

844

845

826

return "smaller"; /*"70%"; */

846

827

}

847

828

854

835

x *= 1.2;

855

836

856

837

x *= 100;

857

sprintf(buf, "%d%%", (int)(x));

858

838

/* Add 0.001 to avoid roundoff error - see #1004512 */

839

tmbsnprintf(buf, count, "%d%%", (int)(x+0.001));

859

840

return buf;

860

841

}

861

842

862

843

return "larger"; /* "140%" */

863

844

}

864

845

865

static void AddFontFace(Node *node, char *face)

846

static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )

866

847

{

867

char buf[1024];

868

869

sprintf(buf, "font-family: %s", face);

870

AddStyleProperty(node, buf);

848

tmbchar buf[256];

849

tmbsnprintf(buf, sizeof(buf), "font-family: %s", face );

850

AddStyleProperty( doc, node, buf );

871

851

}

872

852

873

static void AddFontSize(Node *node, char *size)

853

static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )

874

854

{

875

char *value, buf[1024];

876

877

if (wstrcmp(size, "6") == 0 && node->tag == tag_p)

878

{

879

MemFree(node->element);

880

node->element = wstrdup("h1");

881

FindTag(node);

882

return;

883

}

884

885

if (wstrcmp(size, "5") == 0 && node->tag == tag_p)

886

{

887

MemFree(node->element);

888

node->element = wstrdup("h2");

889

FindTag(node);

890

return;

891

}

892

893

if (wstrcmp(size, "4") == 0 && node->tag == tag_p)

894

{

895

MemFree(node->element);

896

node->element = wstrdup("h3");

897

FindTag(node);

898

return;

899

}

900

901

value = FontSize2Name(size);

855

tmbchar work[ 32 ] = {0};

856

ctmbstr value = NULL;

857

858

if (nodeIsP(node))

859

{

860

if (tmbstrcmp(size, "6") == 0)

861

value = "h1";

862

else if (tmbstrcmp(size, "5") == 0)

863

value = "h2";

864

else if (tmbstrcmp(size, "4") == 0)

865

value = "h3";

866

867

if (value)

868

{

869

MemFree(node->element);

870

node->element = tmbstrdup(value);

871

FindTag(doc, node);

872

return;

873

}

874

}

875

876

value = FontSize2Name(size, work, sizeof(work) - 1);

902

877

903

878

if (value)

904

879

{

905

sprintf(buf, "font-size: %s", value);

906

AddStyleProperty(node, buf);

907

}

908

}

909

910

static void AddFontColor(Node *node, char *color)

911

{

912

char buf[1024];

913

914

sprintf(buf, "color: %s", color);

915

AddStyleProperty(node, buf);

916

}

917

918

static void AddAlign(Node *node, char *align)

919

{

920

char buf[1024], *p, *q;

921

922

/* force alignment value to lower case */

923

for (p = buf, q = "text-align: "; (*p++ = *q++););

924

for (p = p-1; (*p++ = ToLower(*align++)););

925

AddStyleProperty(node, buf);

880

tmbchar buf[64];

881

tmbsnprintf(buf, sizeof(buf), "font-size: %s", value);

882

AddStyleProperty( doc, node, buf );

883

}

884

}

885

886

/*
  Add a "color: <color>" style property to node.
  The property text is formatted into a fixed 128-character buffer
  with a bounded formatter.
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar buf[128];
    tmbsnprintf(buf, sizeof(buf), "color: %s", color);
    AddStyleProperty( doc, node, buf );
}

892

893

/* force alignment value to lower case */

894

static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )

895

{

896

int i;

897

tmbchar buf[128];

898

899

tmbstrcpy( buf, "text-align: " );

900

for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )

901

{

902

if ( (buf[i] = (tmbchar)ToLower(*align++)) == '\0' )

903

break;

904

}

905

buf[i] = '\0';

906

AddStyleProperty( doc, node, buf );

926

907

}

927

908

928

909

/*

929

910

add style properties to node corresponding to

930

911

the font face, size and color attributes

931

912

*/

932

static void AddFontStyles(Node *node, AttVal *av)

913

static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)

933

914

{

934

915

while (av)

935

916

{

936

if (wstrcmp(av->attribute, "face") == 0)

937

AddFontFace(node, av->value);

938

else if (wstrcmp(av->attribute, "size") == 0)

939

AddFontSize(node, av->value);

940

else if (wstrcmp(av->attribute, "color") == 0)

941

AddFontColor(node, av->value);

942

917

if (AttrHasValue(av))

918

{

919

if (attrIsFACE(av))

920

AddFontFace( doc, node, av->value );

921

else if (attrIsSIZE(av))

922

AddFontSize( doc, node, av->value );

923

else if (attrIsCOLOR(av))

924

AddFontColor( doc, node, av->value );

925

}

943

926

av = av->next;

944

927

}

945

928

}

948

931

Symptom: <p align=center>

949

932

Action: <p style="text-align: center">

950

933

*/

951

static void TextAlign(Lexer *lexer, Node *node)

934

static void TextAlign( TidyDocImpl* doc, Node* node )

952

935

{

953

936

AttVal *av, *prev;

954

937

955

prev = null;

938

prev = NULL;

956

939

957

940

for (av = node->attributes; av; av = av->next)

958

941

{

959

if (wstrcmp(av->attribute, "align") == 0)

942

if (attrIsALIGN(av))

960

943

{

961

944

if (prev)

962

945

prev->next = av->next;

963

946

else

964

947

node->attributes = av->next;

965

948

966

MemFree(av->attribute);

967

968

949

if (av->value)

969

{

970

AddAlign(node, av->value);

971

MemFree(av->value);

972

}

950

AddAlign( doc, node, av->value );

973

951

974

MemFree(av);

952

FreeAttribute(doc, av);

975

953

break;

976

954

}

977

955

981

959

982

960

/*

983

961

The clean up rules use the pnode argument to return the

984

next node when the orignal node has been deleted

962

next node when the original node has been deleted

985

963

*/

986

964

987

965

/*

989

967

Action: coerce <dir> <li> to <div> with indent.

990

968

*/

991

969

992

static Bool Dir2Div(Lexer *lexer, Node *node, Node **pnode)

970

static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))

993

971

{

994

972

Node *child;

995

973

996

if (node->tag == tag_dir || node->tag == tag_ul || node->tag == tag_ol)

974

if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )

997

975

{

998

976

child = node->content;

999

977

1000

if (child == null)

978

if (child == NULL)

1001

979

return no;

1002

980

1003

981

/* check child has no peers */

1005

983

if (child->next)

1006

984

return no;

1007

985

1008

if (child->tag != tag_li)

986

if ( !nodeIsLI(child) )

1009

987

return no;

1010

988

1011

if (!child->implicit)

989

if ( !child->implicit )

1012

990

return no;

1013

991

1014

992

/* coerce dir to div */

1015

1016

node->tag = tag_div;

1017

MemFree(node->element);

1018

node->element = wstrdup("div");

1019

AddStyleProperty(node, "margin-left: 2em");

1020

StripOnlyChild(node);

1021

return yes;

1022

1023

#if 0

1024

content = child->content;

1025

last = child->last;

1026

child->content = null;

1027

1028

/* adjust parent and set margin on contents of <li> */

1029

1030

for (child = content; child; child = child->next)

1031

{

1032

child->parent = node->parent;

1033

AddStyleProperty(child, "margin-left: 1em");

1034

}

1035

1036

/* hook first/last into sequence */

1037

1038

if (content)

1039

{

1040

content->prev = node->prev;

1041

last->next = node->next;

1042

FixNodeLinks(content);

1043

FixNodeLinks(last);

1044

}

1045

1046

node->next = null;

1047

FreeNode(node);

1048

1049

/* ensure that new node is cleaned */

1050

*pnode = CleanNode(lexer, content);

1051

return yes;

1052

#endif

993

node->tag = LookupTagDef( TidyTag_DIV );

994

MemFree( node->element );

995

node->element = tmbstrdup("div");

996

AddStyleProperty( doc, node, "margin-left: 2em" );

997

StripOnlyChild( doc, node );

998

return yes;

1053

999

}

1054

1000

1055

1001

return no;

1060

1006

Action: replace <center> by <div style="text-align: center">

1061

1007

*/

1062

1008

1063

static Bool Center2Div(Lexer *lexer, Node *node, Node **pnode)

1009

static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)

1064

1010

{

1065

if (node->tag == tag_center)

1011

if ( nodeIsCENTER(node) )

1066

1012

{

1067

if (DropFontTags)

1013

if ( cfgBool(doc, TidyDropFontTags) )

1068

1014

{

1069

1015

if (node->content)

1070

1016

{

1071

Node *last = node->last, *parent = node->parent;

1072

1073

DiscardContainer(node, pnode);

1074

1075

node = InferredTag(lexer, "br");

1076

1077

if (last->next)

1078

last->next->prev = node;

1079

1080

node->next = last->next;

1081

last->next = node;

1082

node->prev = last;

1083

1084

if (parent->last == last)

1085

parent->last = node;

1086

1087

node->parent = parent;

1017

Node *last = node->last;

1018

DiscardContainer( doc, node, pnode );

1019

1020

node = InferredTag(doc, TidyTag_BR);

1021

InsertNodeAfterElement(last, node);

1088

1022

}

1089

1023

else

1090

1024

{

1091

Node *prev = node->prev, *next = node->next, *parent = node->parent;

1092

DiscardContainer(node, pnode);

1093

1094

node = InferredTag(lexer, "br");

1095

node->next = next;

1096

node->prev = prev;

1097

node->parent = parent;

1098

1025

Node *prev = node->prev, *next = node->next,

1026

*parent = node->parent;

1027

DiscardContainer( doc, node, pnode );

1028

1029

node = InferredTag(doc, TidyTag_BR);

1099

1030

if (next)

1100

next->prev = node;

1101

else

1102

parent->last = node;

1103

1104

if (prev)

1105

prev->next = node;

1106

else

1107

parent->content = node;

1031

InsertNodeBeforeElement(next, node);

1032

else if (prev)

1033

InsertNodeAfterElement(prev, node);

1034

else

1035

InsertNodeAtStart(parent, node);

1108

1036

}

1109

1037

1110

1038

return yes;

1111

1039

}

1112

1040

1113

node->tag = tag_div;

1114

MemFree(node->element);

1115

node->element = wstrdup("div");

1116

AddStyleProperty(node, "text-align: center");

1041

RenameElem( node, TidyTag_DIV );

1042

AddStyleProperty( doc, node, "text-align: center" );

1117

1043

return yes;

1118

1044

}

1119

1045

1120

1046

return no;

1121

1047

}

1122

1048

1049

/* Copy child attributes to node. Duplicate attributes are overwritten.

1050

Unique attributes (such as ID) disable the action.

1051

Attributes style and class are not dealt with. A call to MergeStyles

1052

will do that.

1053

*/

1054

static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)

1055

{

1056

AttVal *av1, *av2;

1057

TidyAttrId id;

1058

1059

/* Detect attributes that cannot be merged or overwritten. */

1060

if (AttrGetById(child, TidyAttr_ID) != NULL

1061

&& AttrGetById(node, TidyAttr_ID) != NULL)

1062

return no;

1063

1064

/* Move child attributes to node. Attributes in node

1065

can be overwritten or merged. */

1066

for (av2 = child->attributes; av2; )

1067

{

1068

/* Dealt with by MergeStyles. */

1069

if (attrIsSTYLE(av2) || attrIsCLASS(av2))

1070

{

1071

av2 = av2->next;

1072

continue;

1073

}

1074

/* Avoid duplicates in node */

1075

if ((id=AttrId(av2)) != TidyAttr_UNKNOWN

1076

&& (av1=AttrGetById(node, id))!= NULL)

1077

RemoveAttribute( doc, node, av1 );

1078

1079

/* Move attribute from child to node */

1080

DetachAttribute( child, av2 );

1081

av1 = av2;

1082

av2 = av2->next;

1083

av1->next = NULL;

1084

InsertAttributeAtEnd( node, av1 );

1085

}

1086

1087

return yes;

1088

}

1089

1123

1090

/*

1124

Symptom <div><div>...</div></div>

1125

Action: merge the two divs

1091

Symptom <XX><XX>...</XX></XX>

1092

Action: merge the two XXs

1126

1093

1127

This is useful after nested <dir>s used by Word

1094

For instance, this is useful after nested <dir>s used by Word

1128

1095

for indenting have been converted to <div>s

1096

1097

If state is "no", no merging.

1098

If state is "yes", inner element is discarded. Only Style and Class

1099

attributes are merged using MergeStyles().

1100

If state is "auto", attributes are merged as described in CopyAttrs().

1101

Style and Class attributes are merged using MergeStyles().

1129

1102

*/

1130

static Bool MergeDivs(Lexer *lexer, Node *node, Node **pnode)

1103

static Bool MergeNestedElements( TidyDocImpl* doc,

1104

TidyTagId Id, TidyTriState state, Node *node,

1105

Node **ARG_UNUSED(pnode))

1131

1106

{

1132

1107

Node *child;

1133

1108

1134

if (node->tag != tag_div)

1109

if ( state == TidyNoState

1110

|| !TagIsId(node, Id) )

1135

1111

return no;

1136

1112

1137

1113

child = node->content;

1138

1114

1139

if (!child)

1140

return no;

1141

1142

if (child->tag != tag_div)

1143

return no;

1144

1145

if (child->next != null)

1146

return no;

1147

1148

MergeStyles(node, child);

1149

StripOnlyChild(node);

1115

if ( child == NULL

1116

|| child->next != NULL

1117

|| !TagIsId(child, Id) )

1118

return no;

1119

1120

if ( state == TidyAutoState

1121

&& CopyAttrs(doc, node, child) == no )

1122

return no;

1123

1124

MergeStyles( doc, node, child );

1125

StripOnlyChild( doc, node );

1150

1126

return yes;

1151

1127

}

1152

1128

1155

1131

Action: discard outer list

1156

1132

*/

1157

1133

1158

static Bool NestedList(Lexer *lexer, Node *node, Node **pnode)

1134

static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )

1159

1135

{

1160

1136

Node *child, *list;

1161

1137

1162

if (node->tag == tag_ul || node->tag == tag_ol)

1138

if ( nodeIsUL(node) || nodeIsOL(node) )

1163

1139

{

1164

1140

child = node->content;

1165

1141

1166

if (child == null)

1142

if (child == NULL)

1167

1143

return no;

1168

1144

1169

1145

/* check child has no peers */

1179

1155

if (list->tag != node->tag)

1180

1156

return no;

1181

1157

1158

/* check list has no peers */

1159

if (list->next)

1160

return no;

1161

1182

1162

*pnode = list; /* Set node to resume iteration */

1183

1163

1184

1164

/* move inner list node into position of outer node */

1188

1168

FixNodeLinks(list);

1189

1169

1190

1170

/* get rid of outer ul and its li */

1191

/* XXX: Are we leaking the child node? -creitzel 7 Jun, 01 */

1192

child->content = null;

1193

node->content = null;

1194

node->next = null;

1195

FreeNode(node);

1196

node = null;

1171

child->content = NULL;

1172

FreeNode( doc, child ); /* See test #427841. */

1173

child = NULL;

1174

node->content = NULL;

1175

node->next = NULL;

1176

FreeNode( doc, node );

1177

node = NULL;

1197

1178

1198

1179

/*

1199

1180

If prev node was a list the chances are this node

1203

1184

1204

1185

if (list->prev)

1205

1186

{

1206

if (list->prev->tag == tag_ul || list->prev->tag == tag_ol)

1187

if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))

1188

&& list->prev->last )

1207

1189

{

1208

1190

node = list;

1209

1191

list = node->prev;

1192

1193

child = list->last; /* <li> */

1194

1210

1195

list->next = node->next;

1211

1212

if (list->next)

1213

list->next->prev = list;

1214

1215

child = list->last; /* <li> */

1196

FixNodeLinks(list);

1216

1197

1217

1198

node->parent = child;

1218

node->next = null;

1199

node->next = NULL;

1219

1200

node->prev = child->last;

1220

1201

FixNodeLinks(node);

1221

CleanNode(lexer, node);

1202

CleanNode( doc, node );

1222

1203

}

1223

1204

}

1224

1205

1229

1210

}

1230

1211

1231

1212

/*

1213

Some necessary conditions to apply BlockStyle().

1214

*/

1215

1216

static Bool CanApplyBlockStyle( Node *node )

1217

{

1218

if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)

1219

&& !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )

1220

{

1221

return yes;

1222

}

1223

return no;

1224

}

1225

1226

/*

1232

1227

Symptom: the only child of a block-level element is a

1233

1228

presentation element such as B, I or FONT

1234

1229

1251

1246

However, to avoid CSS problems with Navigator 4, this isn't done

1252

1247

for the elements: caption, tr and table

1253

1248

*/

1254

static Bool BlockStyle(Lexer *lexer, Node *node, Node **pnode)

1249

static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )

1255

1250

{

1256

1251

Node *child;

1257

1252

1258

if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE))

1253

if (CanApplyBlockStyle(node))

1259

1254

{

1260

if (node->tag != tag_table

1261

&& node->tag != tag_tr

1262

&& node->tag != tag_li)

1263

{

1264

/* check for align attribute */

1265

if (node->tag != tag_caption)

1266

TextAlign(lexer, node);

1267

1268

child = node->content;

1269

1270

if (child == null)

1271

return no;

1272

1273

/* check child has no peers */

1274

1275

if (child->next)

1276

return no;

1277

1278

if (child->tag == tag_b)

1279

{

1280

MergeStyles(node, child);

1281

AddStyleProperty(node, "font-weight: bold");

1282

StripOnlyChild(node);

1283

return yes;

1284

}

1285

1286

if (child->tag == tag_i)

1287

{

1288

MergeStyles(node, child);

1289

AddStyleProperty(node, "font-style: italic");

1290

StripOnlyChild(node);

1291

return yes;

1292

}

1293

1294

if (child->tag == tag_font)

1295

{

1296

MergeStyles(node, child);

1297

AddFontStyles(node, child->attributes);

1298

StripOnlyChild(node);

1299

return yes;

1300

}

1255

/* check for align attribute */

1256

if ( !nodeIsCAPTION(node) )

1257

TextAlign( doc, node );

1258

1259

child = node->content;

1260

if (child == NULL)

1261

return no;

1262

1263

/* check child has no peers */

1264

if (child->next)

1265

return no;

1266

1267

if ( nodeIsB(child) )

1268

{

1269

MergeStyles( doc, node, child );

1270

AddStyleProperty( doc, node, "font-weight: bold" );

1271

StripOnlyChild( doc, node );

1272

return yes;

1273

}

1274

1275

if ( nodeIsI(child) )

1276

{

1277

MergeStyles( doc, node, child );

1278

AddStyleProperty( doc, node, "font-style: italic" );

1279

StripOnlyChild( doc, node );

1280

return yes;

1281

}

1282

1283

if ( nodeIsFONT(child) )

1284

{

1285

MergeStyles( doc, node, child );

1286

AddFontStyles( doc, node, child->attributes );

1287

StripOnlyChild( doc, node );

1288

return yes;

1301

1289

}

1302

1290

}

1303

1291

1305

1293

}

1306

1294

1307

1295

/* the only child of table cell or an inline element such as em */

1308

static Bool InlineStyle(Lexer *lexer, Node *node, Node **pnode)

1296

static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )

1309

1297

{

1310

1298

Node *child;

1311

1299

1312

if (node->tag != tag_font && (node->tag->model & (CM_INLINE|CM_ROW)))

1300

if ( !nodeIsFONT(node) && nodeHasCM(node, CM_INLINE|CM_ROW) )

1313

1301

{

1314

1302

child = node->content;

1315

1303

1316

if (child == null)

1304

if (child == NULL)

1317

1305

return no;

1318

1306

1319

1307

/* check child has no peers */

1321

1309

if (child->next)

1322

1310

return no;

1323

1311

1324

if (child->tag == tag_b && LogicalEmphasis)

1325

{

1326

MergeStyles(node, child);

1327

AddStyleProperty(node, "font-weight: bold");

1328

StripOnlyChild(node);

1329

return yes;

1330

}

1331

1332

if (child->tag == tag_i && LogicalEmphasis)

1333

{

1334

MergeStyles(node, child);

1335

AddStyleProperty(node, "font-style: italic");

1336

StripOnlyChild(node);

1337

return yes;

1338

}

1339

1340

if (child->tag == tag_font)

1341

{

1342

MergeStyles(node, child);

1343

AddFontStyles(node, child->attributes);

1344

StripOnlyChild(node);

1312

if ( nodeIsB(child) && cfgBool(doc, TidyLogicalEmphasis) )

1313

{

1314

MergeStyles( doc, node, child );

1315

AddStyleProperty( doc, node, "font-weight: bold" );

1316

StripOnlyChild( doc, node );

1317

return yes;

1318

}

1319

1320

if ( nodeIsI(child) && cfgBool(doc, TidyLogicalEmphasis) )

1321

{

1322

MergeStyles( doc, node, child );

1323

AddStyleProperty( doc, node, "font-style: italic" );

1324

StripOnlyChild( doc, node );

1325

return yes;

1326

}

1327

1328

if ( nodeIsFONT(child) )

1329

{

1330

MergeStyles( doc, node, child );

1331

AddFontStyles( doc, node, child->attributes );

1332

StripOnlyChild( doc, node );

1345

1333

return yes;

1346

1334

}

1347

1335

}

1354

1342

the font element's attributes and replacing them

1355

1343

by a single style attribute.

1356

1344

*/

1357

static Bool Font2Span(Lexer *lexer, Node *node, Node **pnode)

1345

static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )

1358

1346

{

1359

1347

AttVal *av, *style, *next;

1360

1348

1361

if (node->tag == tag_font)

1349

if ( nodeIsFONT(node) )

1362

1350

{

1363

if (DropFontTags)

1351

if ( cfgBool(doc, TidyDropFontTags) )

1364

1352

{

1365

DiscardContainer(node, pnode);

1366

return no;

1353

DiscardContainer( doc, node, pnode );

1354

return yes;

1367

1355

}

1368

1356

1369

/* if FONT is only child of parent element then leave alone */

1370

if (node->parent->content == node

1371

&& node->next == null)

1357

/* if FONT is only child of parent element then leave alone

1358

Do so only if BlockStyle may be successful. */

1359

if ( node->parent->content == node && node->next == NULL &&

1360

CanApplyBlockStyle(node->parent) )

1372

1361

return no;

1373

1362

1374

AddFontStyles(node, node->attributes);

1363

AddFontStyles( doc, node, node->attributes );

1375

1364

1376

1365

/* extract style attribute and free the rest */

1377

1366

av = node->attributes;

1378

style = null;

1367

style = NULL;

1379

1368

1380

1369

while (av)

1381

1370

{

1382

1371

next = av->next;

1383

1372

1384

if (wstrcmp(av->attribute, "style") == 0)

1373

if (attrIsSTYLE(av))

1385

1374

{

1386

av->next = null;

1375

av->next = NULL;

1387

1376

style = av;

1388

1377

}

1389

1378

else

1390

1379

{

1391

if (av->attribute)

1392

MemFree(av->attribute);

1393

if (av->value)

1394

MemFree(av->value);

1395

1396

MemFree(av);

1380

FreeAttribute( doc, av );

1397

1381

}

1398

1399

1382

av = next;

1400

1383

}

1401

1384

1402

1385

node->attributes = style;

1403

1404

node->tag = tag_span;

1405

MemFree(node->element);

1406

node->element = wstrdup("span");

1407

1386

RenameElem( node, TidyTag_SPAN );

1408

1387

return yes;

1409

1388

}

1410

1389

1411

1390

return no;

1412

1391

}

1413

1392

1414

static Bool IsElement(Node *node)

1415

{

1416

return (node->type == StartTag || node->type == StartEndTag ? yes : no);

1417

}

1418

1419

1393

/*

1420

1394

Applies all matching rules to a node.

1421

1395

*/

1422

Node *CleanNode(Lexer *lexer, Node *node)

1396

Node* CleanNode( TidyDocImpl* doc, Node *node )

1423

1397

{

1424

Node *next = null;

1398

Node *next = NULL;

1399

TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);

1425

1400

1426

for (next = node; node && IsElement(node); node = next)

1401

for (next = node; nodeIsElement(node); node = next)

1427

1402

{

1428

if (Dir2Div(lexer, node, &next))

1403

if ( Dir2Div(doc, node, &next) )

1429

1404

continue;

1430

1405

1431

1406

/* Special case: true result means

1433

1408

** So we must jump back up the CreateStyleProperties()

1434

1409

** call stack until we have a valid node reference.

1435

1410

*/

1436

if (NestedList(lexer, node, &next))

1411

if ( NestedList(doc, node, &next) )

1437

1412

return next;

1438

1413

1439

if (Center2Div(lexer, node, &next))

1440

continue;

1441

1442

if (MergeDivs(lexer, node, &next))

1443

continue;

1444

1445

if (BlockStyle(lexer, node, &next))

1446

continue;

1447

1448

if (InlineStyle(lexer, node, &next))

1449

continue;

1450

1451

if (Font2Span(lexer, node, &next))

1414

if ( Center2Div(doc, node, &next) )

1415

continue;

1416

1417

if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )

1418

continue;

1419

1420

if ( BlockStyle(doc, node, &next) )

1421

continue;

1422

1423

if ( InlineStyle(doc, node, &next) )

1424

continue;

1425

1426

if ( Font2Span(doc, node, &next) )

1452

1427

continue;

1453

1428

1454

1429

break;

1458

1433

}

1459

1434

1460

1435

/* Special case: if the current node is destroyed by

1461

** CleanNode() lower in the tree, this node and its

1462

** parent no longer exist. So we must jump back up

1463

** the CreateStyleProperties() call stack until we

1464

** have a valid node reference.

1436

** CleanNode() lower in the tree, this node and its parent

1437

** no longer exist. So we must jump back up the CleanTree()

1438

** call stack until we have a valid node reference.

1465

1439

*/

1466

1440

1467

static Node *CreateStyleProperties(Lexer *lexer, Node *node, Node** prepl)

1441

static Node* CleanTree( TidyDocImpl* doc, Node *node )

1468

1442

{

1469

Node *child;

1470

1471

1443

if (node->content)

1472

1444

{

1473

Node* repl = node;

1474

for (child = node->content; child != null; child = child->next)

1445

Node *child;

1446

for (child = node->content; child != NULL; child = child->next)

1475

1447

{

1476

child = CreateStyleProperties(lexer, child, &repl);

1477

if ( repl != node )

1478

return repl;

1448

child = CleanTree( doc, child );

1449

if ( !child )

1450

break;

1479

1451

}

1480

1452

}

1481

1453

1482

return CleanNode(lexer, node);

1454

return CleanNode( doc, node );

1483

1455

}

1484

1456

1485

static void DefineStyleRules(Lexer *lexer, Node *node)

1457

static void DefineStyleRules( TidyDocImpl* doc, Node *node )

1486

1458

{

1487

1459

Node *child;

1488

1460

1489

1461

if (node->content)

1490

1462

{

1491

1463

for (child = node->content;

1492

child != null; child = child->next)

1464

child != NULL; child = child->next)

1493

1465

{

1494

DefineStyleRules(lexer, child);

1466

DefineStyleRules( doc, child );

1495

1467

}

1496

1468

}

1497

1469

1498

Style2Rule(lexer, node);

1470

Style2Rule( doc, node );

1499

1471

}

1500

1472

1501

void CleanTree(Lexer *lexer, Node *doc)

1473

void CleanDocument( TidyDocImpl* doc )

1502

1474

{

1503

Node* repl = doc;

1504

doc = CreateStyleProperties(lexer, doc, &repl);

1475

/* placeholder. CleanTree()/CleanNode() will not

1476

** zap root element

1477

*/

1478

CleanTree( doc, &doc->root );

1505

1479

1506

if (MakeClean)

1480

if ( cfgBool(doc, TidyMakeClean) )

1507

1481

{

1508

DefineStyleRules(lexer, doc);

1509

CreateStyleElement(lexer, doc);

1482

DefineStyleRules( doc, &doc->root );

1483

CreateStyleElement( doc );

1510

1484

}

1511

1485

}

1512

1486

1513

1487

/* simplifies <b><b> ... </b> ...</b> etc. */

1514

void NestedEmphasis(Node *node)

1488

void NestedEmphasis( TidyDocImpl* doc, Node* node )

1515

1489

{

1516

1490

Node *next;

1517

1491

1519

1493

{

1520

1494

next = node->next;

1521

1495

1522

if ((node->tag == tag_b || node->tag == tag_i)

1523

&& node->parent && node->parent->tag == node->tag)

1496

if ( (nodeIsB(node) || nodeIsI(node))

1497

&& node->parent && node->parent->tag == node->tag)

1524

1498

{

1525

1499

/* strip redundant inner element */

1526

DiscardContainer(node, &next);

1500

DiscardContainer( doc, node, &next );

1527

1501

node = next;

1528

1502

continue;

1529

1503

}

1530

1504

1531

if (node->content)

1532

NestedEmphasis(node->content);

1505

if ( node->content )

1506

NestedEmphasis( doc, node->content );

1533

1507

1534

1508

node = next;

1535

1509

}

1536

1510

}

1537

1511

1512

1513

1538

1514

/* replace i by em and b by strong */

1539

void EmFromI(Node *node)

1515

void EmFromI( TidyDocImpl* doc, Node* node )

1540

1516

{

1541

1517

while (node)

1542

1518

{

1543

if (node->tag == tag_i)

1544

{

1545

MemFree(node->element);

1546

node->element = wstrdup(tag_em->name);

1547

node->tag = tag_em;

1548

}

1549

else if (node->tag == tag_b)

1550

{

1551

MemFree(node->element);

1552

node->element = wstrdup(tag_strong->name);

1553

node->tag = tag_strong;

1554

}

1519

if ( nodeIsI(node) )

1520

RenameElem( node, TidyTag_EM );

1521

else if ( nodeIsB(node) )

1522

RenameElem( node, TidyTag_STRONG );

1555

1523

1556

if (node->content)

1557

EmFromI(node->content);

1524

if ( node->content )

1525

EmFromI( doc, node->content );

1558

1526

1559

1527

node = node->next;

1560

1528

}

1562

1530

1563

1531

static Bool HasOneChild(Node *node)

1564

1532

{

1565

return (node->content && node->content->next == null);

1533

return (node->content && node->content->next == NULL);

1566

1534

}

1567

1535

1568

1536

/*

1572

1540

li. This is recursively replaced by an

1573

1541

implicit blockquote.

1574

1542

*/

1575

void List2BQ(Node *node)

1543

void List2BQ( TidyDocImpl* doc, Node* node )

1576

1544

{

1577

1545

while (node)

1578

1546

{

1579

1547

if (node->content)

1580

List2BQ(node->content);

1548

List2BQ( doc, node->content );

1581

1549

1582

if (node->tag && node->tag->parser == ParseList &&

1583

HasOneChild(node) && node->content->implicit)

1550

if ( node->tag && node->tag->parser == ParseList &&

1551

HasOneChild(node) && node->content->implicit )

1584

1552

{

1585

StripOnlyChild(node);

1586

MemFree(node->element);

1587

node->element = wstrdup(tag_blockquote->name);

1588

node->tag = tag_blockquote;

1553

StripOnlyChild( doc, node );

1554

RenameElem( node, TidyTag_BLOCKQUOTE );

1589

1555

node->implicit = yes;

1590

1556

}

1591

1557

1593

1559

}

1594

1560

}

1595

1561

1596

static char indent_buf[32];

1597

1562

1598

1563

/*

1599

1564

Replace implicit blockquote by div with an indent

1600

1565

taking care to reduce nested blockquotes to a single

1601

1566

div with the indent set to match the nesting depth

1602

1567

*/

1603

void BQ2Div(Node *node)

1568

void BQ2Div( TidyDocImpl* doc, Node *node )

1604

1569

{

1605

int indent;

1606

size_t len;

1607

AttVal *attval;

1570

tmbchar indent_buf[ 32 ];

1571

uint indent;

1608

1572

1609

1573

while (node)

1610

1574

{

1611

if (node->tag == tag_blockquote && node->implicit)

1575

if ( nodeIsBLOCKQUOTE(node) && node->implicit )

1612

1576

{

1613

1577

indent = 1;

1614

1578

1615

while(HasOneChild(node) &&

1616

node->content->tag == tag_blockquote &&

1617

node->implicit)

1579

while( HasOneChild(node) &&

1580

nodeIsBLOCKQUOTE(node->content) &&

1581

node->implicit)

1618

1582

{

1619

1583

++indent;

1620

StripOnlyChild(node);

1584

StripOnlyChild( doc, node );

1621

1585

}

1622

1586

1623

1587

if (node->content)

1624

BQ2Div(node->content);

1625

1626

len = sprintf(indent_buf, "margin-left: %dem", 2*indent);

1627

1628

MemFree(node->element);

1629

node->element = wstrdup(tag_div->name);

1630

node->tag = tag_div;

1631

1632

attval = GetAttrByName(node, "style");

1633

1634

if (attval)

1635

{

1636

char *s;

1637

1638

s = (char *)MemAlloc(len + 3 + wstrlen(attval->value));

1639

1640

wstrcpy(s, indent_buf);

1641

wstrcat(s, "; ");

1642

wstrcat(s, attval->value);

1643

1644

MemFree(attval->value);

1645

1646

attval->value = s;

1647

}

1648

else

1649

{

1650

AddAttribute(node, "style", indent_buf);

1651

}

1588

BQ2Div( doc, node->content );

1589

1590

tmbsnprintf(indent_buf, sizeof(indent_buf), "margin-left: %dem",

1591

2*indent);

1592

1593

RenameElem( node, TidyTag_DIV );

1594

AddStyleProperty(doc, node, indent_buf );

1652

1595

}

1653

1596

else if (node->content)

1654

BQ2Div(node->content);

1655

1597

BQ2Div( doc, node->content );

1656

1598

1657

1599

node = node->next;

1658

1600

}

1659

1601

}

1660

1602

1661

1603

1662

Node *FindEnclosingCell(Node *node)

1604

Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)

1663

1605

{

1664

1606

Node *check;

1665

1607

1666

1608

for ( check=node; check; check = check->parent )

1667

1609

{

1668

if ( check->tag == tag_td )

1610

if ( nodeIsTD(check) )

1669

1611

return check;

1670

1612

}

1671

return null;

1613

return NULL;

1672

1614

}

1673

1615

1674

1616

/* node is <![if ...]> prune up to <![endif]> */

1675

static Node *PruneSection(Lexer *lexer, Node *node)

1617

static Node* PruneSection( TidyDocImpl* doc, Node *node )

1676

1618

{

1619

Lexer* lexer = doc->lexer;

1620

1677

1621

for (;;)

1678

1622

{

1679

if (wstrncmp(lexer->lexbuf + node->start, "if !supportEmptyParas", 21) == 0)

1623

ctmbstr lexbuf = lexer->lexbuf + node->start;

1624

if ( tmbstrncmp(lexbuf, "if !supportEmptyParas", 21) == 0 )

1680

1625

{

1681

Node* cell = FindEnclosingCell( node );

1626

Node* cell = FindEnclosingCell( doc, node );

1682

1627

if ( cell )

1683

1628

{

1684

/* Need to put &nbsp; into cell so it doesn't look weird */

1685

char onesixty[2] = { (char) 160, (char)0 };

1686

Node* nbsp = NewLiteralTextNode( lexer, onesixty );

1629

/* Need to put &nbsp; into cell so it doesn't look weird

1630

*/

1631

Node* nbsp = NewLiteralTextNode( lexer, "\240" );

1632

assert( (byte)'\240' == (byte)160 );

1687

1633

InsertNodeBeforeElement( node, nbsp );

1688

1634

}

1689

1635

}

1690

1636

1691

1637

/* discard node and returns next */

1692

node = DiscardElement(node);

1638

node = DiscardElement( doc, node );

1693

1639

1694

if (node == null)

1695

return null;

1640

if (node == NULL)

1641

return NULL;

1696

1642

1697

1643

if (node->type == SectionTag)

1698

1644

{

1699

if (wstrncmp(lexer->lexbuf + node->start, "if", 2) == 0)

1645

if (tmbstrncmp(lexer->lexbuf + node->start, "if", 2) == 0)

1700

1646

{

1701

node = PruneSection(lexer, node);

1647

node = PruneSection( doc, node );

1702

1648

continue;

1703

1649

}

1704

1650

1705

if (wstrncmp(lexer->lexbuf + node->start, "endif", 5) == 0)

1651

if (tmbstrncmp(lexer->lexbuf + node->start, "endif", 5) == 0)

1706

1652

{

1707

node = DiscardElement(node);

1653

node = DiscardElement( doc, node );

1708

1654

break;

1709

1655

}

1710

1656

}

1713

1659

return node;

1714

1660

}

1715

1661

1716

void DropSections(Lexer *lexer, Node *node)

1662

void DropSections( TidyDocImpl* doc, Node* node )

1717

1663

{

1664

Lexer* lexer = doc->lexer;

1718

1665

while (node)

1719

1666

{

1720

1667

if (node->type == SectionTag)

1721

1668

{

1722

1669

/* prune up to matching endif */

1723

if ((wstrncmp(lexer->lexbuf + node->start, "if", 2) == 0) &&

1724

(wstrncmp(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */

1670

if ((tmbstrncmp(lexer->lexbuf + node->start, "if", 2) == 0) &&

1671

(tmbstrncmp(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */

1725

1672

{

1726

node = PruneSection(lexer, node);

1673

node = PruneSection( doc, node );

1727

1674

continue;

1728

1675

}

1729

1676

1730

1677

/* discard others as well */

1731

node = DiscardElement(node);

1678

node = DiscardElement( doc, node );

1732

1679

continue;

1733

1680

}

1734

1681

1735

1682

if (node->content)

1736

DropSections(lexer, node->content);

1683

DropSections( doc, node->content );

1737

1684

1738

1685

node = node->next;

1739

1686

}

1740

1687

}

1741

1688

1742

static void PurgeWord2000Attributes(Node *node)

1689

static void PurgeWord2000Attributes( TidyDocImpl* ARG_UNUSED(doc), Node* node )

1743

1690

{

1744

AttVal *attr, *next, *prev = null;

1691

AttVal *attr, *next, *prev = NULL;

1745

1692

1746

1693

for ( attr = node->attributes; attr; attr = next )

1747

1694

{

1749

1696

1750

1697

/* special check for class="Code" denoting pre text */

1751

1698

/* Pass thru user defined styles as HTML class names */

1752

if (wstrcmp(attr->attribute, "class") == 0)

1699

if (attrIsCLASS(attr))

1753

1700

{

1754

if ( wstrcmp(attr->value, "Code") == 0 ||

1755

wstrncmp(attr->value, "Mso", 3) != 0 )

1701

if (AttrValueIs(attr, "Code") ||

1702

tmbstrncmp(attr->value, "Mso", 3) != 0 )

1756

1703

{

1757

1704

prev = attr;

1758

1705

continue;

1759

1706

}

1760

1707

}

1761

1708

1762

if ( wstrcmp(attr->attribute, "class") == 0 ||

1763

wstrcmp(attr->attribute, "style") == 0 ||

1764

wstrcmp(attr->attribute, "lang") == 0 ||

1765

wstrncmp(attr->attribute, "x:", 2) == 0 ||

1766

( ( wstrcmp(attr->attribute, "height") == 0 ||

1767

wstrcmp(attr->attribute, "width") == 0 ) &&

1768

( node->tag == tag_td ||

1769

node->tag == tag_tr ||

1770

node->tag == tag_th ) )

1771

)

1709

if (attrIsCLASS(attr) ||

1710

attrIsSTYLE(attr) ||

1711

attrIsLANG(attr) ||

1712

( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&

1713

(nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||

1714

(attr->attribute && tmbstrncmp(attr->attribute, "x:", 2) == 0) )

1772

1715

{

1773

1716

if (prev)

1774

1717

prev->next = next;

1775

1718

else

1776

1719

node->attributes = next;

1777

1720

1778

FreeAttribute(attr);

1721

FreeAttribute( doc, attr );

1779

1722

}

1780

1723

else

1781

1724

prev = attr;

1783

1726

}

1784

1727

1785

1728

/* Word2000 uses span excessively, so we strip span out */

1786

static Node *StripSpan(Lexer *lexer, Node *span)

1729

static Node* StripSpan( TidyDocImpl* doc, Node* span )

1787

1730

{

1788

Node *node, *prev = null, *content;

1731

Node *node, *prev = NULL, *content;

1789

1732

1790

1733

/*

1791

1734

deal with span elements that have content

1793

1736

after having processed it

1794

1737

*/

1795

1738

1796

CleanWord2000(lexer, span->content);

1739

CleanWord2000( doc, span->content );

1797

1740

content = span->content;

1798

1741

1799

1742

if (span->prev)

1816

1759

prev = node;

1817

1760

}

1818

1761

1819

if (span->next == null)

1762

if (span->next == NULL)

1820

1763

span->parent->last = prev;

1821

1764

1822

1765

node = span->next;

1823

span->content = null;

1824

DiscardElement(span);

1766

span->content = NULL;

1767

DiscardElement( doc, span );

1825

1768

return node;

1826

1769

}

1827

1770

1828

1771

/* map non-breaking spaces to regular spaces */

1829

static void NormalizeSpaces(Lexer *lexer, Node *node)

1772

void NormalizeSpaces(Lexer *lexer, Node *node)

1830

1773

{

1831

while (node)

1774

while ( node )

1832

1775

{

1833

if (node->content)

1834

NormalizeSpaces(lexer, node->content);

1776

if ( node->content )

1777

NormalizeSpaces( lexer, node->content );

1835

1778

1836

if (node->type == TextNode)

1779

if (nodeIsText(node))

1837

1780

{

1838

1781

uint i, c;

1839

char *p = lexer->lexbuf + node->start;

1782

tmbstr p = lexer->lexbuf + node->start;

1840

1783

1841

1784

for (i = node->start; i < node->end; ++i)

1842

1785

{

1843

c = (unsigned char)lexer->lexbuf[i];

1786

c = (byte) lexer->lexbuf[i];

1844

1787

1845

1788

/* look for UTF-8 multibyte character */

1846

if (c > 0x7F)

1847

i += GetUTF8((unsigned char *)(lexer->lexbuf + i), &c);

1789

if ( c > 0x7F )

1790

i += GetUTF8( lexer->lexbuf + i, &c );

1848

1791

1849

if (c == 160)

1792

if ( c == 160 )

1850

1793

c = ' ';

1851

1794

1852

1795

p = PutUTF8(p, c);

1853

1796

}

1797

node->end = p - lexer->lexbuf;

1854

1798

}

1855

1799

1856

1800

node = node->next;

1860

1804

/* used to hunt for hidden preformatted sections */

1861

1805

Bool NoMargins(Node *node)

1862

1806

{

1863

AttVal *attval = GetAttrByName(node, "style");

1864

1865

if (attval == null)

1866

return no;

1867

1868

/* search for substring "margin-top: 0" */

1869

1870

if (!wsubstr(attval->value, "margin-top: 0"))

1871

return no;

1872

/* search for substring "margin-top: 0" */

1873

1874

if (!wsubstr(attval->value, "margin-bottom: 0"))

1807

AttVal *attval = AttrGetById(node, TidyAttr_STYLE);

1808

1809

if ( !AttrHasValue(attval) )

1810

return no;

1811

1812

/* search for substring "margin-top: 0" */

1813

if (!tmbsubstr(attval->value, "margin-top: 0"))

1814

return no;

1815

1816

/* search for substring "margin-bottom: 0" */

1817

if (!tmbsubstr(attval->value, "margin-bottom: 0"))

1875

1818

return no;

1876

1819

1877

1820

return yes;

1878

1821

}

1879

1822

1880

1823

/* does element have a single space as its content? */

1881

Bool SingleSpace(Lexer *lexer, Node *node)

1824

static Bool SingleSpace( Lexer* lexer, Node* node )

1882

1825

{

1883

if (node->content)

1826

if ( node->content )

1884

1827

{

1885

1828

node = node->content;

1886

1829

1887

if (node->next != null)

1888

return no;

1889

1890

if (node->type != TextNode)

1891

return no;

1892

1893

if (((node->end - node->start) == 1) &&

1894

lexer->lexbuf[node->start] == ' ')

1830

if ( node->next != NULL )

1831

return no;

1832

1833

if ( node->type != TextNode )

1834

return no;

1835

1836

if ( (node->end - node->start) == 1 &&

1837

lexer->lexbuf[node->start] == ' ' )

1895

1838

return yes;

1896

1839

1897

if ((node->end - node->start) == 2)

1840

if ( (node->end - node->start) == 2 )

1898

1841

{

1899

uint c;

1900

1901

GetUTF8((unsigned char *)lexer->lexbuf + node->start, &c);

1902

1903

if (c == 160)

1842

uint c = 0;

1843

GetUTF8( lexer->lexbuf + node->start, &c );

1844

if ( c == 160 )

1904

1845

return yes;

1905

1846

}

1906

1847

}

1915

1856

declare them as new tags, such as o:p which needs to be declared

1916

1857

as inline.

1917

1858

*/

1918

void CleanWord2000(Lexer *lexer, Node *node)

1859

void CleanWord2000( TidyDocImpl* doc, Node *node)

1919

1860

{

1920

1861

/* used to a list from a sequence of bulletted p's */

1921

Node *list = null;

1862

Lexer* lexer = doc->lexer;

1863

Node* list = NULL;

1922

1864

1923

while (node)

1865

while ( node )

1924

1866

{

1925

1867

/* get rid of Word's xmlns attributes */

1926

if (node->tag == tag_html)

1868

if ( nodeIsHTML(node) )

1927

1869

{

1928

1870

/* check that it's a Word 2000 document */

1929

if (!GetAttrByName(node, "xmlns:o") && !MakeBare)

1871

if ( !GetAttrByName(node, "xmlns:o") &&

1872

!cfgBool(doc, TidyMakeBare) )

1930

1873

return;

1931

1874

1932

FreeAttrs(node);

1875

FreeAttrs( doc, node );

1933

1876

}

1934

1877

1935

1878

/* fix up preformatted sections by looking for a

1936

1879

** sequence of paragraphs with zero top/bottom margin

1937

1880

*/

1938

if (node->tag == tag_p)

1881

if ( nodeIsP(node) )

1939

1882

{

1940

1883

if (NoMargins(node))

1941

1884

{

1942

1885

Node *pre, *next;

1943

CoerceNode(lexer, node, tag_pre);

1886

CoerceNode(doc, node, TidyTag_PRE, no, yes);

1944

1887

1945

PurgeWord2000Attributes(node);

1888

PurgeWord2000Attributes( doc, node );

1946

1889

1947

1890

if (node->content)

1948

CleanWord2000(lexer, node->content);

1891

CleanWord2000( doc, node->content );

1949

1892

1950

1893

pre = node;

1951

1894

node = node->next;

1952

1895

1953

1896

/* continue to strip p's */

1954

1897

1955

while (node->tag == tag_p && NoMargins(node))

1898

while ( nodeIsP(node) && NoMargins(node) )

1956

1899

{

1957

1900

next = node->next;

1958

1901

RemoveNode(node);

1959

1902

InsertNodeAtEnd(pre, NewLineNode(lexer));

1960

1903

InsertNodeAtEnd(pre, node);

1961

StripSpan(lexer, node);

1904

StripSpan( doc, node );

1962

1905

node = next;

1963

1906

}

1964

1907

1965

if (node == null)

1908

if (node == NULL)

1966

1909

break;

1967

1910

}

1968

1911

}

1970

1913

if (node->tag && (node->tag->model & CM_BLOCK)

1971

1914

&& SingleSpace(lexer, node))

1972

1915

{

1973

node = StripSpan(lexer, node);

1916

node = StripSpan( doc, node );

1974

1917

continue;

1975

1918

}

1976

1919

/* discard Word's style verbiage */

1977

if (node->tag == tag_style || node->tag == tag_meta || node->type == CommentTag)

1920

if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||

1921

node->type == CommentTag )

1978

1922

{

1979

node = DiscardElement(node);

1923

node = DiscardElement( doc, node );

1980

1924

continue;

1981

1925

}

1982

1926

1983

1927

/* strip out all span and font tags Word scatters so liberally! */

1984

if (node->tag == tag_span || node->tag == tag_font)

1928

if ( nodeIsSPAN(node) || nodeIsFONT(node) )

1985

1929

{

1986

node = StripSpan(lexer, node);

1930

node = StripSpan( doc, node );

1987

1931

continue;

1988

1932

}

1989

1933

1990

if (node->tag == tag_link)

1934

if ( nodeIsLINK(node) )

1991

1935

{

1992

AttVal *attr = GetAttrByName(node, "rel");

1936

AttVal *attr = AttrGetById(node, TidyAttr_REL);

1993

1937

1994

if (attr && wstrcmp(attr->value, "File-List") == 0)

1938

if (AttrValueIs(attr, "File-List"))

1995

1939

{

1996

node = DiscardElement(node);

1940

node = DiscardElement( doc, node );

1997

1941

continue;

1998

1942

}

1999

1943

}

2000

1944

2001

1945

/* discard empty paragraphs */

2002

if (node->content == null && node->tag == tag_p)

1946

1947

if ( node->content == NULL && nodeIsP(node) )

2003

1948

{

2004

node = DiscardElement(node);

1949

/* Use the existing function to ensure consistency */

1950

node = TrimEmptyElement( doc, node );

2005

1951

continue;

2006

1952

}

2007

1953

2008

if (node->tag == tag_p)

1954

if ( nodeIsP(node) )

2009

1955

{

2010

1956

AttVal *attr, *atrStyle;

2011

1957

2012

attr = GetAttrByName(node, "class");

2013

atrStyle = GetAttrByName(node, "style");

1958

attr = AttrGetById(node, TidyAttr_CLASS);

1959

atrStyle = AttrGetById(node, TidyAttr_STYLE);

2014

1960

/*

2015

1961

(JES) Sometimes Word marks a list item with the following hokie syntax

2016

1962

<p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;

2018

1964

*/

2019

1965

/* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */

2020

1966

/* map <p class="MsoListNumber"> to <ol>...</ol> */

2021

if ((attr &&

2022

(wstrcmp(attr->value, "MsoListBullet") == 0 ||

2023

wstrcmp(attr->value, "MsoListNumber") == 0 )) ||

2024

(atrStyle && (strstr(atrStyle->value,"mso-list:") != null))) /* 463066 - fix by Joel Shafer 19 Sep 01 */

1967

if ( AttrValueIs(attr, "MsoListBullet") ||

1968

AttrValueIs(attr, "MsoListNumber") ||

1969

AttrContains(atrStyle, "mso-list:") )

2025

1970

{

2026

Dict* listType = tag_ul;

2027

if ( wstrcmp(attr->value, "MsoListNumber") == 0 )

2028

listType = tag_ol;

2029

2030

CoerceNode(lexer, node, tag_li);

2031

2032

if (!list || list->tag != listType)

1971

TidyTagId listType = TidyTag_UL;

1972

if (AttrValueIs(attr, "MsoListNumber"))

1973

listType = TidyTag_OL;

1974

1975

CoerceNode(doc, node, TidyTag_LI, no, yes);

1976

1977

if ( !list || TagId(list) != listType )

2033

1978

{

2034

list = InferredTag(lexer, listType->name);

1979

const Dict* tag = LookupTagDef( listType );

1980

list = InferredTag(doc, tag->id);

2035

1981

InsertNodeBeforeElement(node, list);

2036

1982

}

2037

1983

2038

PurgeWord2000Attributes(node);

1984

PurgeWord2000Attributes( doc, node );

2039

1985

2040

if (node->content)

2041

CleanWord2000(lexer, node->content);

1986

if ( node->content )

1987

CleanWord2000( doc, node->content );

2042

1988

2043

1989

/* remove node and append to contents of list */

2044

1990

RemoveNode(node);

2046

1992

node = list;

2047

1993

}

2048

1994

/* map sequence of <p class="Code"> to <pre>...</pre> */

2049

else if (attr && wstrcmp(attr->value, "Code") == 0)

1995

else if (AttrValueIs(attr, "Code"))

2050

1996

{

2051

1997

Node *br = NewLineNode(lexer);

2052

NormalizeSpaces(lexer, node);

1998

NormalizeSpaces(lexer, node->content);

2053

1999

2054

if (!list || list->tag != tag_pre)

2000

if ( !list || TagId(list) != TidyTag_PRE )

2055

2001

{

2056

list = InferredTag(lexer, "pre");

2002

list = InferredTag(doc, TidyTag_PRE);

2057

2003

InsertNodeBeforeElement(node, list);

2058

2004

}

2059

2005

2060

2006

/* remove node and append to contents of list */

2061

2007

RemoveNode(node);

2062

2008

InsertNodeAtEnd(list, node);

2063

StripSpan(lexer, node);

2009

StripSpan( doc, node );

2064

2010

InsertNodeAtEnd(list, br);

2065

2011

node = list->next;

2066

2012

}

2067

2013

else

2068

list = null;

2014

list = NULL;

2069

2015

}

2070

2016

else

2071

list = null;

2017

list = NULL;

2018

2019

if (!node)

2020

return;

2072

2021

2073

2022

/* strip out style and class attributes */

2074

if (node->type == StartTag || node->type == StartEndTag)

2075

PurgeWord2000Attributes(node);

2023

if (nodeIsElement(node))

2024

PurgeWord2000Attributes( doc, node );

2076

2025

2077

2026

if (node->content)

2078

CleanWord2000(lexer, node->content);

2027

CleanWord2000( doc, node->content );

2079

2028

2080

2029

node = node->next;

2081

2030

}

2082

2031

}

2083

2032

2084

Bool IsWord2000(Node *root)

2033

Bool IsWord2000( TidyDocImpl* doc )

2085

2034

{

2086

2035

AttVal *attval;

2087

2036

Node *node, *head;

2088

Node *html = FindHTML(root);

2037

Node *html = FindHTML( doc );

2089

2038

2090

2039

if (html && GetAttrByName(html, "xmlns:o"))

2091

2040

return yes;

2092

2041

2093

2042

/* search for <meta name="GENERATOR" content="Microsoft ..."> */

2094

head = FindHEAD(root);

2043

head = FindHEAD( doc );

2095

2044

2096

2045

if (head)

2097

2046

{

2098

2047

for (node = head->content; node; node = node->next)

2099

2048

{

2100

if (node->tag != tag_meta)

2101

continue;

2102

2103

attval = GetAttrByName(node, "name");

2104

2105

if (attval == null || attval->value == null)

2106

continue;

2107

2108

if (wstrcasecmp(attval->value, "generator") != 0)

2109

continue;

2110

2111

attval = GetAttrByName(node, "content");

2112

2113

if (attval == null || attval->value == null)

2114

continue;

2115

2116

if (wsubstr(attval->value, "Microsoft"))

2049

if ( !nodeIsMETA(node) )

2050

continue;

2051

2052

attval = AttrGetById( node, TidyAttr_NAME );

2053

2054

if ( !AttrValueIs(attval, "generator") )

2055

continue;

2056

2057

attval = AttrGetById( node, TidyAttr_CONTENT );

2058

2059

if ( AttrContains(attval, "Microsoft") )

2117

2060

return yes;

2118

2061

}

2119

2062

}

2122

2065

}

2123

2066

2124

2067

/* where appropriate move object elements from head to body */

2125

void BumpObject(Lexer *lexer, Node *html)

2068

void BumpObject( TidyDocImpl* doc, Node *html )

2126

2069

{

2127

Node *node, *next, *head = null, *body = null;

2128

2129

for (node = html->content; node != null; node = node->next)

2070

Node *node, *next, *head = NULL, *body = NULL;

2071

2072

if (!html)

2073

return;

2074

2075

for ( node = html->content; node != NULL; node = node->next )

2130

2076

{

2131

if (node->tag == tag_head)

2077

if ( nodeIsHEAD(node) )

2132

2078

head = node;

2133

2079

2134

if (node->tag == tag_body)

2080

if ( nodeIsBODY(node) )

2135

2081

body = node;

2136

2082

}

2137

2083

2138

if (head != null && body != null)

2084

if ( head != NULL && body != NULL )

2139

2085

{

2140

for (node = head->content; node != null; node = next)

2086

for (node = head->content; node != NULL; node = next)

2141

2087

{

2142

2088

next = node->next;

2143

2089

2144

if (node->tag == tag_object)

2090

if ( nodeIsOBJECT(node) )

2145

2091

{

2146

2092

Node *child;

2147

2093

Bool bump = no;

2148

2094

2149

for (child = node->content; child != null; child = child->next)

2095

for (child = node->content; child != NULL; child = child->next)

2150

2096

{

2151

2097

/* bump to body unless content is param */

2152

if ((child->type == TextNode && !IsBlank(lexer, node))

2153

|| child->tag != tag_param)

2098

if ( (nodeIsText(child) && !IsBlank(doc->lexer, node))

2099

|| !nodeIsPARAM(child) )

2154

2100

{

2155

2101

bump = yes;

2156

2102

break;

2157

2103

}

2158

2104

}

2159

2105

2160

if (bump)

2161

{

2162

RemoveNode(node);

2163

InsertNodeAtStart(body, node);

2164

}

2165

}

2166

}

2106

if ( bump )

2107

{

2108

RemoveNode( node );

2109

InsertNodeAtStart( body, node );

2110

}

2111

}

2112

}

2113

}

2114

}

2115

2116

/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/*
  Handles trailing <br> elements of block-level containers.

  pDoc    - document, passed on to DiscardElement() and TrimEmptyElement()
  pParent - node whose subtree is processed; NULL is ignored

  The children are processed recursively first. Then, if pParent has a
  block content model, each <br> found as its last child is dealt with:
  the first one that carries no attributes is discarded, any other one is
  unlinked and re-inserted after pParent. Finally pParent is passed to
  TrimEmptyElement().
*/
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *pChild, *pLast;
    Bool bBRDeleted = no;

    if (pParent == NULL)
        return;

    /* First, check the status of All My Children */
    for (pChild = pParent->content; pChild != NULL; )
    {
        /* The child may get trimmed, so fetch its sibling beforehand */
        Node *pFollowing = pChild->next;
        FixBrakes( pDoc, pChild );
        pChild = pFollowing;
    }

    if ( !nodeCMIsBlock( pParent ) )
        return;

    /* As long as my last child is a <br />, move it to my last peer */
    while ( (pLast = pParent->last) != NULL && nodeIsBR( pLast ) )
    {
        if ( pLast->attributes == NULL && bBRDeleted == no )
        {
            DiscardElement( pDoc, pLast );
            bBRDeleted = yes;
        }
        else
        {
            RemoveNode( pLast );
            InsertNodeAfterElement( pParent, pLast );
        }
    }

    TrimEmptyElement( pDoc, pParent );
}
#endif

2159

2160

/*
  Brings the charset given in <meta http-equiv="Content-Type" content="...">
  elements of the document head in line with the configured output
  encoding.

  pDoc - document; its TidyOutCharEncoding setting selects the encoding
         name via GetEncodingNameFromTidyId()
  head - the HEAD element; when it is not a HEAD node FindHEAD(pDoc) is
         used instead

  Returns nothing. Nothing is done when no encoding name is available or
  no head is found. For every matching META element the content value is
  split at ';' into a list of entries; the first entry whose name starts
  with "charset" (compared case-insensitively) is replaced by
  "charset=<enc>" and the attribute value is rebuilt from the list with
  CreatePropString().
*/
void VerifyHTTPEquiv(TidyDocImpl* pDoc, Node *head)
{
    Node *pNode;
    StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
    tmbstr s, pszBegin, pszEnd;
    ctmbstr enc = GetEncodingNameFromTidyId(cfg(pDoc, TidyOutCharEncoding));

    if (!enc)
        return;

    if (!nodeIsHEAD(head))
        head = FindHEAD(pDoc);

    if (!head)
        return;

    /* Find any <meta http-equiv='Content-Type' content='...' /> */
    for (pNode = head->content; NULL != pNode; pNode = pNode->next)
    {
        AttVal* httpEquiv = AttrGetById(pNode, TidyAttr_HTTP_EQUIV);
        AttVal* metaContent = AttrGetById(pNode, TidyAttr_CONTENT);

        if ( !nodeIsMETA(pNode) || !metaContent ||
             !AttrValueIs(httpEquiv, "Content-Type") )
            continue;

        /* work on a private copy: the ';' separators are overwritten
           with terminators while splitting */
        pszBegin = s = tmbstrdup( metaContent->value );
        while (pszBegin && *pszBegin)
        {
            /* skip leading white space; the cast keeps the argument in
               the range isspace() is defined for, even for bytes above
               0x7F on platforms where plain char is signed */
            while (isspace( (unsigned char) *pszBegin ))
                pszBegin++;

            pszEnd = pszBegin;
            while ('\0' != *pszEnd && ';' != *pszEnd)
                pszEnd++;
            if (';' == *pszEnd )
                *(pszEnd++) = '\0';

            /* pszEnd == pszBegin only at the end of the string, in which
               case the loop condition ends the scan */
            if (pszEnd > pszBegin)
            {
                prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
                prop->name = tmbstrdup( pszBegin );
                prop->value = NULL;
                prop->next = NULL;

                if (NULL != pLastProp)
                    pLastProp->next = prop;
                else
                    pFirstProp = prop;

                pLastProp = prop;
                pszBegin = pszEnd;
            }
        }
        MemFree( s );

        /* find the charset property */
        for (prop = pFirstProp; NULL != prop; prop = prop->next)
        {
            if (0 != tmbstrncasecmp( prop->name, "charset", 7 ))
                continue;

            /* 8 == strlen("charset="), plus room for the terminator */
            MemFree( prop->name );
            prop->name = MemAlloc( 8 + tmbstrlen(enc) + 1 );
            tmbstrcpy(prop->name, "charset=");
            tmbstrcpy(prop->name+8, enc);
            s = CreatePropString( pFirstProp );
            MemFree( metaContent->value );
            metaContent->value = s;
            break;
        }
        /* #718127, prevent memory leakage */
        FreeStyleProps(pFirstProp);
        pFirstProp = NULL;
        pLastProp = NULL;
    }
}

2235

2236

/*
  Removes comment nodes from a sibling list and, recursively, from the
  content of every non-comment node in that list.

  doc  - document, passed on to FreeNode()
  node - first node of the sibling list; may be NULL
*/
void DropComments(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        /* fetched up front because node may be freed below */
        following = node->next;

        if (node->type != CommentTag)
        {
            if (node->content)
                DropComments(doc, node->content);
        }
        else
        {
            RemoveNode(node);
            FreeNode(doc, node);
        }
    }
}

2258

2259

/*
  Walks a sibling list and hands every FONT element to DiscardContainer();
  the content of all other nodes is processed recursively.

  doc   - document, passed on to DiscardContainer()
  node  - first node of the sibling list; may be NULL
  pnode - unused

  The node to continue with after a FONT element is whatever
  DiscardContainer() leaves in the pointer it is given.
*/
void DropFontElements(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        following = node->next;

        if (nodeIsFONT(node))
            DiscardContainer(doc, node, &following);
        else if (node->content)
            DropFontElements(doc, node->content, &following);
    }
}

2280

2281

/*
  Replaces every WBR element in a sibling list, and recursively in the
  content of the other nodes, by a text node holding a single space.

  doc  - document; its lexer is used to create the text node
  node - first node of the sibling list; may be NULL
*/
void WbrToSpace(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        /* taken before the replacement is linked in, so the new text
           node is not visited */
        following = node->next;

        if (nodeIsWBR(node))
        {
            Node* blank = NewLiteralTextNode(doc->lexer, " ");

            InsertNodeAfterElement(node, blank);
            RemoveNode(node);
            FreeNode(doc, node);
        }
        else if (node->content)
        {
            WbrToSpace(doc, node->content);
        }
    }
}

2306

2307

/*

2308

Filters from Word and PowerPoint often use smart

2309

quotes resulting in character codes between 128

2310

and 159. Unfortunately, the corresponding HTML 4.0

2311

entities for these are not widely supported. The

2312

following converts dashes and quotation marks to

2313

the nearest ASCII equivalent. My thanks to

2314

Andrzej Novosiolov for his help with this code.

2315

2316

Note: The old code in the pretty printer applied

2317

this to all node types and attribute values while

2318

this routine applies it only to text nodes. First,

2319

Microsoft Office products rarely put the relevant

2320

characters into these tokens, second support for

2321

them is much better now and last but not least, it

2322

can be harmful to replace these characters since

2323

US-ASCII quote marks are often used as syntax

2324

characters, a simple

2325

2326

2327

2328

would be broken if the U+2018 is replaced by "'".

2329

The old code would neither take care whether the

2330

quote mark is already used as delimiter,

2331

2332

2333

2334

got

2335

2336

2337

2338

Since browser support is much better nowadays and

2339

high-quality typography is better than ASCII it'd

2340

be probably a good idea to drop the feature...

2341

*/

2342

void DowngradeTypography(TidyDocImpl* doc, Node* node)

2343

{

2344

Node* next;

2345

Lexer* lexer = doc->lexer;

2346

2347

while (node)

2348

{

2349

next = node->next;

2350

2351

if (nodeIsText(node))

2352

{

2353

uint i, c;

2354

tmbstr p = lexer->lexbuf + node->start;

2355

2356

for (i = node->start; i < node->end; ++i)

2357

{

2358

c = (unsigned char) lexer->lexbuf[i];

2359

2360

if (c > 0x7F)

2361

i += GetUTF8(lexer->lexbuf + i, &c);

2362

2363

if (c >= 0x2013 && c <= 0x201E)

2364

{

2365

switch (c)

2366

{

2367

case 0x2013: /* en dash */

2368

case 0x2014: /* em dash */

2369

c = '-';

2370

break;

2371

case 0x2018: /* left single quotation mark */

2372

case 0x2019: /* right single quotation mark */

2373

case 0x201A: /* single low-9 quotation mark */

2374

c = '\'';

2375

break;

2376

case 0x201C: /* left double quotation mark */

2377

case 0x201D: /* right double quotation mark */

2378

case 0x201E: /* double low-9 quotation mark */

2379

c = '"';

2380

break;

2381

}

2382

}

2383

2384

p = PutUTF8(p, c);

2385

}

2386

2387

node->end = p - lexer->lexbuf;

2388

}

2389

2390

if (node->content)

2391

DowngradeTypography(doc, node->content);

2392

2393

node = next;

2394

}

2395

}

2396

2397

/*
  Applies NormalizeSpaces() to the content of every element in the
  subtree whose tag is parsed by ParsePre.

  doc  - document; its lexer is handed to NormalizeSpaces()
  node - first node of the sibling list; may be NULL

  The content of a matching element is passed to NormalizeSpaces() and is
  not searched any further by this function; the content of all other
  nodes is searched recursively.
*/
void ReplacePreformattedSpaces(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        following = node->next;

        if (node->tag && node->tag->parser == ParsePre)
            NormalizeSpaces(doc->lexer, node->content);
        else if (node->content)
            ReplacePreformattedSpaces(doc, node->content);
    }
}

2418

2419

/*
  Turns every CDATA node in a subtree into a text node by changing its
  node type.

  doc  - document, only passed along on recursion
  node - first node of the sibling list; may be NULL
*/
void ConvertCDATANodes(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        following = node->next;

        if (node->type == CDATATag)
            node->type = TextNode;

        if (node->content)
            ConvertCDATANodes(doc, node->content);
    }
}

2436

2437

/*

2438

FixLanguageInformation ensures that the document contains (only)

2439

the attributes for language information desired by the output

2440

document type. For example, for XHTML 1.0 documents both

2441

'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'

2442

is desired and for HTML 4.01 only 'lang' is desired.

2443

*/

2444

void FixLanguageInformation(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)

2445

{

2446

Node* next;

2447

2448

while (node)

2449

{

2450

next = node->next;

2451

2452

/* todo: report modifications made here to the report system */

2453

2454

if (nodeIsElement(node))

2455

{

2456

AttVal* lang = AttrGetById(node, TidyAttr_LANG);

2457

AttVal* xmlLang = AttrGetById(node, TidyAttr_XML_LANG);

2458

2459

if (lang && xmlLang)

2460

{

2461

/*

2462

todo: check whether both attributes are in sync,

2463

here or elsewhere, where elsewhere is probably

2464

preferable.

2465

AD - March 2005: not mandatory according the standards.

2466

*/

2467

}

2468

else if (lang && wantXmlLang)

2469

{

2470

if (NodeAttributeVersions( node, TidyAttr_XML_LANG )

2471

& doc->lexer->versionEmitted)

2472

RepairAttrValue(doc, node, "xml:lang", lang->value);

2473

}

2474

else if (xmlLang && wantLang)

2475

{

2476

if (NodeAttributeVersions( node, TidyAttr_LANG )

2477

& doc->lexer->versionEmitted)

2478

RepairAttrValue(doc, node, "lang", xmlLang->value);

2479

}

2480

2481

if (lang && !wantLang)

2482

RemoveAttribute(doc, node, lang);

2483

2484

if (xmlLang && !wantXmlLang)

2485

RemoveAttribute(doc, node, xmlLang);

2486

}

2487

2488

if (node->content)

2489

FixLanguageInformation(doc, node->content, wantXmlLang, wantLang);

2490

2491

node = next;

2492

}

2493

}

2494

2495

/*

2496

Set/fix/remove <html xmlns='...'>

2497

*/

2498

void FixXhtmlNamespace(TidyDocImpl* doc, Bool wantXmlns)

2499

{

2500

Node* html = FindHTML(doc);

2501

AttVal* xmlns;

2502

2503

if (!html)

2504

return;

2505

2506

xmlns = AttrGetById(html, TidyAttr_XMLNS);

2507

2508

if (wantXmlns)

2509

{

2510

if (!AttrValueIs(xmlns, XHTML_NAMESPACE))

2511

RepairAttrValue(doc, html, "xmlns", XHTML_NAMESPACE);

2512

}

2513

else if (xmlns)

2514

{

2515

RemoveAttribute(doc, html, xmlns);

2516

}

2517

}

2518

2519

/*

2520

...

2521

*/

2522

void FixAnchors(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)

2523

{

2524

Node* next;

2525

2526

while (node)

2527

{

2528

next = node->next;

2529

2530

if (IsAnchorElement(doc, node))

2531

{

2532

AttVal *name = AttrGetById(node, TidyAttr_NAME);

2533

AttVal *id = AttrGetById(node, TidyAttr_ID);

2534

2535

/* todo: how are empty name/id attributes handled? */

2536

2537

if (name && id)

2538

{

2539

Bool NameHasValue = AttrHasValue(name);

2540

Bool IdHasValue = AttrHasValue(id);

2541

if ( (NameHasValue != IdHasValue) ||

2542

(NameHasValue && IdHasValue &&

2543

tmbstrcmp(name->value, id->value) != 0 ) )

2544

ReportAttrError( doc, node, name, ID_NAME_MISMATCH);

2545

}

2546

else if (name && wantId)

2547

{

2548

if (NodeAttributeVersions( node, TidyAttr_ID )

2549

& doc->lexer->versionEmitted)

2550

{

2551

if (IsValidHTMLID(name->value))

2552

{

2553

RepairAttrValue(doc, node, "id", name->value);

2554

}

2555

else

2556

{

2557

ReportAttrError(doc, node, name, INVALID_XML_ID);

2558

}

2559

}

2560

}

2561

else if (id && wantName)

2562

{

2563

if (NodeAttributeVersions( node, TidyAttr_NAME )

2564

& doc->lexer->versionEmitted)

2565

/* todo: do not assume id is valid */

2566

RepairAttrValue(doc, node, "name", id->value);

2567

}

2568

2569

if (id && !wantId)

2570

RemoveAttribute(doc, node, id);

2571

2572

if (name && !wantName)

2573

RemoveAttribute(doc, node, name);

2574

2575

if (AttrGetById(node, TidyAttr_NAME) == NULL &&

2576

AttrGetById(node, TidyAttr_ID) == NULL)

2577

RemoveAnchorByNode(doc, node);

2578

}

2579

2580

if (node->content)

2581

FixAnchors(doc, node->content, wantName, wantId);

2582

2583

node = next;

2167

2584

}

2168

2585

}