~ubuntu-branches/ubuntu/saucy/libhtmlcleaner-java/saucy-proposed

/**
 * Creates a tokenizer over the given character source.
 * The reader is wrapped in a {@link BufferedReader}, and the boolean settings
 * used while tokenizing are copied from {@code props} once, at construction time.
 *
 * @param reader source of the characters to tokenize
 * @param props cleaner properties; the flags are read here and the object itself is kept
 * @param transformations tag transformations; tagStart/tagEnd check this against null before use
 * @param tagInfoProvider provider queried for tag information by tag name
 * @throws IOException declared by the signature
 */
public HtmlTokenizer(Reader reader, CleanerProperties props, CleanerTransformations transformations, ITagInfoProvider tagInfoProvider) throws IOException {
    this._reader = new BufferedReader(reader);
    this.props = props;

    // Snapshot of the flags consulted during tokenizing.
    this.isOmitUnknownTags = props.isOmitUnknownTags();
    this.isTreatUnknownTagsAsContent = props.isTreatUnknownTagsAsContent();
    this.isOmitDeprecatedTags = props.isOmitDeprecatedTags();
    this.isTreatDeprecatedTagsAsContent = props.isTreatDeprecatedTagsAsContent();
    this.isNamespacesAware = props.isNamespacesAware();
    this.isOmitComments = props.isOmitComments();
    this.isAllowMultiWordAttributes = props.isAllowMultiWordAttributes();
    this.isAllowHtmlInsideAttributes = props.isAllowHtmlInsideAttributes();

    this.transformations = transformations;
    this.tagInfoProvider = tagInfoProvider;
}

111

112

private void addToken(BaseToken token) {

113

_tokenList.add(token);

114

makeTree(_tokenList);

115

}

116

117

/**
 * Hook implemented by subclasses; addToken() calls it with the full token list
 * each time a token has been appended.
 *
 * @param tokenList the tokenizer's current token list
 */
abstract void makeTree(List<BaseToken> tokenList);

118

119

/**
 * Factory implemented by subclasses; tagStart() calls it to obtain the node for a
 * parsed start tag.
 *
 * @param name tag name as parsed (and possibly transformed); tagStart() may pass
 *             {@code null} when no identifier could be read
 * @return the tag node that tagStart() uses as the current tag token
 */
abstract TagNode createTagNode(String name);

120

121

private void readIfNeeded(int neededChars) throws IOException {

122

if (_len == -1 && _pos + neededChars >= WORKING_BUFFER_SIZE) {

123

int numToCopy = WORKING_BUFFER_SIZE - _pos;

124

System.arraycopy(_working, _pos, _working, 0, numToCopy);

125

_pos = 0;

126

127

int expected = WORKING_BUFFER_SIZE - numToCopy;

128

int size = 0;

129

int charsRead;

130

int offset = numToCopy;

131

do {

132

charsRead = _reader.read(_working, offset, expected);

133

if (charsRead >= 0) {

134

size += charsRead;

135

offset += charsRead;

136

expected -= charsRead;

137

}

138

} while (charsRead >= 0 && expected > 0);

139

140

if (expected > 0) {

141

_len = size + numToCopy;

142

}

143

144

// convert invalid XML characters to spaces

145

for (int i = 0; i < (_len >= 0 ? _len : WORKING_BUFFER_SIZE); i++) {

146

int ch = _working[i];

147

if (ch >= 1 && ch <= 32 && ch != 10 && ch != 13) {

148

_working[i] = ' ';

149

}

150

}

151

}

152

}

153

154

List<BaseToken> getTokenList() {

155

return this._tokenList;

156

}

157

158

private void go() throws IOException {

159

_pos++;

160

readIfNeeded(0);

161

}

162

163

private void go(int step) throws IOException {

164

_pos += step;

165

readIfNeeded(step - 1);

166

}

167

168

/**

169

* Checks if content starts with specified value at the current position.

170

* @param value

171

* @return true if starts with specified value, false otherwise.

172

* @throws IOException

173

174

private boolean startsWith(String value) throws IOException {

175

int valueLen = value.length();

176

readIfNeeded(valueLen);

177

if (_len >= 0 && _pos + valueLen > _len) {

178

return false;

179

}

180

181

for (int i = 0; i < valueLen; i++) {

182

char ch1 = Character.toLowerCase( value.charAt(i) );

183

char ch2 = Character.toLowerCase( _working[_pos + i] );

184

if (ch1 != ch2) {

185

return false;

186

}

187

}

188

189

return true;

190

}

191

192

private boolean startsWithSimple(String value) throws IOException {

193

int valueLen = value.length();

194

readIfNeeded(valueLen);

195

if (_len >= 0 && _pos + valueLen > _len) {

196

return false;

197

}

198

199

for (int i = 0; i < valueLen; i++) {

200

if (value.charAt(i) != _working[_pos + i]) {

201

return false;

202

}

203

}

204

205

return true;

206

}

207

208

/**

209

* Checks if character at specified position is whitespace.

210

* @param position

211

* @return true is whitespace, false otherwise.

212

213

private boolean isWhitespace(int position) {

214

if (_len >= 0 && position >= _len) {

215

return false;

216

}

217

218

return Character.isWhitespace( _working[position] );

219

}

220

221

/**

222

* Checks if character at current runtime position is whitespace.

223

* @return true is whitespace, false otherwise.

224

225

private boolean isWhitespace() {

226

return isWhitespace(_pos);

227

}

228

229

private boolean isWhitespaceSafe() {

230

return Character.isWhitespace( _working[_pos] );

231

}

232

233

/**

234

* Checks if character at specified position is equal to specified char.

235

* @param position

236

* @param ch

237

* @return true is equals, false otherwise.

238

239

private boolean isChar(int position, char ch) {

240

if (_len >= 0 && position >= _len) {

241

return false;

242

}

243

244

return Character.toLowerCase(ch) == Character.toLowerCase(_working[position]);

245

}

246

247

/**

248

* Checks if character at current runtime position is equal to specified char.

249

* @param ch

250

* @return true is equal, false otherwise.

251

252

private boolean isChar(char ch) {

253

return isChar(_pos, ch);

254

}

255

256

private boolean isCharSimple(char ch) {

257

return (_len < 0 || _pos < _len) && (ch == _working[_pos]);

258

}

259

260

/**

261

* @return Current character to be read, but first it must be checked if it exists.

262

* This method is made for performance reasons to be used instead of isChar(...).

263

264

private char getCurrentChar() {

265

return _working[_pos];

266

}

267

268

private boolean isCharEquals(char ch) {

269

return _working[_pos] == ch;

270

}

271

272

/**

273

* Checks if character at specified position can be identifier start.

274

* @param position

275

* @return true is may be identifier start, false otherwise.

276

277

private boolean isIdentifierStartChar(int position) {

278

if (_len >= 0 && position >= _len) {

279

return false;

280

}

281

282

char ch = _working[position];

283

return Character.isUnicodeIdentifierStart(ch) || ch == '_';

284

}

285

286

/**

287

* Checks if character at current runtime position can be identifier start.

288

* @return true is may be identifier start, false otherwise.

289

290

private boolean isIdentifierStartChar() {

291

return isIdentifierStartChar(_pos);

292

}

293

294

/**

295

* Checks if character at current runtime position can be identifier part.

296

* @return true is may be identifier part, false otherwise.

297

298

private boolean isIdentifierChar() {

299

if (_len >= 0 && _pos >= _len) {

300

return false;

301

}

302

303

char ch = _working[_pos];

304

return Character.isUnicodeIdentifierStart(ch) || Character.isDigit(ch) || Utils.isIdentifierHelperChar(ch);

305

}

306

307

private boolean isValidXmlChar() {

308

return isAllRead() || Utils.isValidXmlChar(_working[_pos]);

309

}

310

311

private boolean isValidXmlCharSafe() {

312

return Utils.isValidXmlChar(_working[_pos]);

313

}

314

315

/**

316

* Checks if end of the content is reached.

317

318

private boolean isAllRead() {

319

return _len >= 0 && _pos >= _len;

320

}

321

322

/**

323

* Saves specified character to the temporary buffer.

324

* @param ch

325

326

private void save(char ch) {

327

if (_savedLen >= _saved.length) {

328

char newSaved[] = new char[_saved.length + 512];

329

System.arraycopy(_saved, 0, newSaved, 0, _saved.length);

330

_saved = newSaved;

331

}

332

_saved[_savedLen++] = ch;

333

}

334

335

/**

336

* Saves character at current runtime position to the temporary buffer.

337

338

private void saveCurrent() {

339

if (!isAllRead()) {

340

save( _working[_pos] );

341

}

342

}

343

344

private void saveCurrentSafe() {

345

save( _working[_pos] );

346

}

347

348

/**

349

* Saves specified number of characters at current runtime position to the temporary buffer.

350

* @throws IOException

351

352

private void saveCurrent(int size) throws IOException {

353

readIfNeeded(size);

354

int pos = _pos;

355

while ( !isAllRead() && (size > 0) ) {

356

save( _working[pos] );

357

pos++;

358

size--;

359

}

360

}

361

362

/**

363

* Skips whitespaces at current position and moves foreward until

364

* non-whitespace character is found or the end of content is reached.

365

* @throws IOException

366

367

private void skipWhitespaces() throws IOException {

368

while ( !isAllRead() && isWhitespaceSafe() ) {

369

saveCurrentSafe();

370

go();

371

}

372

}

373

374

private boolean addSavedAsContent() {

375

if (_savedLen > 0) {

376

addToken(new ContentNode(_saved, _savedLen));

377

_savedLen = 0;

378

return true;

379

}

380

381

return false;

382

}

383

384

/**

385

* Starts parsing HTML.

386

* @throws IOException

387

388

void start() throws IOException {

389

// initialize runtime values

390

_currentTagToken = null;

391

_tokenList.clear();

392

_asExpected = true;

393

_isScriptContext = false;

394

395

boolean isLateForDoctype = false;

396

397

this._pos = WORKING_BUFFER_SIZE;

398

readIfNeeded(0);

399

400

boolean isScriptEmpty = true;

401

402

while ( !isAllRead() ) {

403

// resets all the runtime values

404

_savedLen = 0;

405

_currentTagToken = null;

406

_asExpected = true;

407

408

// this is enough for making decision

409

readIfNeeded(10);

410

411

if (_isScriptContext) {

412

if ( startsWith("</script") && (isWhitespace(_pos + 8) || isChar(_pos + 8, '>')) ) {

413

tagEnd();

414

} else if ( isScriptEmpty && startsWithSimple("<!--") ) {

415

comment();

416

} else {

417

boolean isTokenAdded = content();

418

if (isScriptEmpty && isTokenAdded) {

419

final BaseToken lastToken = _tokenList.get(_tokenList.size() - 1);

420

if (lastToken != null) {

421

final String lastTokenAsString = lastToken.toString();

422

if (lastTokenAsString != null && lastTokenAsString.trim().length() > 0) {

423

isScriptEmpty = false;

424

}

425

}

426

}

427

}

428

if (!_isScriptContext) {

429

isScriptEmpty = true;

430

}

431

} else {

432

if ( startsWith("<!doctype") ) {

433

if ( !isLateForDoctype ) {

434

doctype();

435

isLateForDoctype = true;

436

} else {

437

ignoreUntil('<');

438

}

439

} else if ( startsWithSimple("</") && isIdentifierStartChar(_pos + 2) ) {

440

isLateForDoctype = true;

441

tagEnd();

442

} else if ( startsWithSimple("<!--") ) {

443

comment();

444

} else if ( startsWithSimple("<") && isIdentifierStartChar(_pos + 1) ) {

445

isLateForDoctype = true;

446

tagStart();

447

} else if ( props.isIgnoreQuestAndExclam() && (startsWithSimple("<!") || startsWithSimple("<?")) ) {

448

ignoreUntil('>');

449

if (isCharSimple('>')) {

450

go();

451

}

452

} else {

453

content();

454

}

455

}

456

}

457

458

_reader.close();

459

}

460

461

/**

462

* Checks if specified tag name is one of the reserved tags: HTML, HEAD or BODY

463

* @param tagName

464

* @return

465

466

private boolean isReservedTag(String tagName) {

467

tagName = tagName.toLowerCase();

468

return "html".equals(tagName) || "head".equals(tagName) || "body".equals(tagName);

469

}

470

471

/**

472

* Parses start of the tag.

473

* It expects that current position is at the "<" after which

474

* the tag's name follows.

475

* @throws IOException

476

477

private void tagStart() throws IOException {

478

saveCurrent();

479

go();

480

481

if ( isAllRead() ) {

482

return;

483

}

484

485

String tagName = identifier();

486

487

TagTransformation tagTransformation = null;

488

if (transformations != null && transformations.hasTransformationForTag(tagName)) {

489

tagTransformation = transformations.getTransformation(tagName);

490

if (tagTransformation != null) {

491

tagName = tagTransformation.getDestTag();

492

}

493

}

494

495

if (tagName != null) {

496

TagInfo tagInfo = tagInfoProvider.getTagInfo(tagName);

497

if ( (tagInfo == null && !isOmitUnknownTags && isTreatUnknownTagsAsContent && !isReservedTag(tagName)) ||

498

(tagInfo != null && tagInfo.isDeprecated() && !isOmitDeprecatedTags && isTreatDeprecatedTagsAsContent) ) {

499

content();

500

return;

501

}

502

}

503

504

TagNode tagNode = createTagNode(tagName);

505

_currentTagToken = tagNode;

506

507

if (_asExpected) {

508

skipWhitespaces();

509

tagAttributes();

510

511

if (tagName != null) {

512

if (tagTransformation != null) {

513

tagNode.transformAttributes(tagTransformation);

514

}

515

addToken(_currentTagToken);

516

}

517

518

if ( isCharSimple('>') ) {

519

go();

520

if ( "script".equalsIgnoreCase(tagName) ) {

521

_isScriptContext = true;

522

}

523

} else if ( startsWithSimple("/>") ) {

524

go(2);

525

if ( "script".equalsIgnoreCase(tagName) ) {

526

addToken( new EndTagToken(tagName) );

527

}

528

}

529

530

_currentTagToken = null;

531

} else {

532

addSavedAsContent();

533

}

534

}

535

536

537

/**

538

* Parses end of the tag.

539

* It expects that current position is at the "<" after which

540

* "/" and the tag's name follows.

541

* @throws IOException

542

543

private void tagEnd() throws IOException {

544

saveCurrent(2);

545

go(2);

546

547

if ( isAllRead() ) {

548

return;

549

}

550

551

String tagName = identifier();

552

if (transformations != null && transformations.hasTransformationForTag(tagName)) {

553

TagTransformation tagTransformation = transformations.getTransformation(tagName);

554

if (tagTransformation != null) {

555

tagName = tagTransformation.getDestTag();

556

}

557

}

558

559

if (tagName != null) {

560

TagInfo tagInfo = tagInfoProvider.getTagInfo(tagName);

561

if ( (tagInfo == null && !isOmitUnknownTags && isTreatUnknownTagsAsContent && !isReservedTag(tagName)) ||

562

(tagInfo != null && tagInfo.isDeprecated() && !isOmitDeprecatedTags && isTreatDeprecatedTagsAsContent) ) {

563

content();

564

return;

565

}

566

}

567

568

_currentTagToken = new EndTagToken(tagName);

569

570

if (_asExpected) {

571

skipWhitespaces();

572

tagAttributes();

573

574

if (tagName != null) {

575

addToken(_currentTagToken);

576

}

577

578

if ( isCharSimple('>') ) {

579

go();

580

}

581

582

if ( "script".equalsIgnoreCase(tagName) ) {

583

_isScriptContext = false;

584

}

585

586

_currentTagToken = null;

587

} else {

588

addSavedAsContent();

589

}

590

}

591

592

/**

593

* Parses an identifier from the current position.

594

* @throws IOException

595

596

private String identifier() throws IOException {

597

_asExpected = true;

598

599

if ( !isIdentifierStartChar() ) {

600

_asExpected = false;

601

return null;

602

}

603

604

commonStr.delete(0, commonStr.length());

605

606

while ( !isAllRead() && isIdentifierChar() ) {

607

saveCurrentSafe();

608

commonStr.append( _working[_pos] );

609

go();

610

}

611

612

// strip invalid characters from the end

613

while ( commonStr.length() > 0 && Utils.isIdentifierHelperChar(commonStr.charAt(commonStr.length() - 1)) ) {

614

commonStr.deleteCharAt( commonStr.length() - 1 );

615

}

616

617

if ( commonStr.length() == 0 ) {

618

return null;

619

}

620

621

String id = commonStr.toString();

622

623

int columnIndex = id.indexOf(':');

624

if (columnIndex >= 0) {

625

String prefix = id.substring(0, columnIndex);

626

String suffix = id.substring(columnIndex + 1);

627

int nextColumnIndex = suffix.indexOf(':');

628

if (nextColumnIndex >= 0) {

629

suffix = suffix.substring(0, nextColumnIndex);

630

}

631

id = isNamespacesAware ? (prefix + ":" + suffix) : suffix;

632

}

633

634

return id;

635

}

636

637

/**

638

* Parses list tag attributes from the current position.

639

* @throws IOException

640

641

private void tagAttributes() throws IOException {

642

while( !isAllRead() && _asExpected && !isCharSimple('>') && !startsWithSimple("/>") ) {

643

skipWhitespaces();

644

String attName = identifier();

645

646

if (!_asExpected) {

647

if ( !isCharSimple('<') && !isCharSimple('>') && !startsWithSimple("/>") ) {

648

if (isValidXmlChar()) {

649

saveCurrent();

650

}

651

go();

652

}

653

654

if (!isCharSimple('<')) {

655

_asExpected = true;

656

}

657

658

continue;

659

}

660

661

String attValue;

662

663

skipWhitespaces();

664

if ( isCharSimple('=') ) {

665

saveCurrentSafe();

666

go();

667

attValue = attributeValue();

668

} else if (CleanerProperties.BOOL_ATT_EMPTY.equals(props.booleanAttributeValues)) {

669

attValue = "";

670

} else if (CleanerProperties.BOOL_ATT_TRUE.equals(props.booleanAttributeValues)) {

671

attValue = "true";

672

} else {

673

attValue = attName;

674

}

675

676

if (_asExpected) {

677

_currentTagToken.setAttribute(attName, attValue);

678

}

679

}

680

}

681

682

/**

683

* Parses a single tag attribute - it is expected to be in one of the forms:

684

* name=value

685

* name="value"

686

* name='value'

687

* name

688

* @throws IOException

689

690

private String attributeValue() throws IOException {

691

skipWhitespaces();

692

693

if ( isCharSimple('<') || isCharSimple('>') || startsWithSimple("/>") ) {

694

return "";

695

}

696

697

boolean isQuoteMode = false;

698

boolean isAposMode = false;

699

700

commonStr.delete(0, commonStr.length());

701

702

if ( isCharSimple('\'') ) {

703

isAposMode = true;

704

saveCurrentSafe();

705

go();

706

} else if ( isCharSimple('\"') ) {

707

isQuoteMode = true;

708

saveCurrentSafe();

709

go();

710

}

711

712

while ( !isAllRead() &&

713

( ((isAposMode && !isCharEquals('\'') || isQuoteMode && !isCharEquals('\"')) && (isAllowHtmlInsideAttributes || !isCharEquals('>') && !isCharEquals('<')) && (isAllowMultiWordAttributes || !isWhitespaceSafe())) ||

714

(!isAposMode && !isQuoteMode && !isWhitespaceSafe() && !isCharEquals('>') && !isCharEquals('<'))

715

)

716

) {

717

if (isValidXmlCharSafe()) {

718

commonStr.append( _working[_pos] );

719

saveCurrentSafe();

720

}

721

go();

722

}

723

724

if ( isCharSimple('\'') && isAposMode ) {

725

saveCurrentSafe();

726

go();

727

} else if ( isCharSimple('\"') && isQuoteMode ) {

728

saveCurrentSafe();

729

go();

730

}

731

732

733

return commonStr.toString();

734

}

735

736

private boolean content() throws IOException {

737

while ( !isAllRead() ) {

738

if (isValidXmlCharSafe()) {

739

saveCurrentSafe();

740

}

741

go();

742

743

if ( isCharSimple('<') ) {

744

break;

745

}

746

}

747

748

return addSavedAsContent();

749

}

750

751

private void ignoreUntil(char ch) throws IOException {

752

while ( !isAllRead() ) {

753

go();

754

if ( isChar(ch) ) {

755

break;

756

}

757

}

758

}

759

760

private void comment() throws IOException {

761

go(4);

762

while ( !isAllRead() && !startsWithSimple("-->") ) {

763

if (isValidXmlCharSafe()) {

764

saveCurrentSafe();

765

}

766

go();

767

}

768

769

if (startsWithSimple("-->")) {

770

go(3);

771

}

772

773

if (_savedLen > 0) {

774

if (!isOmitComments) {

775

String hyphenRepl = props.getHyphenReplacementInComment();

776

String comment = new String(_saved, 0, _savedLen).replaceAll("--", hyphenRepl + hyphenRepl);

777

778

if ( comment.length() > 0 && comment.charAt(0) == '-' ) {

779

comment = hyphenRepl + comment.substring(1);

780

}

781

int len = comment.length();

782

if ( len > 0 && comment.charAt(len - 1) == '-' ) {

783

comment = comment.substring(0, len - 1) + hyphenRepl;

784

}

785

786

addToken( new CommentNode(comment) );

787

}

788

_savedLen = 0;

789

}

790

}

791

792

private void doctype() throws IOException {

793

go(9);

794

795

skipWhitespaces();

796

String part1 = identifier();

797

skipWhitespaces();

798

String part2 = identifier();

799

skipWhitespaces();

800

String part3 = attributeValue();

801

skipWhitespaces();

802

String part4 = attributeValue();

803

804

ignoreUntil('<');

805

806

_docType = new DoctypeToken(part1, part2, part3, part4);

807

}

808

809

public DoctypeToken getDocType() {

810

return _docType;

811

}

812

813

}

Older »