~ubuntu-branches/ubuntu/hoary/libextractor/hoary

« back to all changes in this revision

Viewing changes to src/plugins/htmlextractor.c

Committer: Bazaar Package Importer
Author(s): Glenn McGrath
Date: 2004-06-26 12:59:02 UTC
Revision ID: james.westby@ubuntu.com-20040626125902-w97jpn43hsk7tcde

Tags: upstream-0.3.3

Import upstream version 0.3.3

files added:

AUTHORS

COPYING

ChangeLog

INSTALL

Makefile.am

Makefile.in

NEWS

PLATFORMS

README

TODO

acinclude.m4

aclocal.m4

compile

config.guess

config.sub

configure

configure.ac

depcomp

doc/Makefile.am

doc/Makefile.in

doc/extract.1

doc/libextractor.3

install-sh

libltdl

libltdl/COPYING.LIB

libltdl/Makefile.am

libltdl/Makefile.in

libltdl/README

libltdl/acinclude.m4

libltdl/aclocal.m4

libltdl/config-h.in

libltdl/config.guess

libltdl/config.sub

libltdl/configure

libltdl/configure.ac

libltdl/install-sh

libltdl/ltdl.c

libltdl/ltdl.h

libltdl/ltmain.sh

libltdl/missing

libltdl/mkinstalldirs

ltmain.sh

missing

mkinstalldirs

src/Makefile.am

src/Makefile.in

src/include

src/include/Makefile.am

src/include/Makefile.in

src/include/config.h.in

src/include/extractor.h

src/include/extractor_util.h

src/main

src/main/Makefile.am

src/main/Makefile.in

src/main/extract.c

src/main/extractor.c

src/main/getopt.c

src/main/getopt.h

src/main/getopt1.c

src/main/io.c

src/main/org_ovmj_libextractor_Extractor.h

src/org

src/org/ovmj

src/org/ovmj/libextractor

src/org/ovmj/libextractor/Extractor.java

src/org/ovmj/libextractor/Xtract.java

src/plugins

src/plugins/Makefile.am

src/plugins/Makefile.in

src/plugins/asfextractor.c

src/plugins/elfextractor.c

src/plugins/filenameextractor.c

src/plugins/gifextractor.c

src/plugins/htmlextractor.c

src/plugins/jpegextractor.c

src/plugins/lowerextractor.c

src/plugins/mimeextractor.c

src/plugins/mp3extractor.c

src/plugins/mpegextractor.c

src/plugins/oggextractor.c

src/plugins/pack.c

src/plugins/pack.h

src/plugins/pdf

src/plugins/pdf/Array.cc

src/plugins/pdf/Array.h

src/plugins/pdf/COPYING

src/plugins/pdf/Catalog.cc

src/plugins/pdf/Catalog.h

src/plugins/pdf/CompactFontInfo.h

src/plugins/pdf/Decrypt.cc

src/plugins/pdf/Decrypt.h

src/plugins/pdf/Dict.cc

src/plugins/pdf/Dict.h

src/plugins/pdf/Error.cc

src/plugins/pdf/Error.h

src/plugins/pdf/FontEncoding.cc

src/plugins/pdf/FontEncoding.h

src/plugins/pdf/FontFile.cc

src/plugins/pdf/FontFile.h

src/plugins/pdf/FontInfo.h

src/plugins/pdf/FormWidget.cc

src/plugins/pdf/FormWidget.h

src/plugins/pdf/Function.cc

src/plugins/pdf/Function.h

src/plugins/pdf/GString.cc

src/plugins/pdf/GString.h

src/plugins/pdf/Gfx.cc

src/plugins/pdf/Gfx.h

src/plugins/pdf/GfxFont.cc

src/plugins/pdf/GfxFont.h

src/plugins/pdf/GfxState.cc

src/plugins/pdf/GfxState.h

src/plugins/pdf/Lexer.cc

src/plugins/pdf/Lexer.h

src/plugins/pdf/Link.cc

src/plugins/pdf/Link.h

src/plugins/pdf/Makefile.am

src/plugins/pdf/Makefile.in

src/plugins/pdf/Object.cc

src/plugins/pdf/Object.h

src/plugins/pdf/OutputDev.cc

src/plugins/pdf/OutputDev.h

src/plugins/pdf/PDFDoc.cc

src/plugins/pdf/PDFDoc.h

src/plugins/pdf/Page.cc

src/plugins/pdf/Page.h

src/plugins/pdf/Params.cc

src/plugins/pdf/Params.h

src/plugins/pdf/Parser.cc

src/plugins/pdf/Parser.h

src/plugins/pdf/StdFontInfo.h

src/plugins/pdf/Stream-CCITT.h

src/plugins/pdf/Stream.cc

src/plugins/pdf/Stream.h

src/plugins/pdf/XRef.cc

src/plugins/pdf/XRef.h

src/plugins/pdf/config.h

src/plugins/pdf/darwin.lt.rb

src/plugins/pdf/gfile.cc

src/plugins/pdf/gfile.h

src/plugins/pdf/gmem.cc

src/plugins/pdf/gmem.h

src/plugins/pdf/gmempp.cc

src/plugins/pdf/gtypes.h

src/plugins/pdf/parseargs.cc

src/plugins/pdf/parseargs.h

src/plugins/pdf/pdfextractor.cc

src/plugins/pdf/pdfinfo.cc

src/plugins/pngextractor.c

src/plugins/printable

src/plugins/printable/Makefile.am

src/plugins/printable/Makefile.in

src/plugins/printable/bloomfilter.c

src/plugins/printable/bloomfilter.h

src/plugins/printable/da.txt

src/plugins/printable/de.txt

src/plugins/printable/dictionary-builder.c

src/plugins/printable/en.txt

src/plugins/printable/es.txt

src/plugins/printable/it.txt

src/plugins/printable/no.txt

src/plugins/printable/printableextractor.c

src/plugins/printable/sha1.c

src/plugins/printable/sha1.h

src/plugins/psextractor.c

src/plugins/qtextractor.c

src/plugins/realextractor.c

src/plugins/riffextractor.c

src/plugins/rpm

src/plugins/rpm/Makefile.am

src/plugins/rpm/Makefile.in

src/plugins/rpm/rpmextractor.c

src/plugins/splitextractor.c

src/plugins/tiffextractor.c

src/plugins/wavextractor.c

src/plugins/zipextractor.c

src/test

src/test/Makefile.am

src/test/Makefile.in

src/test/keywordlisttest.c

src/test/multiload.c

src/test/plugintest.c

src/test/trivialtest.c

Show diffs side-by-side

added added

removed removed

src/plugins/htmlextractor.c

/*
     This file is part of libextractor.

libextractor is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published

by the Free Software Foundation; either version 2, or (at your

option) any later version.

libextractor is distributed in the hope that it will be useful, but

WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

General Public License for more details.

You should have received a copy of the GNU General Public License

along with libextractor; see the file COPYING. If not, write to the

Free Software Foundation, Inc., 59 Temple Place - Suite 330,

Boston, MA 02111-1307, USA.

Portions of this code were adapted from libhtmlparse by

Mooneer Salem (mooneer@translator.cs). The main changes

to libhtmlparse were the removal of globals to make the

code reentrant.

**/

#include "extractor.h"

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>

/* Set to 1 to also compile the stand-alone main() test driver that is
   guarded by "#if HAVE_MAIN" at the end of this file. */
#define HAVE_MAIN 0

/* One attribute ("argument") of a tag: its name and its value.
   The parser stores NULL in val for attributes given without a value. */
struct ArgvTable {
  char *arg;  /* attribute name  */
  char *val;  /* attribute value */
};

/**

* libhtmlparse has the callbacks defined as globals,

* which is bad for making libextractor re-entrant.

* We now put them all in one big table that is passed

* around inside the parser.

* The CallBacks

* You may call one ore several or even all callbacks. Except of the

* XHTMLCallBack, all CallBacks will work as expected and described

* XHTMLCallBack:

* The XHTMLCallBack is a special case, because you can decide, if the

* XHTML specific tags should be handeled as a start- AND endtag, or

* as an XHTML tag. If you call nothing, except start and endtag, the

* behaviour is, that you'll get a start AND an endtag called back.

* If you call XHTMLCallBack, it will only give you the XHTML call back.

* If you are in doubt or simply confused now, call XHTMLCallBack()

typedef struct PC_ {

/* handle comments and javascript */

int (*commentCallBack) (char *comment, struct PC_ * pc);

int (*commentStartCallBack) (struct PC_ * pc);

int (*commentEndCallBack) (struct PC_ * pc);

/* Declaration e.g. <!DOCTYPE HTML ... */

int (*declCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);

/* Start tag e.g. <html>, with arguments, args may be NULL, numargs may be 0 */

int (*startCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);

/* End tag e.g. </html>*/

int (*endCallBack) (char *tag, struct PC_ * pc);

/* handle plain text */

int (*textCallBack) (char *text, struct PC_ * pc);

int (*textStartCallBack) (struct PC_ * pc);

int (*textEndCallBack) (struct PC_ * pc);

/* PHP inserts. BUG(?): if someone prints another PHP function from this PHP function

our lib will get confused. */

int (*phpCallBack) (char *text, struct PC_ * pc);

/* empty tags like <hr/>, <br/>, with arguments, args may be NULL, numargs may be 0 */

int (*XHTMLCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);

/* XML tags <?xml>, with arguments, args may be NULL, numargs may be 0 */

int (*xmlCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);

/* entities like ä,ä text will inherit all chars between '&' and ';' */

int (*entityCallBack) (char *text, struct PC_ * pc);

/* and we also put some formaly static variables in this */

/* needed to pass text in <script> tags verbatim */

unsigned int lhtml_script_passthru;

100

101

102

int numArgs;

103

104

int numArgsStatus;

105

106

/**

107

* 0: ignore, 1: add keyword

108

109

int nextTextAction;

110

111

/**

112

* If nextTextAction == 1, this gives the type of the

113

* keyword.

114

115

EXTRACTOR_KeywordType nextKeywordType;

116

117

/**

118

* Result of the current pass.

119

120

struct EXTRACTOR_Keywords * result;

121

122

} ParserContext;

123

124

125

/**********************************************************************/

126

127

128

/* argument caching (e.g width="80%") */

129

static struct ArgvTable *addArgToTable(struct ArgvTable *args, char *arg, char *val,

130

struct PC_ * pc) {

131

pc->numArgs++;

132

if (args == NULL) {

133

args = (struct ArgvTable*) calloc(1, sizeof(struct ArgvTable)*(pc->numArgs+1));

134

} else {

135

args = (struct ArgvTable*) realloc(args, sizeof(struct ArgvTable)*(pc->numArgs+1));

136

}

137

if (args == NULL) {

138

fprintf(stderr,

139

"FATAL: could not allocate: %s at %s:%d\n",

140

strerror(errno),

141

__FILE__, __LINE__);

142

exit(EXIT_FAILURE);

143

}

144

args[pc->numArgs-1].arg = arg;

145

args[pc->numArgs-1].val = val;

146

return args;

147

}

148

149

/* clean up memory */

150

static void freeArgs (struct ArgvTable *args,

151

struct PC_ * pc) {

152

int i;

153

154

if (args != NULL) {

155

for(i=0; i<pc->numArgs; i++) {

156

free(args[i].arg);

157

free(args[i].val);

158

}

159

free(args);

160

args=NULL;

161

pc->numArgs=0;

162

}

163

}

164

165

/* prototype */

166

static const char *parseForEntities(const char *, struct PC_ * pc);

167

168

169

static const char *parseText(const char *html, struct PC_ * pc) {

170

char *tmp;

171

const char *tmp2;

172

int ret=0;

173

174

while( (*html != '\0') && isspace((int) *html)) html++;

175

176

if (*html == '<') return html;

177

178

tmp2 = html;

179

while ( (*html != '\0') && (*html != '<') ) html++;

180

181

tmp = (char *)calloc(1, (size_t)(html-tmp2+1));

182

if (!tmp) return "";

183

184

memcpy(tmp, tmp2, (size_t)(html-tmp2));

185

186

if (strlen(tmp) > 0) {

187

if (pc->textStartCallBack) {

188

ret = pc->textStartCallBack(pc);

189

if (ret != 0) {

190

free(tmp);

191

return "";

192

}

193

}

194

if (pc->textCallBack) {

195

if (pc->entityCallBack){ /* that is textCallBack(text)

196

with entityCallBack(entity) as an extrabonus */

197

/*printf("entity is here\n");*/

198

parseForEntities(tmp, pc);

199

} else{

200

ret = pc->textCallBack(tmp, pc);

201

if (ret != 0) {

202

free(tmp);

203

return "";

204

}

205

}

206

}

207

if (pc->textEndCallBack) {

208

ret = pc->textEndCallBack(pc);

209

if (ret != 0) {

210

free(tmp);

211

return "";

212

}

213

}

214

}

215

free(tmp);

216

if (*html != '\0')

217

if (*(html+1) == '>') html += 2;

218

return html;

219

}

220

221

static const char *parseComment (const char *html, struct PC_ * pc) {

222

char *tmp;

223

const char *tmp2;

224

int ret=0;

225

226

while ( (*html == '-') || isspace((int)*html))html++;

227

228

tmp2 = html;

229

while ( (*html != '\0') && !(*html == '-' && *(html+1) == '-' && *(html+2) == '>')) html++;

230

231

tmp = (char *)calloc(1, (size_t)(html-tmp2+1));

232

if (!tmp) return "";

233

234

memcpy(tmp, tmp2, (size_t)(html-tmp2));

235

236

if (*(html+3) != '\0') html += 3;

237

238

if (pc->commentStartCallBack) {

239

ret = pc->commentStartCallBack(pc);

240

if (ret != 0) {

241

free(tmp);

242

return "";

243

}

244

}

245

if (pc->commentCallBack) {

246

ret = pc->commentCallBack(tmp, pc);

247

if (ret != 0) {

248

free(tmp);

249

return "";

250

}

251

}

252

if (pc->commentEndCallBack) {

253

ret = pc->commentEndCallBack(pc);

254

if (ret != 0) {

255

free(tmp);

256

return "";

257

}

258

}

259

free(tmp);

260

return html;

261

}

262

263

static const char *parseEndTag(const char *html, struct PC_ * pc) {

264

char *tmp;

265

const char *tmp2;

266

int ret=0;

267

268

if (*html == '\0')

269

return html;

270

271

html++;

272

tmp2 = html;

273

while(*html != '\0' && *html != '>') html++;

274

275

tmp =(char *) calloc(1, (size_t)(html-tmp2+1));

276

if (!tmp) return "";

277

278

memcpy(tmp, tmp2, (size_t)(html-tmp2));

279

280

if (pc->endCallBack) {

281

ret = pc->endCallBack(tmp,pc);

282

if (ret != 0) {

283

free(tmp);

284

return "";

285

}

286

}

287

if (*html == '>') html++;

288

free(tmp);

289

return html;

290

}

291

292

static const char *parsePHP(const char *html, struct PC_ * pc) {

293

const char *tmp;

294

char *tmp2;

295

int ret=0;

296

297

html += 4;

298

while(*html != '\0' && isspace((int)*html)) html++;

299

300

tmp = html;

301

302

while(*html != '\0' && !(*html == '?' && *(html+1) == '>')) html++;

303

tmp2 = (char *)calloc(1, (size_t)(html-tmp+1));

304

if (!tmp2) return "";

305

306

memcpy(tmp2, tmp, (size_t)(html-tmp));

307

308

if (pc->phpCallBack) {

309

ret = pc->phpCallBack(tmp2, pc);

310

if (ret != 0) {

311

free(tmp2);

312

return "";

313

}

314

}

315

free(tmp2);

316

html += 2;

317

return html;

318

}

319

320

/* parse the XML tag itself */

321

static const char *parseXMLtag(const char *html, struct PC_ * pc) {

322

char *tag, *name, *value;

323

const char *tmp;

324

int ret;

325

struct ArgvTable *tmp2 = NULL;

326

327

pc->numArgs = 0;

328

tmp = html;

329

while(*html != '\0' && !isspace((int)*html) && *html != '>') html++;

330

331

/* you may want to upper/lower tags, so I leave the tag itself untouched */

332

tag = (char *)calloc(1, (size_t)(html-tmp+1));

333

if (!tag) {

334

return "";

335

}

336

memcpy(tag, tmp, (size_t)(html-tmp));

337

338

if (*html == '>') {

339

if (pc->xmlCallBack != NULL) {

340

ret = pc->xmlCallBack(tag, NULL, 0, pc);

341

free(tag);

342

if (*html == '>') html++;

343

return((ret != 0) ? (char *) "" : html);

344

}

345

}

346

while(*html != '\0' && isspace((int)*html)) html++;

347

348

while(*html != '\0' && *html != '>' ) {

349

while(isspace((int)*html)) html++;

350

if (*html == '>') break;

351

352

tmp = html;

353

while(*html != '\0' && !isspace((int)*html) && *html != '=' && *html != '>') html++;

354

name = (char *)calloc(1, (size_t)(html-tmp+1));

355

if (!name) {

356

free(tag);

357

tag = NULL;

358

return "";

359

}

360

memcpy(name, tmp, (size_t)(html-tmp));

361

if (isspace((int)*html)) {

362

tmp2 = addArgToTable(tmp2, name, NULL, pc);

363

while(*html != '\0' && isspace((int)*html) && *html != '>') html++;

364

}

365

if (*html == '>') {

366

tmp2 = addArgToTable(tmp2, name, NULL, pc);

367

html++;

368

break;

369

}

370

if (*html == '=') html++;

371

if (*html != '"' && *html != '\'') {

372

tmp = html;

373

while(*html != '\0' && *html != '>' && !isspace((int)*html)) html++;

374

value = (char *)calloc(1, (size_t)(html-tmp+1));

375

if (!value) {

376

free(name);

377

name = NULL;

378

free(tag);

379

tag = NULL;

380

381

if (tmp2 != NULL) {

382

freeArgs(tmp2, pc);

383

tmp2 = NULL;

384

}

385

return "";

386

}

387

memcpy(value, tmp, (size_t)(html-tmp));

388

tmp2 = addArgToTable(tmp2, name, value, pc);

389

} else if (*html == '"') {

390

html++;

391

tmp = html;

392

while(*html != '\0' && !(*html == '"' && *(html-1) != '\\')) html++;

393

value = (char *) calloc(1, (size_t)(html-tmp+1));

394

if (!value) {

395

free(name);

396

name = NULL;

397

free(tag);

398

tag = NULL;

399

400

if (tmp2 != NULL) {

401

freeArgs(tmp2, pc);

402

tmp2 = NULL;

403

}

404

return "";

405

}

406

memcpy(value, tmp, (size_t)(html-tmp));

407

if (*html != '\0')

408

html++;

409

tmp2 = addArgToTable(tmp2, name, value, pc);

410

} else if (*html == '\'') {

411

html++;

412

tmp = html;

413

while(*html != '\0' && !(*html == '\'' && *(html-1) != '\\')) html++;

414

415

value = (char *)calloc(1, (size_t)(html-tmp+1));

416

if (!value) {

417

free(name);

418

name = NULL;

419

free(tag);

420

tag = NULL;

421

if (tmp2 != NULL) {

422

freeArgs(tmp2, pc);

423

tmp2 = NULL;

424

}

425

return "";

426

}

427

memcpy(value, tmp, (size_t)(html-tmp));

428

if (*html != '\0')

429

html++;

430

tmp2 = addArgToTable(tmp2, name, value, pc);

431

}

432

tmp = NULL;

433

value = NULL;

434

name = NULL;

435

}

436

if (*html != '\0') html++;

437

ret = pc->xmlCallBack(tag, tmp2, pc->numArgs, pc);

438

if (tmp2 != NULL) {

439

freeArgs(tmp2, pc);

440

tmp2 = NULL;

441

}

442

free(tag);

443

tag = NULL;

444

pc->numArgsStatus=0;

445

return (ret != 0 ? "" : html);

446

}

447

448

/* cannibalistic function, munches the actual tag: returns the position
   just behind the next '>', or the end of the string if there is none */
static const char *eatUp(const char *html){
  for (; *html != '\0'; html++) {
    if (*html == '>')
      return html + 1;
  }
  return html;
}

458

459

/* cannibalistic function, munches the actual text: returns the position
   of the next '<', or the end of the string if there is none */
static const char *eatUpText(const char *html){
  return html + strcspn(html, "<");
}

465

466

467

/* decides, if a found '?' leads to PHP or XML if requisited

468

otherwise it gormandizes them up. *burps* */

469

static const char *parseXML(const char *html, struct PC_ * pc) {

470

/* conditional expressions inside a conditional expression

471

don't try _this_ at home kids! ;-) */

472

html=(((tolower((int)(*(html+1))))==(int)('p')) ?

473

( (pc->phpCallBack) ? parsePHP (html, pc) : eatUp(html) ) :

474

( (pc->xmlCallBack) ? parseXMLtag(html, pc) : eatUp(html) ) );

475

return html;

476

}

477

478

static const char *parseStartTag (const char *html, struct PC_ * pc) {

479

char *tag, *name, *value;

480

const char *tmp;

481

int ret = 0;

482

struct ArgvTable *tmp2 = NULL;

483

484

pc->numArgs = 0;

485

tmp = html;

486

while(*html != '\0' && !isspace((int)*html) &&

487

*html != '>' && *html != '/') html++;

488

489

tag = (char *)calloc(1, (size_t)(html-tmp+1));

490

if (!tag) {

491

return "";

492

}

493

memcpy(tag, tmp, (size_t)(html-tmp));

494

495

if (strncasecmp("script", tag, 6) == 0) {

496

pc->lhtml_script_passthru = 1;

497

}

498

else if (strncasecmp("pre", tag, 3) == 0) {

499

pc->lhtml_script_passthru = 2;

500

}

501

502

if (*html == '>') {

503

if (pc->startCallBack) {

504

ret = pc->startCallBack(tag, NULL, 0, pc);

505

free(tag);

506

tag = NULL;

507

508

/* this check is redundant */

509

/* if (*html == '>') */ html++;

510

return((ret != 0) ? "" : html);

511

}

512

}

513

else if (*html == '/' ) { /* XHTML empty tag like <hr/>, <br/>*/

514

/**********************************************

515

* You may choose now between two behaviors *

516

* of libhtmlparse to handle XHTML empty tags: *

517

* a) call XHTMLCallBack *

518

* b) call start- AND endCallBack *

519

***********************************************/

520

if (pc->startCallBack != NULL && !(pc->XHTMLCallBack)) {

521

ret = pc->startCallBack(tag, NULL, 0, pc);

522

}

523

if (pc->endCallBack != NULL && ret==0 && !(pc->XHTMLCallBack)) {

524

ret = pc->endCallBack(tag, pc);

525

}

526

if(pc->XHTMLCallBack){

527

ret = pc->XHTMLCallBack(tag, NULL, 0, pc);

528

}

529

530

free(tag);

531

tag = NULL;

532

533

html += 2;

534

return((ret != 0) ? "" : html);

535

}

536

537

while(*html != '\0' && isspace((int)*html)) html++;

538

539

while(*html != '\0' && *html != '>' ) {

540

while(isspace((int)*html)) html++;

541

if (*html == '>') break;

542

543

if (*html == '/' && *(html+1) == '>') {

544

html++; break;

545

}

546

547

tmp = html;

548

while(*html != '\0' && !isspace((int)*html) &&

549

*html != '=' && *html != '>') html++;

550

name = (char *)calloc(1, (size_t)(html-tmp+1));

551

if (!name) {

552

free(tag);

553

tag = NULL;

554

return "";

555

}

556

557

memcpy(name, tmp, (size_t)(html-tmp));

558

559

if (isspace((int)*html)) {

560

const char *x = html;

561

while(*x != '\0' && *x != '>' && *x != '=') x++;

562

if (*x == '=') {

563

html = x;

564

goto namevalue;

565

}

566

tmp2 = addArgToTable(tmp2, name, NULL, pc);

567

while(*html != '\0' && isspace((int)*html) &&

568

*html != '>' &&

569

!(*html == '/' && *(html+1) == '>'))

570

html++;

571

} else {

572

573

if (*html == '/') {

574

html++;

575

break;

576

}

577

578

/* html++ is repeated after the while loop

579

* and may cause deletion of important info */

580

if (*html == '>') {

581

tmp2 = addArgToTable(tmp2, name, NULL, pc);

582

/*html++;*/

583

break;

584

}

585

586

namevalue:

587

if (*html == '=') html++;

588

589

while(isspace(*html)) html++;

590

591

if (*html != '"' && *html != '\'') {

592

tmp = html;

593

while(*html != '\0' && *html != '>' &&

594

!isspace((int)*html) &&

595

!(*html == '/' && *(html+1) == '>'))

596

html++;

597

value = (char *)calloc(1, (size_t)(html-tmp+1));

598

if (value == NULL) {

599

free(name);

600

name = NULL;

601

free(tag);

602

tag = NULL;

603

604

freeArgs(tmp2, pc);

605

return "";

606

}

607

memcpy(value, tmp, (size_t)(html-tmp));

608

tmp2 = addArgToTable(tmp2, name, value, pc);

609

} else if (*html == '"') {

610

html++;

611

tmp = html;

612

while(*html != '\0' &&

613

!(*html == '"' && *(html-1) != '\\'))

614

html++;

615

value = (char *) calloc(1, (size_t)(html-tmp+1));

616

if (value == NULL) {

617

free(name);

618

name = NULL;

619

free(tag);

620

tag = NULL;

621

622

freeArgs(tmp2, pc);

623

return "";

624

}

625

626

memcpy(value, tmp, (size_t)(html-tmp));

627

if (*html != '\0')

628

html++;

629

tmp2 = addArgToTable(tmp2, name, value, pc);

630

} else if (*html == '\'') {

631

html++;

632

tmp = html;

633

while(*html != '\0' && !(*html == '\'' &&

634

*(html-1) != '\\')) html++;

635

636

value = (char *)calloc(1, (size_t)(html-tmp+1));

637

if (value == NULL) {

638

free(name);

639

name = NULL;

640

free(tag);

641

tag = NULL;

642

643

freeArgs(tmp2, pc);

644

return "";

645

}

646

647

memcpy(value, tmp, (size_t)(html-tmp));

648

if (*html != '\0')

649

html++;

650

tmp2 = addArgToTable(tmp2, name, value, pc);

651

}

652

tmp = NULL;

653

}

654

}

655

if (*html != '\0') html++;

656

657

if (pc->startCallBack != NULL && *(html-2)!='/' ) {

658

ret = pc->startCallBack(tag, tmp2, pc->numArgs, pc);

659

}

660

if (pc->endCallBack != NULL && ret==0 && *(html-2)=='/'

661

&& !(pc->XHTMLCallBack)) {

662

ret = pc->endCallBack(tag, pc);

663

}

664

/* these tags may have arguments too, e.g. <hr noshade/> */

665

if (pc->XHTMLCallBack != NULL && *(html-2)=='/') {

666

ret = pc->XHTMLCallBack(tag, tmp2, pc->numArgs, pc);

667

}

668

if(tmp2 != NULL){

669

freeArgs(tmp2, pc);

670

}

671

free(tag);

672

tag = NULL;

673

674

pc->numArgsStatus=0;

675

676

/* this is a bad hack, feel free to write a better one (maybe a more readable one? ;-)*/

677

return

678

(pc->XHTMLCallBack != NULL) ?

679

(html) :

680

((ret != 0) ? "" : html);

681

}

682

683

static const char *parseDecl(const char *html, struct PC_ * pc) {

684

char *tag, *name, *value;

685

const char *tmp;

686

int ret=0;

687

struct ArgvTable *tmp2 = NULL;

688

689

pc->numArgs = 0;

690

tmp = html;

691

while(*html != '\0' && !isspace((int)*html) && *html != '>') html++;

692

693

tag = (char *)calloc(1, (size_t)(html-tmp+1));

694

if (!tag) {

695

return "";

696

}

697

698

memcpy(tag, tmp, (size_t)(html-tmp));

699

700

if (*html == '>') {

701

if (pc->declCallBack) {

702

ret = pc->declCallBack(tag, NULL, 0, pc);

703

free(tag);

704

tag = NULL;

705

706

if (*html == '>') html++;

707

return((ret != 0) ? "" : html);

708

}

709

}

710

711

while(*html != '\0' && isspace((int)*html)) html++;

712

713

while(*html != '\0' && *html != '>') {

714

while(isspace((int)*html)) html++;

715

if (*html == '>') break;

716

tmp = html;

717

switch(*tmp) {

718

case '\'' :

719

html++;

720

tmp = html;

721

while(*html != '\0' && !(*html == '\'' && *html != '\\'))

722

html++;

723

break;

724

case '"' :

725

html++;

726

tmp = html;

727

while(*html != '\0' && !(*html == '"' && *html != '\\'))

728

html++;

729

break;

730

default :

731

while(*html != '\0' && !isspace((int)*html) && *html != '=' && *html != '>')

732

html++;

733

break;

734

}

735

736

name = (char *) calloc(1, (size_t)(html-tmp+1));

737

if (!name) {

738

free(tag);

739

tag = NULL;

740

return "";

741

}

742

743

memcpy(name, tmp, (size_t)(html-tmp));

744

745

if (isspace((int)*html)) {

746

tmp2 = addArgToTable(tmp2, name, NULL, pc);

747

while(*html != '\0' && isspace((int)*html) && *html != '>')

748

html++;

749

continue;

750

}

751

752

if (*html == '>') {

753

tmp2 = addArgToTable(tmp2, name, NULL, pc);

754

html++;

755

break;

756

}

757

758

if (*(html+1) == '>') {

759

tmp2 = addArgToTable(tmp2, name, NULL, pc);

760

html += 2;

761

break;

762

}

763

764

if (*html == '=') html++;

765

switch(*html){

766

case '\'' :

767

html++;

768

tmp = html;

769

while(*html != '\0' && !(*html == '\'' && *(html-1) != '\\'))

770

html++;

771

772

value = (char *) calloc(1, (size_t)(html-tmp+1));

773

if (!value) {

774

free(name);

775

name = NULL;

776

free(tag);

777

tag = NULL;

778

779

freeArgs(tmp2, pc);

780

return "";

781

}

782

783

memcpy(value, tmp, (size_t)(html-tmp));

784

if (*html != '\0')

785

html++;

786

tmp2 = addArgToTable(tmp2, name, value, pc);

787

break;

788

case '"' :

789

html++;

790

tmp = html;

791

while (*html != '\0' && !(*html == '"' && *(html-1) != '\\'))

792

html++;

793

value = (char *)calloc(1, (size_t)(html-tmp+1));

794

if (!value) {

795

free(name);

796

name = NULL;

797

free(tag);

798

tag = NULL;

799

800

freeArgs(tmp2, pc);

801

return "";

802

}

803

804

memcpy(value, tmp, (size_t)(html-tmp));

805

if (*html != '\0')

806

html++;

807

tmp2 = addArgToTable(tmp2, name, value, pc);

808

break;

809

default :

810

html++;

811

tmp = html;

812

while(*html != '\0' && *html != '>' && !isspace((int)*html))

813

html++;

814

value = (char *) calloc(1, (size_t)(html-tmp+1));

815

if (!value) {

816

free(name);

817

name = NULL;

818

free(tag);

819

tag = NULL;

820

821

freeArgs(tmp2, pc);

822

return "";

823

}

824

825

memcpy(value, tmp, (size_t)(html-tmp));

826

tmp2 = addArgToTable(tmp2, name, value, pc);

827

break;

828

}

829

tmp = NULL;

830

}

831

832

if (*html != '\0') html++;

833

834

if (pc->declCallBack) {

835

ret = pc->declCallBack(tag, tmp2, pc->numArgs, pc);

836

freeArgs(tmp2, pc);

837

free(tag);

838

tag = NULL;

839

return((ret != 0) ? "" : html);

840

}

841

freeArgs(tmp2, pc);

842

pc->numArgsStatus=0;

843

844

return html;

845

}

846

847

static const char *parseForEntities (const char *tmp, struct PC_ * pc){

848

char *entity, *text ;

849

const char *tmp1, *tmp2;

850

int ret=0, count=0;

851

while(*tmp != '\0'){

852

tmp1 = tmp;

853

while(*tmp != '\0' && *tmp != '&')tmp++;

854

855

text = (char *)calloc(1, (size_t)(tmp-tmp1+1));

856

if (text == NULL) {

857

return "";

858

}

859

860

memcpy(text, tmp1, (size_t)(tmp-tmp1));

861

/* the chunk of text before the first entity will

862

not be called, if it starts with an entity*/

863

if(strlen(text)>0 && (!(isspace((int)*text)))){

864

if (pc->textCallBack) {

865

ret = pc->textCallBack(text, pc);

866

}

867

free(text);

868

text = NULL;

869

tmp1 = "";

870

}

871

if(*tmp == '&'){

872

tmp++;

873

tmp2=tmp;

874

/* sometimes the ';' is absent, it's a bad hack, just to avoid more trouble */

875

while( *tmp != '\0' && (*tmp != ';' && count != 9) ){

876

tmp++;

877

count++;

878

}

879

entity = (char *)calloc(1, (size_t)(tmp-tmp2+1));

880

if (!entity) {

881

return "";

882

} else {

883

memcpy(entity, tmp2, (size_t)(tmp-tmp2));

884

if (*tmp == ';' || count == 9){ /* should I add an errortrap here? */

885

ret = pc->entityCallBack(entity, pc);

886

free(entity);

887

entity = NULL;

888

tmp2 = "";

889

count = 0;

890

}

891

}

892

}

893

if (*tmp != '\0') tmp++;

894

}

895

return tmp;

896

}

897

898

static void parse (const char *html, struct PC_ * pc) {

899

while(*html != '\0') {

900

/* while(isspace(*html)){html++;} there may be leading blanks in some autogenerated files

901

add this or not, that is the question ;-)) */

902

903

if (pc->lhtml_script_passthru != 0) {

904

const char *text;

905

char *tmp;

906

907

text = html;

908

if (pc->lhtml_script_passthru == 1 ){

909

while(*text != '\0') {

910

if (*text == '<') {

911

if (*(text+2) == 's' || *(text+2) == 'S') {

912

if (*(text+7) == 't' || *(text+7) == 'T') {

913

break;

914

}

915

}

916

}

917

if(*text != '\0') text++;

918

} }

919

if (pc->lhtml_script_passthru == 2 ){

920

while(*text != '\0') {

921

if (*text == '<') {

922

if (*(text+2) == 'p' || *(text+2) == 'P') {

923

if (*(text+4) == 'e' || *(text+4) == 'E') {

924

break;

925

}

926

}

927

}

928

if(*text != '\0') text++;

929

}

930

}

931

tmp = (char *) malloc((size_t)(text-html+1));

932

if (tmp == NULL) {

933

fprintf(stderr, "WARNING [libhtmlparse]: memory error\n");

934

return;

935

}

936

937

strncpy(tmp, html, (size_t)(text-html));

938

939

if (pc->textCallBack != NULL) {

940

int ret = pc->textCallBack(tmp, pc);

941

if (ret != 0) {

942

free(tmp);

943

tmp = NULL;

944

945

return;

946

}

947

}

948

949

free(tmp);

950

tmp = NULL;

951

952

pc->lhtml_script_passthru = 0;

953

html = text;

954

}

955

956

if(*html == '<'){

957

html++;

958

959

switch (*html){

960

case '!' :

961

html++;

962

963

/* I must admit, I like conditional expressions,

964

they are so obviously obfuscated ;-) */

965

966

html = (*html == '-') ?

967

((pc->commentCallBack) ? parseComment(html, pc) : eatUp(html)) :

968

((pc->declCallBack) ? parseDecl(html, pc) : eatUp(html)) ;

969

break;

970

case '?' : /* XML/PHP tag */

971

html = (pc->xmlCallBack != NULL || pc->phpCallBack != NULL) ?

972

parseXML(html, pc) :

973

eatUp(html);

974

break;

975

case '/' : /* HTML end tag */

976

html = (pc->endCallBack) ?

977

parseEndTag(html, pc) :

978

eatUp(html);

979

break;

980

default : /* HTML start tag */

981

html = (pc->XHTMLCallBack != NULL || pc->startCallBack != NULL) ?

982

parseStartTag(html, pc) :

983

eatUp(html);

984

break;

985

}

986

} else { /* All other text */

987

/* while(isspace(*html))html++; it seems to be faster inside the function */

988

html = (pc->textCallBack) ?

989

parseText(html, pc):

990

eatUpText(html);

991

}

992

}

993

return;

994

}

995

996

997

998

/* ******************* now: LE specifics *************** */

999

1000

1001

/**

1002

* Add a keyword.

1003

**/

1004

static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,

1005

char * keyword,

1006

struct EXTRACTOR_Keywords * next) {

1007

EXTRACTOR_KeywordList * result;

1008

1009

if (keyword == NULL)

1010

return next;

1011

result = (EXTRACTOR_KeywordList*)malloc(sizeof(EXTRACTOR_KeywordList));

1012

result->next = next;

1013

result->keyword = strdup(keyword);

1014

result->keywordType = type;

1015

return result;

1016

}

1017

1018

/**

1019

* Called by the parser whenever we see text.

1020

**/

1021

static int texts (char *comment, struct PC_ * pc) {

1022

if (pc->nextTextAction) {

1023

pc->result = addKeyword(pc->nextKeywordType,

1024

comment,

1025

pc->result);

1026

pc->nextTextAction = 0;

1027

}

1028

return 0;

1029

}

1030

1031

static int hasTag(char * arg,

1032

char * val,

1033

struct ArgvTable * args,

1034

int numargs) {

1035

int i;

1036

for (i=0;i<numargs;i++)

1037

if (0 == strcasecmp(args[i].arg, arg))

1038

if (0 == strcasecmp(args[i].val, val))

1039

return 1;

1040

return 0;

1041

}

1042

1043

static char * getTag(char * arg,

1044

struct ArgvTable * args,

1045

int numargs) {

1046

int i;

1047

for (i=0;i<numargs;i++)

1048

if (0 == strcasecmp(args[i].arg, arg))

1049

return args[i].val;

1050

return NULL;

1051

}

1052

1053

static struct {

1054

char * name;

1055

EXTRACTOR_KeywordType type;

1056

} tagmap[] = {

1057

{ "author" , EXTRACTOR_AUTHOR},

1058

{ "description" , EXTRACTOR_DESCRIPTION},

1059

{ "language", EXTRACTOR_LANGUAGE},

1060

{ "rights", EXTRACTOR_COPYRIGHT},

1061

{ "publisher", EXTRACTOR_PUBLISHER},

1062

{ "date", EXTRACTOR_DATE},

1063

{ "keywords", EXTRACTOR_KEYWORDS},

1064

{NULL, EXTRACTOR_UNKNOWN},

1065

};

1066

1067

1068

1069

static int starttag (char *tag,

1070

struct ArgvTable *args,

1071

int numargs,

1072

struct PC_ * pc) {

1073

int i;

1074

1075

if (0 == strcasecmp(tag,"title")) {

1076

pc->nextTextAction = 1;

1077

pc->nextKeywordType = EXTRACTOR_TITLE;

1078

return 0;

1079

}

1080

if (0 == strcasecmp(tag,"meta")) {

1081

i = 0;

1082

while (tagmap[i].name != NULL) {

1083

if (hasTag("name",tagmap[i].name,args, numargs))

1084

pc->result = addKeyword(tagmap[i].type,

1085

getTag("content",

1086

args, numargs),

1087

pc->result);

1088

i++;

1089

}

1090

}

1091

/* Don't do this, you can't be certain...*/

1092

#if I_AM_CERTAIN

1093

if (0 == strcasecmp(tag,"html")) {

1094

pc->result = addKeyword(EXTRACTOR_MIMETYPE,

1095

"text/html",

1096

pc->result);

1097

return 0;

1098

}

1099

#endif

1100

return 0;

1101

}

1102

1103

static int endtag (char *tag, struct PC_ * pc) {

1104

pc->nextTextAction = 0;

1105

return 0;

1106

}

1107

1108

1109

/* mimetype = text/html */

1110

struct EXTRACTOR_Keywords * libextractor_html_extract(const char * filename,

1111

char * data,

1112

const size_t size,

1113

struct EXTRACTOR_Keywords * prev) {

1114

char backup;

1115

ParserContext pc;

1116

size_t xsize;

1117

1118

if (size == 0)

1119

return prev;

1120

memset(&pc,

1121

1122

sizeof(ParserContext));

1123

pc.result = prev;

1124

pc.textCallBack = &texts;

1125

pc.startCallBack = &starttag;

1126

pc.endCallBack = &endtag;

1127

if (size > 1024 * 32)

1128

xsize = 1024 * 32;

1129

else

1130

xsize = size;

1131

/* the parser requires 0-termination. We just

1132

overwrite the last character in data and

1133

restore it later, assuming that it can

1134

hardly be a keyword in a valid HTML text...

1135

1136

backup = data[xsize-1];

1137

data[xsize-1] = '\0';

1138

parse(data, &pc);

1139

data[xsize-1] = backup;

1140

return pc.result;

1141

}

1142

1143

#if HAVE_MAIN
/**
 * Stand-alone test driver: maps the file named on the command line,
 * runs the HTML extractor on it and prints the keywords to stdout.
 *
 * @return 0 on success, -1 if the file cannot be opened or mapped
 */
int main(int argc, char **argv) {
  int file;
  char * buffer;
  struct stat fstatbuf;
  size_t size;

  if (argc != 2) {
    fprintf(stderr,
            "Call with filename as argument\n");
    return -1;
  }
  file = open(argv[1], O_RDONLY);
  if (-1 == file)
    return -1;
  if (-1 == fstat(file, &fstatbuf)) {
    close(file);
    return -1;
  }
  size = (size_t) fstatbuf.st_size;
  if (size == 0) {
    /* nothing to extract, and mmap rejects a zero length */
    close(file);
    return 0;
  }
  /* private writable mapping: the extractor takes a non-const buffer,
     and writes to a MAP_PRIVATE mapping never reach the file */
  buffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, file, 0);
  close(file);
  if (buffer == MAP_FAILED)
    return -1;

  EXTRACTOR_printKeywords(stdout,
                          libextractor_html_extract(argv[1],
                                                    buffer,
                                                    size,
                                                    NULL));
  munmap(buffer, size);
  return 0;
}
#endif

Older »