~ubuntu-branches/ubuntu/raring/notecase/raring

« back to all changes in this revision

Viewing changes to src/lib/HtmlParser.cpp

Committer: Bazaar Package Importer
Author(s): Nathan Handler
Date: 2008-12-21 13:09:58 UTC
mfrom: (1.1.6 upstream)
Revision ID: james.westby@ubuntu.com-20081221130958-0ri77h0x7j1dclkq

Tags: 1.9.8-0ubuntu1

https://launchpad.net/bugs/307752

New upstream release (LP: #307752)

files removed:
Makefile~

files modified:
Makefile

NoteCase.dsp

NoteCase.vcproj

deb/control

debian/changelog

docs/control

docs/help.ncd

docs/install.iss

docs/notecase.spec

po/de.po

po/hr.po

po/it.po

po/notecase.pot

po/ro.po

readme.txt

src/AboutDlg.cpp

src/AboutDlg.h

src/DateTimeDlg.cpp

src/DateTimeDlg.h

src/DocAction.cpp

src/DocAction.h

src/DocActionAtt.cpp

src/DocActionAtt.h

src/DocActionFinish.cpp

src/DocActionFinish.h

src/DocActionFinishDel.cpp

src/DocActionFinishDel.h

src/DocActionFmt.cpp

src/DocActionFmt.h

src/DocActionPix.cpp

src/DocActionPix.h

src/DocActionSort.cpp

src/DocActionSort.h

src/EditDlg.cpp

src/EditDlg.h

src/ExecuteFile.h

src/FileAttachmentDlg.cpp

src/FileAttachmentDlg.h

src/FileExportDlg.cpp

src/FileExportDlg.h

src/FileSaveAsDlg.cpp

src/FileSaveAsDlg.h

src/FindDialog.cpp

src/FindDialog.h

src/FindReplaceDlg.cpp

src/FindReplaceDlg.h

src/FindReplaceInfo.cpp

src/FindReplaceInfo.h

src/LinkPropertiesDlg.cpp

src/LinkPropertiesDlg.h

src/MainWnd.cpp

src/MainWnd.h

src/NodePropertiesDlg.cpp

src/NodePropertiesDlg.h

src/OptionsDialog.cpp

src/OptionsDialog.h

src/PasswordDialog.cpp

src/PasswordDialog.h

src/PixPropertiesDlg.cpp

src/PixPropertiesDlg.h

src/PortableTrayIcon.cpp

src/PortableTrayIcon.h

src/ShortcutsList.cpp

src/ShortcutsList.h

src/ShortcutsListDlg.cpp

src/ShortcutsListDlg.h

src/TextView.cpp

src/TextView.h

src/TreeView.cpp

src/TreeView.h

src/callbacks.cpp

src/callbacks.h

src/config.h

src/gui/Dialog.cpp

src/gui/Dialog.h

src/gui/FileDialog.cpp

src/gui/FileDialog.h

src/gui/GuiLanguage.cpp

src/gui/GuiLanguage.h

src/gui/ProgressDlg.cpp

src/gui/ProgressDlg.h

src/interface.cpp

src/interface.h

src/lib/Base64.cpp

src/lib/Base64.h

src/lib/CircularBuffer.cpp

src/lib/CircularBuffer.h

src/lib/DocActionBase.h

src/lib/DocActionManager.cpp

src/lib/DocActionManager.h

src/lib/DocumentIterator.cpp

src/lib/DocumentIterator.h

src/lib/EnumDirectory.cpp

src/lib/EnumDirectory.h

src/lib/File64.cpp

src/lib/File64.h

src/lib/FilePath.cpp

src/lib/FilePath.h

src/lib/FmtInfo.cpp

src/lib/FmtInfo.h

src/lib/FormatIOBase.cpp

src/lib/FormatIOBase.h

src/lib/FormatIOEncHtml.cpp

src/lib/FormatIOEncHtml.h

src/lib/FormatIOExecutable.cpp

src/lib/FormatIOExecutable.h

src/lib/FormatIOGjots2.cpp

src/lib/FormatIOGjots2.h

src/lib/FormatIOHtml.cpp

src/lib/FormatIOHtml.h

src/lib/FormatIOMMLX.cpp

src/lib/FormatIOMMLX.h

src/lib/FormatIOStickyNotes.cpp

src/lib/FormatIOStickyNotes.h

src/lib/FormatIOTxt.cpp

src/lib/FormatIOTxt.h

src/lib/HtmlParser.cpp

src/lib/HtmlParser.h

src/lib/IOLayerBase.cpp

src/lib/IOLayerBase.h

src/lib/IOLayerEnc.cpp

src/lib/IOLayerEnc.h

src/lib/IOLayerFile64.cpp

src/lib/IOLayerFile64.h

src/lib/IOLayerRedirect.cpp

src/lib/IOLayerRedirect.h

src/lib/IOLayerZlib.cpp

src/lib/IOLayerZlib.h

src/lib/IOProcess.cpp

src/lib/IOProcess.h

src/lib/IniFile.cpp

src/lib/IniFile.h

src/lib/LinkInfo.cpp

src/lib/LinkInfo.h

src/lib/NoteDocument.cpp

src/lib/NoteDocument.h

src/lib/NoteNode.cpp

src/lib/NoteNode.h

src/lib/SHA1.cpp

src/lib/SHA1.h

src/lib/TextSearch.cpp

src/lib/TextSearch.h

src/lib/Thread.cpp

src/lib/Thread.h

src/lib/blowfish.cpp

src/lib/blowfish.h

src/lib/blowfish2.h

src/lib/debug.cpp

src/lib/debug.h

src/lib/types.h

src/main.cpp

src/mru.cpp

src/mru.h

src/support.cpp

src/support.h

Show diffs side-by-side

added added

removed removed

src/lib/HtmlParser.cpp

////////////////////////////////////////////////////////////////////////////

// NoteCase notes manager project <http://notecase.sf.net>

// This code is licensed under BSD license. See "license.txt" for more details.

// File: Implements basic HTML parser class

////////////////////////////////////////////////////////////////////////////

#include "HtmlParser.h"

#include "debug.h"

#include <vector>

#include <algorithm>

#include <glib.h>

#include <gtk/gtk.h>

#include <string.h>

#ifndef _WIN32

#include <strings.h> //strcasecmp

#else

#define strcasecmp stricmp

#endif

void replaceall(std::string &strData, const char *szFind, const char *szReplace);

// parser states

#define PARSER_STATE_BLANK 0

#define PARSER_STATE_INSIDE_TAG 1

#define PARSER_STATE_INSIDE_COMMENT 2

// One row of the HTML escape sequence table: a character code paired with
// the escape sequence text used for it.
struct HtmlEscape {
    gunichar    cLetter;    // character code of the literal character
    const char *szEscape;   // escape sequence text for that character
};

// Copy of the escape table that the HTMLParser constructor fills and sorts
// by szEscape, so that escape sequences can be found with a binary search.
static std::vector<HtmlEscape> g_lstTableSort2;

// HTML escape sequences table.
// Rows are sorted by the first field (character code, ascending) because
// table_bin_search_char() binary-searches this array; keep that order when
// adding rows. The second field is the entity text, including '&' and ';'.
static const HtmlEscape _table_char[] =
{
    {'\"', "&quot;"},   //=34
    {'&',  "&amp;"},    //=38
    {'<',  "&lt;"},     //=60
    {'>',  "&gt;"},     //=62

    {160, "&nbsp;"},    //non-breaking space
    {161, "&iexcl;"},   //'¡' - inverted exclamation mark
    {162, "&cent;"},    //'¢'
    {163, "&pound;"},   //'£'
    {164, "&curren;"},  //'¤'
    {165, "&yen;"},     //'¥'
    {166, "&brvbar;"},  //'¦' - broken (vertical) bar
    {167, "&sect;"},    //'§' - section sign
    {168, "&uml;"},     //'¨' - umlaut
    {169, "&copy;"},    //'©' - copyright sign
    {170, "&ordf;"},    //'ª' - feminine ordinal
    {171, "&laquo;"},   //'«' - left guillemet
    {174, "&reg;"},     //'®' - registered sign
    {176, "&deg;"},     //'°' - degree sign
    {177, "&plusmn;"},  //'±' - plus or minus
    {178, "&sup2;"},    //'²' - superscript two
    {179, "&sup3;"},    //'³' - superscript three
    {187, "&raquo;"},   //'»' - right guillemet

    {192, "&Agrave;"},  //'À'
    {193, "&Aacute;"},  //'Á'
    {194, "&Acirc;"},   //'Â'
    {195, "&Atilde;"},  //'Ã'
    {196, "&Auml;"},    //'Ä'
    {197, "&Aring;"},   //'Å'
    {198, "&AElig;"},   //'Æ'
    {199, "&Ccedil;"},  //'Ç'
    {200, "&Egrave;"},  //'È'
    {201, "&Eacute;"},  //'É'
    {202, "&Ecirc;"},   //'Ê'
    {203, "&Euml;"},    //'Ë'
    {204, "&Igrave;"},  //'Ì'
    {205, "&Iacute;"},  //'Í'
    {206, "&Icirc;"},   //'Î'
    {207, "&Iuml;"},    //'Ï'
    {208, "&ETH;"},     //'Ð' - capital Eth, Icelandic
    {209, "&Ntilde;"},  //'Ñ'
    {210, "&Ograve;"},  //'Ò'
    {211, "&Oacute;"},  //'Ó'
    {212, "&Ocirc;"},   //'Ô'
    {213, "&Otilde;"},  //'Õ'
    {214, "&Ouml;"},    //'Ö'
    {215, "&times;"},   //'×' - multiply sign
    {216, "&Oslash;"},  //'Ø'
    {217, "&Ugrave;"},  //'Ù'
    {218, "&Uacute;"},  //'Ú'
    {219, "&Ucirc;"},   //'Û'
    {220, "&Uuml;"},    //'Ü'
    {221, "&Yacute;"},  //'Ý'
    {222, "&THORN;"},   //'Þ' - capital THORN, Icelandic
    {223, "&szlig;"},   //'ß'
    {224, "&agrave;"},  //'à'
    {225, "&aacute;"},  //'á'
    {226, "&acirc;"},   //'â'
    {227, "&atilde;"},  //'ã'
    {228, "&auml;"},    //'ä'
    {229, "&aring;"},   //'å'

    {230, "&aelig;"},   //'æ'
    {231, "&ccedil;"},  //'ç'
    {232, "&egrave;"},  //'è'
    {233, "&eacute;"},  //'é'
    {234, "&ecirc;"},   //'ê'
    {235, "&euml;"},    //'ë'
    {236, "&igrave;"},  //'ì'
    {237, "&iacute;"},  //'í'
    {238, "&icirc;"},   //'î'
    {239, "&iuml;"},    //'ï'
    {240, "&eth;"},     //'ð' - small eth, Icelandic
    {241, "&ntilde;"},  //'ñ'
    {242, "&ograve;"},  //'ò'
    {243, "&oacute;"},  //'ó'
    {244, "&ocirc;"},   //'ô'
    {245, "&otilde;"},  //'õ'
    {246, "&ouml;"},    //'ö'

    {248, "&oslash;"},  //'ø'
    {249, "&ugrave;"},  //'ù'
    {250, "&uacute;"},  //'ú'
    {251, "&ucirc;"},   //'û'
    {252, "&uuml;"},    //'ü'
    {253, "&yacute;"},  //'ý'
    {254, "&thorn;"},   //'þ' - small thorn, Icelandic
    {255, "&yuml;"},    //'ÿ'

    {338, "&OElig;"},   //'Œ'
    {339, "&oelig;"},   //'œ'
    {352, "&Scaron;"},  //'Š'
    {353, "&scaron;"},  //'š'
    {376, "&Yuml;"},    //'Ÿ'
    {402, "&fnof;"},    //'ƒ'

    {8211, "&ndash;"},  //'–' - en dash (demi-cadratin)
    {8212, "&mdash;"},  //'—' - em dash (cadratin)

    {8249, "&lsaquo;"}, //'‹' - left single guillemet
    {8250, "&rsaquo;"}, //'›' - right single guillemet
    {8364, "&euro;"},   //'€'
    {8482, "&trade;"},  //'™' - trademark

    //TOFIX add more chars if needed
};

// Converts one hexadecimal digit character to its numeric value.
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'; returns 0..15, or -1 when ch is not
// a hexadecimal digit.
static int hexVal(char ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'f')
        return (ch - 'a') + 10;
    if (ch >= 'A' && ch <= 'F')
        return (ch - 'A') + 10;
    return -1;
} // hexVal

#define SIZE_OF(x) (sizeof(x)/sizeof(x[0]))

162

163

static int table_bin_search_char(gunichar chFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);

164

static int table_bin_search_escape(const char *szFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);

165

166

class TblComparator{

167

public:

168

bool operator()(const HtmlEscape &a, const HtmlEscape &b)

169

{

170

//operator < (is a<b ?)

171

return (strcmp(a.szEscape, b.szEscape) < 0);

172

};

173

};

174

175

// Puts the parser into its initial state and, the first time any parser is
// constructed, builds g_lstTableSort2: a copy of _table_char sorted by
// escape sequence text, which table_bin_search_escape() searches.
// NOTE(review): the one-time initialization below is not synchronized.
HTMLParser::HTMLParser()
{
    Clear();

    m_bAllowUnescapedInPreTag = false;
    m_bInsidePreTag = false;

    //create new sort table (create only once - global object)
    if(g_lstTableSort2.empty())
    {
        g_lstTableSort2.assign(_table_char, _table_char + SIZE_OF(_table_char));
        std::sort(g_lstTableSort2.begin(), g_lstTableSort2.end(), TblComparator());
    }
}

// Empty destructor body: nothing is released explicitly here.
HTMLParser::~HTMLParser()
{
}

void HTMLParser::Clear()

197

{

198

m_nState = PARSER_STATE_BLANK;

199

m_strData.erase(m_strData.begin(), m_strData.end());

200

}

201

202

bool HTMLParser::Parse(const char *szBuffer, int len)

203

{

204

if(len < 0)

205

return false;

206

207

for(int i=0; i<len; i++)

208

{

209

if(PARSER_STATE_INSIDE_COMMENT == m_nState)

210

{

211

if( m_strData.size()>2 &&

212

0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment

213

{

214

TRACE("HTML Parser: Comment ended\n");

215

m_strData.erase(m_strData.size()-2, 2); //remove "--" ending

216

OnComment(m_strData.c_str()+3); //trigger event

217

m_strData.erase(m_strData.begin(), m_strData.end());

218

m_nState = PARSER_STATE_BLANK;

219

}

220

else

221

m_strData += szBuffer[i];

222

}

223

else if(PARSER_STATE_INSIDE_TAG == m_nState)

224

{

225

//check for the end of tag

226

if(szBuffer[i] == '>'){

227

if(0 == strncmp("!--", m_strData.c_str(), 3)) //is tag comment

228

{

229

m_nState = PARSER_STATE_INSIDE_COMMENT;

230

231

if(0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment

232

{

233

TRACE("HTML Parser: Comment ended\n");

234

m_strData.erase(m_strData.size()-2, 2); //remove "--" ending

235

OnComment(m_strData.c_str()+3); //trigger event

236

m_strData.erase(m_strData.begin(), m_strData.end());

237

m_nState = PARSER_STATE_BLANK;

238

}

239

else

240

m_strData += szBuffer[i];

241

}

242

else if(!m_strData.empty() && m_nState != PARSER_STATE_INSIDE_COMMENT)

243

{

244

if(m_strData.at(0) == '/') //is ending tag

245

{

246

//strip everything after first space within tag to get real tag name

247

std::string strTag(m_strData.c_str()+1);

248

int nPos = strTag.find_first_of(' ');

249

if(nPos >= 0)

250

strTag.erase(strTag.begin()+nPos);

251

252

if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag) || (0 == strcasecmp(strTag.c_str(), "PRE")))

253

{

254

TRACE("HTML Parser: Tag end found (%s)\n", strTag.c_str());

255

OnTagEnd(strTag.c_str()); //trigger event

256

m_strData.erase(m_strData.begin(), m_strData.end());

257

if(0 == strcasecmp(strTag.c_str(), "PRE"))

258

m_bInsidePreTag = false;

259

}

260

else

261

{

262

TRACE("HTML Parser: Push text (%s)\n", m_strData.c_str());

263

m_nState = PARSER_STATE_BLANK;

264

OnText(m_strData.c_str()); //trigger event for previous contents

265

m_strData.erase(m_strData.begin(), m_strData.end());

266

}

267

}

268

else if(m_nState != PARSER_STATE_INSIDE_COMMENT)

269

{

270

std::string strTag(m_strData.c_str());

271

std::string strParams;

272

273

int nPos = strTag.find_first_of(' ');

274

if(nPos >= 0){

275

strTag = strTag.substr(0, nPos);

276

strParams = m_strData.substr(nPos);

277

}

278

279

if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag)){

280

TRACE("HTML Parser: Tag start found (%s)[%s]\n", strTag.c_str(), strParams.c_str());

281

OnTagBegin(strTag.c_str(), strParams.c_str()); //trigger event

282

m_strData.erase(m_strData.begin(), m_strData.end());

283

m_nState = PARSER_STATE_BLANK;

284

if(0 == strcasecmp(strTag.c_str(), "PRE"))

285

m_bInsidePreTag = true;

286

}

287

else{

288

m_nState = PARSER_STATE_BLANK;

289

TRACE("HTML Parser: Push text1 (%s)\n", m_strData.c_str());

290

OnText(m_strData.c_str()); //trigger event for previous contents

291

m_strData.erase(m_strData.begin(), m_strData.end());

292

}

293

}

294

}

295

296

if(PARSER_STATE_INSIDE_COMMENT != m_nState)

297

Clear();

298

}

299

else

300

m_strData += szBuffer[i];

301

}

302

else

303

{

304

//check for the start of tag

305

if(szBuffer[i] == '<' && m_nState == PARSER_STATE_BLANK)

306

{

307

if(!m_strData.empty())

308

{

309

TRACE("HTML Parser: Push text2 (%s)\n", m_strData.c_str());

310

OnText(m_strData.c_str()); //trigger event for previous contents

311

m_strData.erase(m_strData.begin(), m_strData.end());

312

}

313

m_nState = PARSER_STATE_INSIDE_TAG;

314

}

315

else{

316

m_strData += szBuffer[i];

317

}

318

}

319

}

320

321

return true;

322

}

323

324

void HTMLParser::Finalize()

325

{

326

if(!m_strData.empty() && m_nState == PARSER_STATE_BLANK)

327

OnText(m_strData.c_str()); //trigger event for previous contents

328

m_strData.erase(m_strData.begin(), m_strData.end());

329

}

330

331

void HTMLParser::EscapeURI(std::string &data)

332

{

333

//TOFIX replace non-ASCII characters by converting each byte to %HH, where HH is the hexadecimal notation of the byte value

334

replaceall(data, " ", "%20");

335

replaceall(data, "&", "&");

336

}

337

338

void HTMLParser::UnescapeURI(std::string &data)

339

{

340

#if GTK_CHECK_VERSION(2,16,0)

341

char *szRes = g_uri_unescape_string(data.c_str(), NULL);

342

if(szRes){

343

data = szRes;

344

g_free(szRes);

345

}

346

#else

347

int nStart = 0;

348

unsigned int nSize = data.size();

349

std::string::size_type nPos;

350

while ((nPos = data.find('%', nStart)) != std::string::npos)

351

{

352

if(nPos + 2 < nSize) // two chars after %

353

{

354

int a, b;

355

if ((a = hexVal(data[nPos+1])) != -1)

356

{

357

if ((b = hexVal(data[nPos+2])) != -1)

358

{

359

gunichar cChar = ((a * 16) + b);

360

//gchar szText[10];

361

//int nWritten = g_unichar_to_utf8(cChar, szText);

362

//szText[nWritten] = '\0';

363

data.erase(nPos, 3);

364

//data.insert(nPos, szText);

365

data.insert(data.begin()+nPos, (char)cChar);

366

367

//nStart = nPos + nWritten;

368

nStart = nPos + 1;

369

nSize -= 3;

370

//nSize += nWritten;

371

nSize += 1;

372

}

373

else

374

nStart = nPos + 3;

375

}

376

else

377

nStart = nPos + 3;

378

}

379

else

380

break;

381

}

382

383

//TOFIX

384

replaceall(data, "&", "&");

385

#endif

386

}

387

388

void HTMLParser::EscapeChars(std::string &data)

389

{

390

unsigned int nPos, nWidth;

391

const char *szStart = data.c_str();

392

const char *szString = szStart;

393

394

//using UTF-8 characters

395

while(NULL != szString && '\0' != *szString)

396

{

397

int nSkip = 0;

398

gunichar chLetter = g_utf8_get_char (szString);

399

const char *szNext = g_utf8_find_next_char(szString, NULL);

400

401

//TRACE("String to escape: %s\n", szString);

402

403

int nRes = table_bin_search_char(chLetter);

404

if(nRes >= 0)

405

{

406

//replace escape sequence with original special char

407

nPos = szString - szStart;

408

nWidth = szNext - szString;

409

nSkip = strlen(_table_char[nRes].szEscape);

410

411

//TRACE("Escape: %d (width=%d) to %s\n", chLetter, nWidth, _table_char[nRes].szEscape);

412

413

//FIX: data = data.substr(0, nPos) + _table_char[nRes].szEscape + data.substr(nPos+nWidth, 1000000);

414

data.erase(nPos, nWidth);

415

data.insert(nPos, _table_char[nRes].szEscape);

416

417

//TRACE("Escaped line: %s\n", data.c_str());

418

419

szStart = data.c_str(); //in case string was reallocated

420

szString = szStart + nPos + nSkip;

421

}

422

else

423

szString = szNext;

424

}

425

}

426

427

void HTMLParser::UnescapeChars(std::string &data)

428

{

429

unsigned int nPos = 0;

430

while(1)

431

{

432

int nPosStart = data.find('&', nPos);

433

if(nPosStart < 0)

434

break;

435

436

int nPosEnd = data.find(';', nPosStart+1);

437

if(nPosEnd >= 0)

438

{

439

//extract escape sequence

440

std::string strChar = data.substr(nPosStart, nPosEnd-nPosStart+1);

441

//TRACE("Escape sequence %s found!\n", strChar.c_str());

442

443

int nRes = table_bin_search_escape(strChar.c_str());

444

if(nRes >= 0)

445

{

446

//replace escape sequence with original UTF-8 character

447

char szBuffer[20];

448

int nBytes = g_unichar_to_utf8(g_lstTableSort2[nRes].cLetter, szBuffer);

449

szBuffer[nBytes] = '\0';

450

451

//FIX: data = data.substr(0, nPosStart) + szBuffer + data.substr(nPosEnd+1, 1000000);

452

data.erase(nPosStart, nPosEnd+1-nPosStart);

453

data.insert(nPosStart, szBuffer);

454

}

455

else

456

TRACE("ERROR: HTML escape sequence %s is not supported yet!\n", strChar.c_str());

457

}

458

else

459

break; //no sequence found

460

461

nPos = nPosStart+1;

462

}

463

}

464

465

//use binary search to speed up convertion

466

int table_bin_search_char(gunichar chFind, int nLeft, int nRight)

467

{

468

if(nLeft > nRight) return -1; //no match found

469

470

//check middle of the range

471

int nMid = (nLeft + nRight)/2;

472

if(chFind == _table_char[nMid].cLetter)

473

return nMid; //match found

474

475

if(nLeft == nRight) return -1; //no match found

476

477

if(chFind < _table_char[nMid].cLetter)

478

return table_bin_search_char(chFind, nLeft, nMid-1); //search lower half

479

else

480

return table_bin_search_char(chFind, nMid+1, nRight); //search upper half

481

}

482

483

int table_bin_search_escape(const char *szFind, int nLeft, int nRight)

484

{

485

//TRACE("bin search [Escape:%s], l=%d, r=%d\n", szFind, nLeft, nRight);

486

487

if(nLeft > nRight) {

488

//TRACE("bin search: no match found\n");

489

return -1; //no match found

490

}

491

492

//check middle of the range

493

int nMid = (nLeft + nRight)/2;

494

if(0 == strcmp(szFind, g_lstTableSort2[nMid].szEscape)){

495

//TRACE("bin search found [Escape:%s], m=%d\n", szFind, nMid);

496

return nMid; //match found

497

}

498

499

if(nLeft == nRight){

500

//TRACE("bin search: no match found\n");

501

return -1; //no match found

502

}

503

504

if(strcmp(szFind, g_lstTableSort2[nMid].szEscape) < 0)

505

{

506

//TRACE("Search lower half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);

507

return table_bin_search_escape(szFind, nLeft, nMid-1); //search lower half

508

}

509

else{

510

//TRACE("Search upper half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);

511

return table_bin_search_escape(szFind, nMid+1, nRight); //search upper half

512

}

513

}

514

515

// True for characters that can be part of an attribute name; used to tell a
// whole attribute name from the tail of a longer one.
static bool parser_is_attr_name_char(char ch)
{
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
           (ch >= '0' && ch <= '9') ||
           ch == '-' || ch == '_' || ch == ':' || ch == '.';
}

// Extracts the value of the attribute szParam from a tag parameter string.
// Only the form name="value" (double quotes, no spaces around '=') is
// recognized and the name is matched case-sensitively. A match that is only
// the tail of a longer attribute name (e.g. "src" inside data-src="...") is
// skipped.
// On success stores the text between the quotes in resValue and returns
// true; otherwise returns false and leaves resValue untouched.
bool HTMLParser::ExtractParam(const std::string &data, const char *szParam, std::string &resValue)
{
    if(NULL == szParam)
        return false;

    std::string strPattern(szParam);
    strPattern += "=\"";

    std::string::size_type nPos = data.find(strPattern);
    while(nPos != std::string::npos && nPos > 0 && parser_is_attr_name_char(data[nPos-1]))
        nPos = data.find(strPattern, nPos+1);
    if(nPos == std::string::npos)
        return false;   // not found

    const std::string::size_type nValue = nPos + strPattern.size();
    const std::string::size_type nEnd = data.find('"', nValue);
    if(nEnd == std::string::npos)
        return false;   // closing quote missing

    resValue = data.substr(nValue, nEnd-nValue);
    return true;
}

////////////////////////////////////////////////////////////////////////////

// NoteCase notes manager project <http://notecase.sf.net>

// This code is licensed under BSD license. See "license.txt" for more details.

// File: Implements basic HTML parser class

////////////////////////////////////////////////////////////////////////////

#include "HtmlParser.h"

#include "debug.h"

#include <vector>

#include <algorithm>

#include <glib.h>

#include <gtk/gtk.h>

#include <string.h>

#ifndef _WIN32

#include <strings.h> //strcasecmp

#else

#define strcasecmp stricmp

#endif

void replaceall(std::string &strData, const char *szFind, const char *szReplace);

// parser states

#define PARSER_STATE_BLANK 0

#define PARSER_STATE_INSIDE_TAG 1

#define PARSER_STATE_INSIDE_COMMENT 2

// One row of the HTML escape sequence table: a character code paired with
// the escape sequence text used for it.
struct HtmlEscape {
    gunichar    cLetter;    // character code of the literal character
    const char *szEscape;   // escape sequence text for that character
};

// Copy of the escape table that the HTMLParser constructor fills and sorts
// by szEscape, so that escape sequences can be found with a binary search.
static std::vector<HtmlEscape> g_lstTableSort2;

// HTML escape sequences table.
// Rows are sorted by the first field (character code, ascending) because
// table_bin_search_char() binary-searches this array; keep that order when
// adding rows. The second field is the entity text, including '&' and ';'.
static const HtmlEscape _table_char[] =
{
    {'\"', "&quot;"},   //=34
    {'&',  "&amp;"},    //=38
    {'<',  "&lt;"},     //=60
    {'>',  "&gt;"},     //=62

    {160, "&nbsp;"},    //non-breaking space
    {161, "&iexcl;"},   //'¡' - inverted exclamation mark
    {162, "&cent;"},    //'¢'
    {163, "&pound;"},   //'£'
    {164, "&curren;"},  //'¤'
    {165, "&yen;"},     //'¥'
    {166, "&brvbar;"},  //'¦' - broken (vertical) bar
    {167, "&sect;"},    //'§' - section sign
    {168, "&uml;"},     //'¨' - umlaut
    {169, "&copy;"},    //'©' - copyright sign
    {170, "&ordf;"},    //'ª' - feminine ordinal
    {171, "&laquo;"},   //'«' - left guillemet
    {174, "&reg;"},     //'®' - registered sign
    {176, "&deg;"},     //'°' - degree sign
    {177, "&plusmn;"},  //'±' - plus or minus
    {178, "&sup2;"},    //'²' - superscript two
    {179, "&sup3;"},    //'³' - superscript three
    {187, "&raquo;"},   //'»' - right guillemet

    {192, "&Agrave;"},  //'À'
    {193, "&Aacute;"},  //'Á'
    {194, "&Acirc;"},   //'Â'
    {195, "&Atilde;"},  //'Ã'
    {196, "&Auml;"},    //'Ä'
    {197, "&Aring;"},   //'Å'
    {198, "&AElig;"},   //'Æ'
    {199, "&Ccedil;"},  //'Ç'
    {200, "&Egrave;"},  //'È'
    {201, "&Eacute;"},  //'É'
    {202, "&Ecirc;"},   //'Ê'
    {203, "&Euml;"},    //'Ë'
    {204, "&Igrave;"},  //'Ì'
    {205, "&Iacute;"},  //'Í'
    {206, "&Icirc;"},   //'Î'
    {207, "&Iuml;"},    //'Ï'
    {208, "&ETH;"},     //'Ð' - capital Eth, Icelandic
    {209, "&Ntilde;"},  //'Ñ'
    {210, "&Ograve;"},  //'Ò'
    {211, "&Oacute;"},  //'Ó'
    {212, "&Ocirc;"},   //'Ô'
    {213, "&Otilde;"},  //'Õ'
    {214, "&Ouml;"},    //'Ö'
    {215, "&times;"},   //'×' - multiply sign
    {216, "&Oslash;"},  //'Ø'
    {217, "&Ugrave;"},  //'Ù'
    {218, "&Uacute;"},  //'Ú'
    {219, "&Ucirc;"},   //'Û'
    {220, "&Uuml;"},    //'Ü'
    {221, "&Yacute;"},  //'Ý'
    {222, "&THORN;"},   //'Þ' - capital THORN, Icelandic
    {223, "&szlig;"},   //'ß'
    {224, "&agrave;"},  //'à'
    {225, "&aacute;"},  //'á'
    {226, "&acirc;"},   //'â'
    {227, "&atilde;"},  //'ã'
    {228, "&auml;"},    //'ä'
    {229, "&aring;"},   //'å'

    {230, "&aelig;"},   //'æ'
    {231, "&ccedil;"},  //'ç'
    {232, "&egrave;"},  //'è'
    {233, "&eacute;"},  //'é'
    {234, "&ecirc;"},   //'ê'
    {235, "&euml;"},    //'ë'
    {236, "&igrave;"},  //'ì'
    {237, "&iacute;"},  //'í'
    {238, "&icirc;"},   //'î'
    {239, "&iuml;"},    //'ï'
    {240, "&eth;"},     //'ð' - small eth, Icelandic
    {241, "&ntilde;"},  //'ñ'
    {242, "&ograve;"},  //'ò'
    {243, "&oacute;"},  //'ó'
    {244, "&ocirc;"},   //'ô'
    {245, "&otilde;"},  //'õ'
    {246, "&ouml;"},    //'ö'

    {248, "&oslash;"},  //'ø'
    {249, "&ugrave;"},  //'ù'
    {250, "&uacute;"},  //'ú'
    {251, "&ucirc;"},   //'û'
    {252, "&uuml;"},    //'ü'
    {253, "&yacute;"},  //'ý'
    {254, "&thorn;"},   //'þ' - small thorn, Icelandic
    {255, "&yuml;"},    //'ÿ'

    {338, "&OElig;"},   //'Œ'
    {339, "&oelig;"},   //'œ'
    {352, "&Scaron;"},  //'Š'
    {353, "&scaron;"},  //'š'
    {376, "&Yuml;"},    //'Ÿ'
    {402, "&fnof;"},    //'ƒ'

    {8211, "&ndash;"},  //'–' - en dash (demi-cadratin)
    {8212, "&mdash;"},  //'—' - em dash (cadratin)

    {8249, "&lsaquo;"}, //'‹' - left single guillemet
    {8250, "&rsaquo;"}, //'›' - right single guillemet
    {8364, "&euro;"},   //'€'
    {8482, "&trade;"},  //'™' - trademark

    //TOFIX add more chars if needed
};

// Converts one hexadecimal digit character to its numeric value.
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'; returns 0..15, or -1 when ch is not
// a hexadecimal digit.
static int hexVal(char ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'f')
        return (ch - 'a') + 10;
    if (ch >= 'A' && ch <= 'F')
        return (ch - 'A') + 10;
    return -1;
} // hexVal

#define SIZE_OF(x) (sizeof(x)/sizeof(x[0]))

162

163

static int table_bin_search_char(gunichar chFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);

164

static int table_bin_search_escape(const char *szFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);

165

166

class TblComparator{

167

public:

168

bool operator()(const HtmlEscape &a, const HtmlEscape &b)

169

{

170

//operator < (is a<b ?)

171

return (strcmp(a.szEscape, b.szEscape) < 0);

172

};

173

};

174

175

// Puts the parser into its initial state and, the first time any parser is
// constructed, builds g_lstTableSort2: a copy of _table_char sorted by
// escape sequence text, which table_bin_search_escape() searches.
// NOTE(review): the one-time initialization below is not synchronized.
HTMLParser::HTMLParser()
{
    Clear();

    m_bAllowUnescapedInPreTag = false;
    m_bInsidePreTag = false;

    //create new sort table (create only once - global object)
    if(g_lstTableSort2.empty())
    {
        g_lstTableSort2.assign(_table_char, _table_char + SIZE_OF(_table_char));
        std::sort(g_lstTableSort2.begin(), g_lstTableSort2.end(), TblComparator());
    }
}

// Empty destructor body: nothing is released explicitly here.
HTMLParser::~HTMLParser()
{
}

void HTMLParser::Clear()

197

{

198

m_nState = PARSER_STATE_BLANK;

199

m_strData.erase(m_strData.begin(), m_strData.end());

200

}

201

202

bool HTMLParser::Parse(const char *szBuffer, int len)

203

{

204

if(len < 0)

205

return false;

206

207

for(int i=0; i<len; i++)

208

{

209

if(PARSER_STATE_INSIDE_COMMENT == m_nState)

210

{

211

if( m_strData.size()>2 &&

212

0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment

213

{

214

TRACE("HTML Parser: Comment ended\n");

215

m_strData.erase(m_strData.size()-2, 2); //remove "--" ending

216

OnComment(m_strData.c_str()+3); //trigger event

217

m_strData.erase(m_strData.begin(), m_strData.end());

218

m_nState = PARSER_STATE_BLANK;

219

}

220

else

221

m_strData += szBuffer[i];

222

}

223

else if(PARSER_STATE_INSIDE_TAG == m_nState)

224

{

225

//check for the end of tag

226

if(szBuffer[i] == '>'){

227

if(0 == strncmp("!--", m_strData.c_str(), 3)) //is tag comment

228

{

229

m_nState = PARSER_STATE_INSIDE_COMMENT;

230

231

if(0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment

232

{

233

TRACE("HTML Parser: Comment ended\n");

234

m_strData.erase(m_strData.size()-2, 2); //remove "--" ending

235

OnComment(m_strData.c_str()+3); //trigger event

236

m_strData.erase(m_strData.begin(), m_strData.end());

237

m_nState = PARSER_STATE_BLANK;

238

}

239

else

240

m_strData += szBuffer[i];

241

}

242

else if(!m_strData.empty() && m_nState != PARSER_STATE_INSIDE_COMMENT)

243

{

244

if(m_strData.at(0) == '/') //is ending tag

245

{

246

//strip everything after first space within tag to get real tag name

247

std::string strTag(m_strData.c_str()+1);

248

int nPos = strTag.find_first_of(' ');

249

if(nPos >= 0)

250

strTag.erase(strTag.begin()+nPos);

251

252

if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag) || (0 == strcasecmp(strTag.c_str(), "PRE")))

253

{

254

TRACE("HTML Parser: Tag end found (%s)\n", strTag.c_str());

255

OnTagEnd(strTag.c_str()); //trigger event

256

m_strData.erase(m_strData.begin(), m_strData.end());

257

if(0 == strcasecmp(strTag.c_str(), "PRE"))

258

m_bInsidePreTag = false;

259

}

260

else

261

{

262

TRACE("HTML Parser: Push text (%s)\n", m_strData.c_str());

263

m_nState = PARSER_STATE_BLANK;

264

OnText(m_strData.c_str()); //trigger event for previous contents

265

m_strData.erase(m_strData.begin(), m_strData.end());

266

}

267

}

268

else if(m_nState != PARSER_STATE_INSIDE_COMMENT)

269

{

270

std::string strTag(m_strData.c_str());

271

std::string strParams;

272

273

int nPos = strTag.find_first_of(' ');

274

if(nPos >= 0){

275

strTag = strTag.substr(0, nPos);

276

strParams = m_strData.substr(nPos);

277

}

278

279

if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag)){

280

TRACE("HTML Parser: Tag start found (%s)[%s]\n", strTag.c_str(), strParams.c_str());

281

OnTagBegin(strTag.c_str(), strParams.c_str()); //trigger event

282

m_strData.erase(m_strData.begin(), m_strData.end());

283

m_nState = PARSER_STATE_BLANK;

284

if(0 == strcasecmp(strTag.c_str(), "PRE"))

285

m_bInsidePreTag = true;

286

}

287

else{

288

m_nState = PARSER_STATE_BLANK;

289

TRACE("HTML Parser: Push text1 (%s)\n", m_strData.c_str());

290

OnText(m_strData.c_str()); //trigger event for previous contents

291

m_strData.erase(m_strData.begin(), m_strData.end());

292

}

293

}

294

}

295

296

if(PARSER_STATE_INSIDE_COMMENT != m_nState)

297

Clear();

298

}

299

else

300

m_strData += szBuffer[i];

301

}

302

else

303

{

304

//check for the start of tag

305

if(szBuffer[i] == '<' && m_nState == PARSER_STATE_BLANK)

306

{

307

if(!m_strData.empty())

308

{

309

TRACE("HTML Parser: Push text2 (%s)\n", m_strData.c_str());

310

OnText(m_strData.c_str()); //trigger event for previous contents

311

m_strData.erase(m_strData.begin(), m_strData.end());

312

}

313

m_nState = PARSER_STATE_INSIDE_TAG;

314

}

315

else{

316

m_strData += szBuffer[i];

317

}

318

}

319

}

320

321

return true;

322

}

323

324

void HTMLParser::Finalize()

325

{

326

if(!m_strData.empty() && m_nState == PARSER_STATE_BLANK)

327

OnText(m_strData.c_str()); //trigger event for previous contents

328

m_strData.erase(m_strData.begin(), m_strData.end());

329

}

330

331

void HTMLParser::EscapeURI(std::string &data)

332

{

333

//TOFIX replace non-ASCII characters by converting each byte to %HH, where HH is the hexadecimal notation of the byte value

334

replaceall(data, " ", "%20");

335

replaceall(data, "&", "&");

336

}

337

338

void HTMLParser::UnescapeURI(std::string &data)

339

{

340

#if GTK_CHECK_VERSION(2,16,0)

341

char *szRes = g_uri_unescape_string(data.c_str(), NULL);

342

if(szRes){

343

data = szRes;

344

g_free(szRes);

345

}

346

#else

347

int nStart = 0;

348

unsigned int nSize = data.size();

349

std::string::size_type nPos;

350

while ((nPos = data.find('%', nStart)) != std::string::npos)

351

{

352

if(nPos + 2 < nSize) // two chars after %

353

{

354

int a, b;

355

if ((a = hexVal(data[nPos+1])) != -1)

356

{

357

if ((b = hexVal(data[nPos+2])) != -1)

358

{

359

gunichar cChar = ((a * 16) + b);

360

//gchar szText[10];

361

//int nWritten = g_unichar_to_utf8(cChar, szText);

362

//szText[nWritten] = '\0';

363

data.erase(nPos, 3);

364

//data.insert(nPos, szText);

365

data.insert(data.begin()+nPos, (char)cChar);

366

367

//nStart = nPos + nWritten;

368

nStart = nPos + 1;

369

nSize -= 3;

370

//nSize += nWritten;

371

nSize += 1;

372

}

373

else

374

nStart = nPos + 3;

375

}

376

else

377

nStart = nPos + 3;

378

}

379

else

380

break;

381

}

382

383

//TOFIX

384

replaceall(data, "&", "&");

385

#endif

386

}

387

388

void HTMLParser::EscapeChars(std::string &data)

389

{

390

unsigned int nPos, nWidth;

391

const char *szStart = data.c_str();

392

const char *szString = szStart;

393

394

//using UTF-8 characters

395

while(NULL != szString && '\0' != *szString)

396

{

397

int nSkip = 0;

398

gunichar chLetter = g_utf8_get_char (szString);

399

const char *szNext = g_utf8_find_next_char(szString, NULL);

400

401

//TRACE("String to escape: %s\n", szString);

402

403

int nRes = table_bin_search_char(chLetter);

404

if(nRes >= 0)

405

{

406

//replace escape sequence with original special char

407

nPos = szString - szStart;

408

nWidth = szNext - szString;

409

nSkip = strlen(_table_char[nRes].szEscape);

410

411

//TRACE("Escape: %d (width=%d) to %s\n", chLetter, nWidth, _table_char[nRes].szEscape);

412

413

//FIX: data = data.substr(0, nPos) + _table_char[nRes].szEscape + data.substr(nPos+nWidth, 1000000);

414

data.erase(nPos, nWidth);

415

data.insert(nPos, _table_char[nRes].szEscape);

416

417

//TRACE("Escaped line: %s\n", data.c_str());

418

419

szStart = data.c_str(); //in case string was reallocated

420

szString = szStart + nPos + nSkip;

421

}

422

else

423

szString = szNext;

424

}

425

}

426

427

void HTMLParser::UnescapeChars(std::string &data)

428

{

429

unsigned int nPos = 0;

430

while(1)

431

{

432

int nPosStart = data.find('&', nPos);

433

if(nPosStart < 0)

434

break;

435

436

int nPosEnd = data.find(';', nPosStart+1);

437

if(nPosEnd >= 0)

438

{

439

//extract escape sequence

440

std::string strChar = data.substr(nPosStart, nPosEnd-nPosStart+1);

441

//TRACE("Escape sequence %s found!\n", strChar.c_str());

442

443

int nRes = table_bin_search_escape(strChar.c_str());

444

if(nRes >= 0)

445

{

446

//replace escape sequence with original UTF-8 character

447

char szBuffer[20];

448

int nBytes = g_unichar_to_utf8(g_lstTableSort2[nRes].cLetter, szBuffer);

449

szBuffer[nBytes] = '\0';

450

451

//FIX: data = data.substr(0, nPosStart) + szBuffer + data.substr(nPosEnd+1, 1000000);

452

data.erase(nPosStart, nPosEnd+1-nPosStart);

453

data.insert(nPosStart, szBuffer);

454

}

455

else

456

TRACE("ERROR: HTML escape sequence %s is not supported yet!\n", strChar.c_str());

457

}

458

else

459

break; //no sequence found

460

461

nPos = nPosStart+1;

462

}

463

}

464

465

//use binary search to speed up convertion

466

int table_bin_search_char(gunichar chFind, int nLeft, int nRight)

467

{

468

if(nLeft > nRight) return -1; //no match found

469

470

//check middle of the range

471

int nMid = (nLeft + nRight)/2;

472

if(chFind == _table_char[nMid].cLetter)

473

return nMid; //match found

474

475

if(nLeft == nRight) return -1; //no match found

476

477

if(chFind < _table_char[nMid].cLetter)

478

return table_bin_search_char(chFind, nLeft, nMid-1); //search lower half

479

else

480

return table_bin_search_char(chFind, nMid+1, nRight); //search upper half

481

}

482

483

int table_bin_search_escape(const char *szFind, int nLeft, int nRight)

484

{

485

//TRACE("bin search [Escape:%s], l=%d, r=%d\n", szFind, nLeft, nRight);

486

487

if(nLeft > nRight) {

488

//TRACE("bin search: no match found\n");

489

return -1; //no match found

490

}

491

492

//check middle of the range

493

int nMid = (nLeft + nRight)/2;

494

if(0 == strcmp(szFind, g_lstTableSort2[nMid].szEscape)){

495

//TRACE("bin search found [Escape:%s], m=%d\n", szFind, nMid);

496

return nMid; //match found

497

}

498

499

if(nLeft == nRight){

500

//TRACE("bin search: no match found\n");

501

return -1; //no match found

502

}

503

504

if(strcmp(szFind, g_lstTableSort2[nMid].szEscape) < 0)

505

{

506

//TRACE("Search lower half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);

507

return table_bin_search_escape(szFind, nLeft, nMid-1); //search lower half

508

}

509

else{

510

//TRACE("Search upper half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);

511

return table_bin_search_escape(szFind, nMid+1, nRight); //search upper half

512

}

513

}

514

515

// True for characters that can be part of an attribute name; used to tell a
// whole attribute name from the tail of a longer one.
static bool parser_is_attr_name_char(char ch)
{
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
           (ch >= '0' && ch <= '9') ||
           ch == '-' || ch == '_' || ch == ':' || ch == '.';
}

// Extracts the value of the attribute szParam from a tag parameter string.
// Only the form name="value" (double quotes, no spaces around '=') is
// recognized and the name is matched case-sensitively. A match that is only
// the tail of a longer attribute name (e.g. "src" inside data-src="...") is
// skipped.
// On success stores the text between the quotes in resValue and returns
// true; otherwise returns false and leaves resValue untouched.
bool HTMLParser::ExtractParam(const std::string &data, const char *szParam, std::string &resValue)
{
    if(NULL == szParam)
        return false;

    std::string strPattern(szParam);
    strPattern += "=\"";

    std::string::size_type nPos = data.find(strPattern);
    while(nPos != std::string::npos && nPos > 0 && parser_is_attr_name_char(data[nPos-1]))
        nPos = data.find(strPattern, nPos+1);
    if(nPos == std::string::npos)
        return false;   // not found

    const std::string::size_type nValue = nPos + strPattern.size();
    const std::string::size_type nEnd = data.find('"', nValue);
    if(nEnd == std::string::npos)
        return false;   // closing quote missing

    resValue = data.substr(nValue, nEnd-nValue);
    return true;
}

Older »