1
////////////////////////////////////////////////////////////////////////////
2
// NoteCase notes manager project <http://notecase.sf.net>
4
// This code is licensed under the BSD license. See "license.txt" for more details.
6
// File: Implements basic HTML parser class
7
////////////////////////////////////////////////////////////////////////////
9
#include "HtmlParser.h"
18
#include <strings.h> //strcasecmp
20
#define strcasecmp stricmp
23
// Helper implemented elsewhere. Judging by its name and its uses in this file it
// replaces occurrences of szFind inside strData with szReplace.
// NOTE(review): confirm how it behaves when szReplace contains szFind.
void replaceall(std::string &strData, const char *szFind, const char *szReplace);
26
// Values stored in m_nState by HTMLParser::Clear() and HTMLParser::Parse().
#define PARSER_STATE_BLANK 0
27
#define PARSER_STATE_INSIDE_TAG 1
28
#define PARSER_STATE_INSIDE_COMMENT 2
30
// Html escape sequences table
36
// Second copy of the escape table. The HTMLParser constructor fills it from
// _table_char and sorts it; table_bin_search_escape() searches it.
static std::vector<HtmlEscape> g_lstTableSort2;
38
//table is sorted by first field to enable binary search
39
// Character <-> HTML named entity table.
// Each row is { Unicode code point, entity text }. Rows must stay sorted by
// ascending code point because table_bin_search_char() binary-searches this
// array directly; the constructor builds a second copy sorted by entity text.
// NOTE(review): no rows for the ampersand, less-than and greater-than signs are
// present here - confirm whether they should be added (keys 38, 60 and 62,
// placed right after the quote row to keep the order).
static const HtmlEscape _table_char[] =
{
    {'\"', "&quot;"},   //=34
    {160, "&nbsp;"},    //non-breaking space (U+00A0); numeric key keeps the table sorted
    {161, "&iexcl;"},   //'¡' - inverted exclamation mark
    {162, "&cent;"},    //'¢'
    {163, "&pound;"},   //'£'
    {164, "&curren;"},  //'¤'
    {166, "&brvbar;"},  //'¦' - broken (vertical) bar
    {167, "&sect;"},    //'§' - section sign
    {168, "&uml;"},     //'¨' - umlaut
    {169, "&copy;"},    //'©' - copyright sign
    {170, "&ordf;"},    //'ª' - feminine ordinal
    {171, "&laquo;"},   //'«' - left guillemet
    {174, "&reg;"},     //'®' - registered sign
    {176, "&deg;"},     //'°' - degree sign
    {177, "&plusmn;"},  //'±' - plus or minus
    {178, "&sup2;"},    //'²' - superscript two
    {179, "&sup3;"},    //'³' - superscript three
    {187, "&raquo;"},   //'»' - right guillemet
    {192, "&Agrave;"},  //'À'
    {193, "&Aacute;"},  //'Á'
    {194, "&Acirc;"},   //'Â'
    {195, "&Atilde;"},  //'Ã'
    {196, "&Auml;"},    //'Ä'
    {197, "&Aring;"},   //'Å'
    {198, "&AElig;"},   //'Æ'
    {199, "&Ccedil;"},  //'Ç'
    {200, "&Egrave;"},  //'È'
    {201, "&Eacute;"},  //'É'
    {202, "&Ecirc;"},   //'Ê'
    {203, "&Euml;"},    //'Ë'
    {204, "&Igrave;"},  //'Ì'
    {205, "&Iacute;"},  //'Í'
    {206, "&Icirc;"},   //'Î'
    {207, "&Iuml;"},    //'Ï'
    {208, "&ETH;"},     //'Ð' - capital Eth, Icelandic
    {209, "&Ntilde;"},  //'Ñ'
    {210, "&Ograve;"},  //'Ò'
    {211, "&Oacute;"},  //'Ó'
    {212, "&Ocirc;"},   //'Ô'
    {213, "&Otilde;"},  //'Õ'
    {214, "&Ouml;"},    //'Ö'
    {215, "&times;"},   //'×' - multiply sign
    {216, "&Oslash;"},  //'Ø'
    {217, "&Ugrave;"},  //'Ù'
    {218, "&Uacute;"},  //'Ú'
    {219, "&Ucirc;"},   //'Û'
    {220, "&Uuml;"},    //'Ü'
    {221, "&Yacute;"},  //'Ý'
    {222, "&THORN;"},   //'Þ' - capital THORN, Icelandic
    {223, "&szlig;"},   //'ß'
    {224, "&agrave;"},  //'à'
    {225, "&aacute;"},  //'á'
    {226, "&acirc;"},   //'â'
    {227, "&atilde;"},  //'ã'
    {228, "&auml;"},    //'ä'
    {229, "&aring;"},   //'å'
    {230, "&aelig;"},   //'æ'
    {231, "&ccedil;"},  //'ç'
    {232, "&egrave;"},  //'è'
    {233, "&eacute;"},  //'é'
    {234, "&ecirc;"},   //'ê'
    {235, "&euml;"},    //'ë'
    {236, "&igrave;"},  //'ì'
    {237, "&iacute;"},  //'í'
    {238, "&icirc;"},   //'î'
    {239, "&iuml;"},    //'ï'
    {240, "&eth;"},     //'ð' - small eth, Icelandic
    {241, "&ntilde;"},  //'ñ'
    {242, "&ograve;"},  //'ò'
    {243, "&oacute;"},  //'ó'
    {244, "&ocirc;"},   //'ô'
    {245, "&otilde;"},  //'õ'
    {246, "&ouml;"},    //'ö'
    {248, "&oslash;"},  //'ø'
    {249, "&ugrave;"},  //'ù'
    {250, "&uacute;"},  //'ú'
    {251, "&ucirc;"},   //'û'
    {252, "&uuml;"},    //'ü'
    {253, "&yacute;"},  //'ý'
    {254, "&thorn;"},   //'þ' - small thorn, Icelandic
    {255, "&yuml;"},    //'ÿ'
    {338, "&OElig;"},   //'Œ'
    {339, "&oelig;"},   //'œ'
    {352, "&Scaron;"},  //'Š'
    {353, "&scaron;"},  //'š'
    {376, "&Yuml;"},    //'Ÿ'
    {402, "&fnof;"},    //'ƒ'
    {8211, "&ndash;"},  //'–' - en dash (demi-cadratin)
    {8212, "&mdash;"},  //'—' - em dash (cadratin)
    {8249, "&lsaquo;"}, //'‹' - left single guillemet
    {8250, "&rsaquo;"}, //'›' - right single guillemet
    {8364, "&euro;"},   //'€'
    {8482, "&trade;"},  //'™' - trademark
};
147
//TOFIX add more chars if needed
150
// Converts one hexadecimal digit to its numeric value.
//   ch - character to convert; 0-9, a-f and A-F are accepted
// Returns 0..15 for a valid digit, or -1 for any other character
// (UnescapeURI() tests the result against -1).
static int hexVal(char ch)
{
    if ((ch >= 'a') && (ch <= 'f'))
        return (ch - 'a') + 10;
    else if ((ch >= 'A') && (ch <= 'F'))
        return (ch - 'A') + 10;
    else if ((ch >= '0') && (ch <= '9'))
        return (ch - '0');

    return -1; // not a hexadecimal digit
}
161
// Element count of a C array. The argument must be an actual array, not a pointer.
#define SIZE_OF(x) (sizeof(x)/sizeof(x[0]))
163
// Binary searches over the escape tables, defined further down in this file.
// Both take an inclusive index range that defaults to every index of
// _table_char, and both return -1 when nothing matches.
static int table_bin_search_char(gunichar chFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
164
static int table_bin_search_escape(const char *szFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
168
// Ordering predicate: true when the escape string of a sorts before the escape
// string of b in strcmp order.
// NOTE(review): the enclosing declaration and the braces are not visible in this listing.
bool operator()(const HtmlEscape &a, const HtmlEscape &b)
170
//operator < (is a<b ?)
171
return (strcmp(a.szEscape, b.szEscape) < 0);
175
// Constructor: switches off both flags used for PRE handling and, when the
// file-level table g_lstTableSort2 is still empty, fills it with every row of
// _table_char and sorts it with cmp.
// NOTE(review): braces and some lines are missing from this listing, so the
// nesting of the loop and of std::sort under the empty() check is inferred;
// cmp is presumably the comparator that orders rows by escape string - confirm.
HTMLParser::HTMLParser()
179
m_bAllowUnescapedInPreTag = false;
180
m_bInsidePreTag = false;
182
//create new sort table (create only once - global object)
183
if(g_lstTableSort2.empty())
185
for(unsigned int i=0; i<SIZE_OF(_table_char); i++)
186
g_lstTableSort2.push_back(_table_char[i]);
188
std::sort(g_lstTableSort2.begin(), g_lstTableSort2.end(), cmp);
192
// Destructor. NOTE(review): the body is not visible in this listing.
HTMLParser::~HTMLParser()
196
void HTMLParser::Clear()
198
m_nState = PARSER_STATE_BLANK;
199
m_strData.erase(m_strData.begin(), m_strData.end());
202
// Feeds len bytes of szBuffer through a three-state machine
// (PARSER_STATE_BLANK, PARSER_STATE_INSIDE_TAG, PARSER_STATE_INSIDE_COMMENT).
// Bytes are accumulated in m_strData and reported through the OnText,
// OnTagBegin, OnTagEnd and OnComment callbacks. m_nState and m_strData are not
// reset here, so parsing state carries over from one call to the next.
// NOTE(review): this listing is missing lines (braces, some if/else lines and
// the return statement) and contains stray numeric lines, so the branch nesting
// described in the comments below is inferred - confirm against the full file.
bool HTMLParser::Parse(const char *szBuffer, int len)
207
for(int i=0; i<len; i++)
209
// State: inside a comment. The buffered text is tested for a trailing double
// dash; when present the comment is reported, otherwise the byte is buffered.
// NOTE(review): no test of the current byte is visible here, so a double dash
// in the middle of a comment may end it early - confirm.
if(PARSER_STATE_INSIDE_COMMENT == m_nState)
211
if( m_strData.size()>2 &&
212
0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment
214
TRACE("HTML Parser: Comment ended\n");
215
m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
216
// +3 skips the three-character comment opener kept at the start of the buffer
OnComment(m_strData.c_str()+3); //trigger event
217
m_strData.erase(m_strData.begin(), m_strData.end());
218
m_nState = PARSER_STATE_BLANK;
221
m_strData += szBuffer[i];
223
// State: inside a tag. Nothing is decided until the closing angle bracket arrives.
else if(PARSER_STATE_INSIDE_TAG == m_nState)
225
//check for the end of tag
226
if(szBuffer[i] == '>'){
227
// A buffer that starts with the comment opener is handled as a comment. If it
// also ends with a double dash the comment is complete and reported at once.
if(0 == strncmp("!--", m_strData.c_str(), 3)) //is tag comment
229
m_nState = PARSER_STATE_INSIDE_COMMENT;
231
if(0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment
233
TRACE("HTML Parser: Comment ended\n");
234
m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
235
OnComment(m_strData.c_str()+3); //trigger event
236
m_strData.erase(m_strData.begin(), m_strData.end());
237
m_nState = PARSER_STATE_BLANK;
240
m_strData += szBuffer[i];
242
else if(!m_strData.empty() && m_nState != PARSER_STATE_INSIDE_COMMENT)
244
// A leading slash marks an end tag; the tag name starts after it.
if(m_strData.at(0) == '/') //is ending tag
246
//strip everything after first space within tag to get real tag name
247
std::string strTag(m_strData.c_str()+1);
248
int nPos = strTag.find_first_of(' ');
250
// NOTE(review): erase() with a single iterator removes one character only, not
// the rest of the string; any guard on nPos is not visible here - confirm.
strTag.erase(strTag.begin()+nPos);
252
// The end tag is reported unless unescaped PRE content is allowed and the
// parser is inside PRE; the closing PRE tag itself is always reported.
if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag) || (0 == strcasecmp(strTag.c_str(), "PRE")))
254
TRACE("HTML Parser: Tag end found (%s)\n", strTag.c_str());
255
OnTagEnd(strTag.c_str()); //trigger event
256
m_strData.erase(m_strData.begin(), m_strData.end());
257
if(0 == strcasecmp(strTag.c_str(), "PRE"))
258
m_bInsidePreTag = false;
262
// Presumably the else branch of the test above: the buffered tag is passed on as text.
TRACE("HTML Parser: Push text (%s)\n", m_strData.c_str());
263
m_nState = PARSER_STATE_BLANK;
264
OnText(m_strData.c_str()); //trigger event for previous contents
265
m_strData.erase(m_strData.begin(), m_strData.end());
268
// Start tag: text before the first space is the tag name, the rest the parameters.
else if(m_nState != PARSER_STATE_INSIDE_COMMENT)
270
std::string strTag(m_strData.c_str());
271
std::string strParams;
273
int nPos = strTag.find_first_of(' ');
275
// NOTE(review): presumably guarded by a check that a space was found - the guard is not visible here.
strTag = strTag.substr(0, nPos);
276
strParams = m_strData.substr(nPos);
279
// Start tags are reported unless unescaped PRE content is allowed and the parser is inside PRE.
if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag)){
280
TRACE("HTML Parser: Tag start found (%s)[%s]\n", strTag.c_str(), strParams.c_str());
281
OnTagBegin(strTag.c_str(), strParams.c_str()); //trigger event
282
m_strData.erase(m_strData.begin(), m_strData.end());
283
m_nState = PARSER_STATE_BLANK;
284
if(0 == strcasecmp(strTag.c_str(), "PRE"))
285
m_bInsidePreTag = true;
288
// Presumably the else branch: inside PRE the tag is passed on as text.
m_nState = PARSER_STATE_BLANK;
289
TRACE("HTML Parser: Push text1 (%s)\n", m_strData.c_str());
290
OnText(m_strData.c_str()); //trigger event for previous contents
291
m_strData.erase(m_strData.begin(), m_strData.end());
296
if(PARSER_STATE_INSIDE_COMMENT != m_nState)
300
m_strData += szBuffer[i];
304
//check for the start of tag
305
// An opening angle bracket seen in the blank state flushes pending text through
// OnText and switches the machine to PARSER_STATE_INSIDE_TAG.
if(szBuffer[i] == '<' && m_nState == PARSER_STATE_BLANK)
307
if(!m_strData.empty())
309
TRACE("HTML Parser: Push text2 (%s)\n", m_strData.c_str());
310
OnText(m_strData.c_str()); //trigger event for previous contents
311
m_strData.erase(m_strData.begin(), m_strData.end());
313
m_nState = PARSER_STATE_INSIDE_TAG;
316
m_strData += szBuffer[i];
324
void HTMLParser::Finalize()
326
if(!m_strData.empty() && m_nState == PARSER_STATE_BLANK)
327
OnText(m_strData.c_str()); //trigger event for previous contents
328
m_strData.erase(m_strData.begin(), m_strData.end());
331
void HTMLParser::EscapeURI(std::string &data)
333
//TOFIX replace non-ASCII characters by converting each byte to %HH, where HH is the hexadecimal notation of the byte value
334
replaceall(data, " ", "%20");
335
replaceall(data, "&", "&");
338
// Reverses URI escaping in place. When built against a new enough GTK the work
// is handed to g_uri_unescape_string(); the remaining visible code scans for
// percent signs and turns a percent sign followed by two hex digits into one byte.
// NOTE(review): several lines are missing from this listing (the declarations
// of nStart, a and b, the handling of szRes, the preprocessor else/endif, the
// removal of the three escaped characters and the update of nStart), so only
// the visible statements are described.
void HTMLParser::UnescapeURI(std::string &data)
340
#if GTK_CHECK_VERSION(2,16,0)
341
char *szRes = g_uri_unescape_string(data.c_str(), NULL);
348
// nSize is read once here; nothing visible refreshes it after data is modified.
unsigned int nSize = data.size();
349
std::string::size_type nPos;
350
while ((nPos = data.find('%', nStart)) != std::string::npos)
352
if(nPos + 2 < nSize) // two chars after %
355
// hexVal() yields -1 for anything that is not a hexadecimal digit
if ((a = hexVal(data[nPos+1])) != -1)
357
if ((b = hexVal(data[nPos+2])) != -1)
359
// combine the two digits into one byte value (high digit first)
gunichar cChar = ((a * 16) + b);
361
//int nWritten = g_unichar_to_utf8(cChar, szText);
362
//szText[nWritten] = '\0';
364
//data.insert(nPos, szText);
365
// the decoded value is inserted as a single raw byte, not re-encoded as UTF-8
data.insert(data.begin()+nPos, (char)cChar);
367
//nStart = nPos + nWritten;
384
// NOTE(review): as written this replaces an ampersand with itself; the search
// text was presumably the entity form of the ampersand - confirm.
replaceall(data, "&", "&");
388
// Walks data one UTF-8 character at a time and replaces, in place, each
// character found in _table_char with the szEscape text of its row.
// NOTE(review): the declaration of nSkip, the test of nRes, the branch that
// advances past characters with no table row, and all braces are missing from
// this listing - confirm against the full file.
void HTMLParser::EscapeChars(std::string &data)
390
unsigned int nPos, nWidth;
391
const char *szStart = data.c_str();
392
const char *szString = szStart;
394
//using UTF-8 characters
395
while(NULL != szString && '\0' != *szString)
398
// decode the current character and locate the start of the next one
gunichar chLetter = g_utf8_get_char (szString);
399
const char *szNext = g_utf8_find_next_char(szString, NULL);
401
//TRACE("String to escape: %s\n", szString);
403
// row index in _table_char, or -1 when the character has no escape
int nRes = table_bin_search_char(chLetter);
406
//replace the special char with its escape sequence
407
nPos = szString - szStart;
408
nWidth = szNext - szString;
409
nSkip = strlen(_table_char[nRes].szEscape);
411
//TRACE("Escape: %d (width=%d) to %s\n", chLetter, nWidth, _table_char[nRes].szEscape);
413
//FIX: data = data.substr(0, nPos) + _table_char[nRes].szEscape + data.substr(nPos+nWidth, 1000000);
414
data.erase(nPos, nWidth);
415
data.insert(nPos, _table_char[nRes].szEscape);
417
//TRACE("Escaped line: %s\n", data.c_str());
419
// pointers into data are recomputed after the edit; scanning resumes just
// past the inserted escape text
szStart = data.c_str(); //in case string was reallocated
420
szString = szStart + nPos + nSkip;
427
// Replaces, in place, escape sequences of the form ampersand ... semicolon with
// the UTF-8 encoding of the matching character from g_lstTableSort2. Sequences
// with no table row are reported through TRACE.
// NOTE(review): the loop header, the checks on nPosStart / nPosEnd / nRes, the
// declaration of szBuffer, the update of nPos and all braces are missing from
// this listing - the control flow cannot be confirmed from here.
void HTMLParser::UnescapeChars(std::string &data)
429
unsigned int nPos = 0;
432
// find() results are stored in int, so a failed search shows up as a negative value
int nPosStart = data.find('&', nPos);
436
int nPosEnd = data.find(';', nPosStart+1);
439
//extract escape sequence
440
std::string strChar = data.substr(nPosStart, nPosEnd-nPosStart+1);
441
//TRACE("Escape sequence %s found!\n", strChar.c_str());
443
// index into g_lstTableSort2, or -1 when the sequence is unknown
int nRes = table_bin_search_escape(strChar.c_str());
446
//replace escape sequence with original UTF-8 character
448
// g_unichar_to_utf8 returns the number of bytes written; the buffer is then terminated by hand
int nBytes = g_unichar_to_utf8(g_lstTableSort2[nRes].cLetter, szBuffer);
449
szBuffer[nBytes] = '\0';
451
//FIX: data = data.substr(0, nPosStart) + szBuffer + data.substr(nPosEnd+1, 1000000);
452
data.erase(nPosStart, nPosEnd+1-nPosStart);
453
data.insert(nPosStart, szBuffer);
456
TRACE("ERROR: HTML escape sequence %s is not supported yet!\n", strChar.c_str());
459
break; //no sequence found
465
//use binary search to speed up convertion
466
int table_bin_search_char(gunichar chFind, int nLeft, int nRight)
468
if(nLeft > nRight) return -1; //no match found
470
//check middle of the range
471
int nMid = (nLeft + nRight)/2;
472
if(chFind == _table_char[nMid].cLetter)
473
return nMid; //match found
475
if(nLeft == nRight) return -1; //no match found
477
if(chFind < _table_char[nMid].cLetter)
478
return table_bin_search_char(chFind, nLeft, nMid-1); //search lower half
480
return table_bin_search_char(chFind, nMid+1, nRight); //search upper half
483
int table_bin_search_escape(const char *szFind, int nLeft, int nRight)
485
//TRACE("bin search [Escape:%s], l=%d, r=%d\n", szFind, nLeft, nRight);
488
//TRACE("bin search: no match found\n");
489
return -1; //no match found
492
//check middle of the range
493
int nMid = (nLeft + nRight)/2;
494
if(0 == strcmp(szFind, g_lstTableSort2[nMid].szEscape)){
495
//TRACE("bin search found [Escape:%s], m=%d\n", szFind, nMid);
496
return nMid; //match found
500
//TRACE("bin search: no match found\n");
501
return -1; //no match found
504
if(strcmp(szFind, g_lstTableSort2[nMid].szEscape) < 0)
506
//TRACE("Search lower half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
507
return table_bin_search_escape(szFind, nLeft, nMid-1); //search lower half
510
//TRACE("Search upper half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
511
return table_bin_search_escape(szFind, nMid+1, nRight); //search upper half
515
// Extracts the value of one parameter from a tag parameter string.
// Finds the search pattern built from szParam in data, then the next double
// quote after it, and stores the text between the two in resValue.
// The visible return statement reports false when nothing was found.
// NOTE(review): lines are missing from this listing - presumably the pattern
// is extended beyond szParam before the search, and true is returned once
// resValue has been set. Confirm against the full file.
bool HTMLParser::ExtractParam(const std::string &data, const char *szParam, std::string &resValue)
517
std::string strPattern = szParam;
520
std::string::size_type nPos = data.find(strPattern.c_str());
521
if(nPos != std::string::npos)
523
// the value ends at the first double quote after the pattern
std::string::size_type nEnd = data.find("\"", nPos+strPattern.size());
524
if(nEnd != std::string::npos){
525
resValue = data.substr(nPos+strPattern.size(), nEnd-nPos-strPattern.size());
529
return false; // not found
1
////////////////////////////////////////////////////////////////////////////
2
// NoteCase notes manager project <http://notecase.sf.net>
4
// This code is licensed under the BSD license. See "license.txt" for more details.
6
// File: Implements basic HTML parser class
7
////////////////////////////////////////////////////////////////////////////
9
#include "HtmlParser.h"
18
#include <strings.h> //strcasecmp
20
#define strcasecmp stricmp
23
// Helper implemented elsewhere. Judging by its name and its uses in this file it
// replaces occurrences of szFind inside strData with szReplace.
// NOTE(review): confirm how it behaves when szReplace contains szFind.
void replaceall(std::string &strData, const char *szFind, const char *szReplace);
26
// Values stored in m_nState by HTMLParser::Clear() and HTMLParser::Parse().
#define PARSER_STATE_BLANK 0
27
#define PARSER_STATE_INSIDE_TAG 1
28
#define PARSER_STATE_INSIDE_COMMENT 2
30
// Html escape sequences table
36
// Second copy of the escape table. The HTMLParser constructor fills it from
// _table_char and sorts it; table_bin_search_escape() searches it.
static std::vector<HtmlEscape> g_lstTableSort2;
38
//table is sorted by first field to enable binary search
39
// Character <-> HTML named entity table.
// Each row is { Unicode code point, entity text }. Rows must stay sorted by
// ascending code point because table_bin_search_char() binary-searches this
// array directly; the constructor builds a second copy sorted by entity text.
// NOTE(review): no rows for the ampersand, less-than and greater-than signs are
// present here - confirm whether they should be added (keys 38, 60 and 62,
// placed right after the quote row to keep the order).
static const HtmlEscape _table_char[] =
{
    {'\"', "&quot;"},   //=34
    {160, "&nbsp;"},    //non-breaking space (U+00A0); numeric key keeps the table sorted
    {161, "&iexcl;"},   //'¡' - inverted exclamation mark
    {162, "&cent;"},    //'¢'
    {163, "&pound;"},   //'£'
    {164, "&curren;"},  //'¤'
    {166, "&brvbar;"},  //'¦' - broken (vertical) bar
    {167, "&sect;"},    //'§' - section sign
    {168, "&uml;"},     //'¨' - umlaut
    {169, "&copy;"},    //'©' - copyright sign
    {170, "&ordf;"},    //'ª' - feminine ordinal
    {171, "&laquo;"},   //'«' - left guillemet
    {174, "&reg;"},     //'®' - registered sign
    {176, "&deg;"},     //'°' - degree sign
    {177, "&plusmn;"},  //'±' - plus or minus
    {178, "&sup2;"},    //'²' - superscript two
    {179, "&sup3;"},    //'³' - superscript three
    {187, "&raquo;"},   //'»' - right guillemet
    {192, "&Agrave;"},  //'À'
    {193, "&Aacute;"},  //'Á'
    {194, "&Acirc;"},   //'Â'
    {195, "&Atilde;"},  //'Ã'
    {196, "&Auml;"},    //'Ä'
    {197, "&Aring;"},   //'Å'
    {198, "&AElig;"},   //'Æ'
    {199, "&Ccedil;"},  //'Ç'
    {200, "&Egrave;"},  //'È'
    {201, "&Eacute;"},  //'É'
    {202, "&Ecirc;"},   //'Ê'
    {203, "&Euml;"},    //'Ë'
    {204, "&Igrave;"},  //'Ì'
    {205, "&Iacute;"},  //'Í'
    {206, "&Icirc;"},   //'Î'
    {207, "&Iuml;"},    //'Ï'
    {208, "&ETH;"},     //'Ð' - capital Eth, Icelandic
    {209, "&Ntilde;"},  //'Ñ'
    {210, "&Ograve;"},  //'Ò'
    {211, "&Oacute;"},  //'Ó'
    {212, "&Ocirc;"},   //'Ô'
    {213, "&Otilde;"},  //'Õ'
    {214, "&Ouml;"},    //'Ö'
    {215, "&times;"},   //'×' - multiply sign
    {216, "&Oslash;"},  //'Ø'
    {217, "&Ugrave;"},  //'Ù'
    {218, "&Uacute;"},  //'Ú'
    {219, "&Ucirc;"},   //'Û'
    {220, "&Uuml;"},    //'Ü'
    {221, "&Yacute;"},  //'Ý'
    {222, "&THORN;"},   //'Þ' - capital THORN, Icelandic
    {223, "&szlig;"},   //'ß'
    {224, "&agrave;"},  //'à'
    {225, "&aacute;"},  //'á'
    {226, "&acirc;"},   //'â'
    {227, "&atilde;"},  //'ã'
    {228, "&auml;"},    //'ä'
    {229, "&aring;"},   //'å'
    {230, "&aelig;"},   //'æ'
    {231, "&ccedil;"},  //'ç'
    {232, "&egrave;"},  //'è'
    {233, "&eacute;"},  //'é'
    {234, "&ecirc;"},   //'ê'
    {235, "&euml;"},    //'ë'
    {236, "&igrave;"},  //'ì'
    {237, "&iacute;"},  //'í'
    {238, "&icirc;"},   //'î'
    {239, "&iuml;"},    //'ï'
    {240, "&eth;"},     //'ð' - small eth, Icelandic
    {241, "&ntilde;"},  //'ñ'
    {242, "&ograve;"},  //'ò'
    {243, "&oacute;"},  //'ó'
    {244, "&ocirc;"},   //'ô'
    {245, "&otilde;"},  //'õ'
    {246, "&ouml;"},    //'ö'
    {248, "&oslash;"},  //'ø'
    {249, "&ugrave;"},  //'ù'
    {250, "&uacute;"},  //'ú'
    {251, "&ucirc;"},   //'û'
    {252, "&uuml;"},    //'ü'
    {253, "&yacute;"},  //'ý'
    {254, "&thorn;"},   //'þ' - small thorn, Icelandic
    {255, "&yuml;"},    //'ÿ'
    {338, "&OElig;"},   //'Œ'
    {339, "&oelig;"},   //'œ'
    {352, "&Scaron;"},  //'Š'
    {353, "&scaron;"},  //'š'
    {376, "&Yuml;"},    //'Ÿ'
    {402, "&fnof;"},    //'ƒ'
    {8211, "&ndash;"},  //'–' - en dash (demi-cadratin)
    {8212, "&mdash;"},  //'—' - em dash (cadratin)
    {8249, "&lsaquo;"}, //'‹' - left single guillemet
    {8250, "&rsaquo;"}, //'›' - right single guillemet
    {8364, "&euro;"},   //'€'
    {8482, "&trade;"},  //'™' - trademark
};
147
//TOFIX add more chars if needed
150
// Converts one hexadecimal digit to its numeric value.
//   ch - character to convert; 0-9, a-f and A-F are accepted
// Returns 0..15 for a valid digit, or -1 for any other character
// (UnescapeURI() tests the result against -1).
static int hexVal(char ch)
{
    if ((ch >= 'a') && (ch <= 'f'))
        return (ch - 'a') + 10;
    else if ((ch >= 'A') && (ch <= 'F'))
        return (ch - 'A') + 10;
    else if ((ch >= '0') && (ch <= '9'))
        return (ch - '0');

    return -1; // not a hexadecimal digit
}
161
// Element count of a C array. The argument must be an actual array, not a pointer.
#define SIZE_OF(x) (sizeof(x)/sizeof(x[0]))
163
// Binary searches over the escape tables, defined further down in this file.
// Both take an inclusive index range that defaults to every index of
// _table_char, and both return -1 when nothing matches.
static int table_bin_search_char(gunichar chFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
164
static int table_bin_search_escape(const char *szFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
168
// Ordering predicate: true when the escape string of a sorts before the escape
// string of b in strcmp order.
// NOTE(review): the enclosing declaration and the braces are not visible in this listing.
bool operator()(const HtmlEscape &a, const HtmlEscape &b)
170
//operator < (is a<b ?)
171
return (strcmp(a.szEscape, b.szEscape) < 0);
175
// Constructor: switches off both flags used for PRE handling and, when the
// file-level table g_lstTableSort2 is still empty, fills it with every row of
// _table_char and sorts it with cmp.
// NOTE(review): braces and some lines are missing from this listing, so the
// nesting of the loop and of std::sort under the empty() check is inferred;
// cmp is presumably the comparator that orders rows by escape string - confirm.
HTMLParser::HTMLParser()
179
m_bAllowUnescapedInPreTag = false;
180
m_bInsidePreTag = false;
182
//create new sort table (create only once - global object)
183
if(g_lstTableSort2.empty())
185
for(unsigned int i=0; i<SIZE_OF(_table_char); i++)
186
g_lstTableSort2.push_back(_table_char[i]);
188
std::sort(g_lstTableSort2.begin(), g_lstTableSort2.end(), cmp);
192
// Destructor. NOTE(review): the body is not visible in this listing.
HTMLParser::~HTMLParser()
196
void HTMLParser::Clear()
198
m_nState = PARSER_STATE_BLANK;
199
m_strData.erase(m_strData.begin(), m_strData.end());
202
// Feeds len bytes of szBuffer through a three-state machine
// (PARSER_STATE_BLANK, PARSER_STATE_INSIDE_TAG, PARSER_STATE_INSIDE_COMMENT).
// Bytes are accumulated in m_strData and reported through the OnText,
// OnTagBegin, OnTagEnd and OnComment callbacks. m_nState and m_strData are not
// reset here, so parsing state carries over from one call to the next.
// NOTE(review): this listing is missing lines (braces, some if/else lines and
// the return statement) and contains stray numeric lines, so the branch nesting
// described in the comments below is inferred - confirm against the full file.
bool HTMLParser::Parse(const char *szBuffer, int len)
207
for(int i=0; i<len; i++)
209
// State: inside a comment. The buffered text is tested for a trailing double
// dash; when present the comment is reported, otherwise the byte is buffered.
// NOTE(review): no test of the current byte is visible here, so a double dash
// in the middle of a comment may end it early - confirm.
if(PARSER_STATE_INSIDE_COMMENT == m_nState)
211
if( m_strData.size()>2 &&
212
0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment
214
TRACE("HTML Parser: Comment ended\n");
215
m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
216
// +3 skips the three-character comment opener kept at the start of the buffer
OnComment(m_strData.c_str()+3); //trigger event
217
m_strData.erase(m_strData.begin(), m_strData.end());
218
m_nState = PARSER_STATE_BLANK;
221
m_strData += szBuffer[i];
223
// State: inside a tag. Nothing is decided until the closing angle bracket arrives.
else if(PARSER_STATE_INSIDE_TAG == m_nState)
225
//check for the end of tag
226
if(szBuffer[i] == '>'){
227
// A buffer that starts with the comment opener is handled as a comment. If it
// also ends with a double dash the comment is complete and reported at once.
if(0 == strncmp("!--", m_strData.c_str(), 3)) //is tag comment
229
m_nState = PARSER_STATE_INSIDE_COMMENT;
231
if(0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment
233
TRACE("HTML Parser: Comment ended\n");
234
m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
235
OnComment(m_strData.c_str()+3); //trigger event
236
m_strData.erase(m_strData.begin(), m_strData.end());
237
m_nState = PARSER_STATE_BLANK;
240
m_strData += szBuffer[i];
242
else if(!m_strData.empty() && m_nState != PARSER_STATE_INSIDE_COMMENT)
244
// A leading slash marks an end tag; the tag name starts after it.
if(m_strData.at(0) == '/') //is ending tag
246
//strip everything after first space within tag to get real tag name
247
std::string strTag(m_strData.c_str()+1);
248
int nPos = strTag.find_first_of(' ');
250
// NOTE(review): erase() with a single iterator removes one character only, not
// the rest of the string; any guard on nPos is not visible here - confirm.
strTag.erase(strTag.begin()+nPos);
252
// The end tag is reported unless unescaped PRE content is allowed and the
// parser is inside PRE; the closing PRE tag itself is always reported.
if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag) || (0 == strcasecmp(strTag.c_str(), "PRE")))
254
TRACE("HTML Parser: Tag end found (%s)\n", strTag.c_str());
255
OnTagEnd(strTag.c_str()); //trigger event
256
m_strData.erase(m_strData.begin(), m_strData.end());
257
if(0 == strcasecmp(strTag.c_str(), "PRE"))
258
m_bInsidePreTag = false;
262
// Presumably the else branch of the test above: the buffered tag is passed on as text.
TRACE("HTML Parser: Push text (%s)\n", m_strData.c_str());
263
m_nState = PARSER_STATE_BLANK;
264
OnText(m_strData.c_str()); //trigger event for previous contents
265
m_strData.erase(m_strData.begin(), m_strData.end());
268
// Start tag: text before the first space is the tag name, the rest the parameters.
else if(m_nState != PARSER_STATE_INSIDE_COMMENT)
270
std::string strTag(m_strData.c_str());
271
std::string strParams;
273
int nPos = strTag.find_first_of(' ');
275
// NOTE(review): presumably guarded by a check that a space was found - the guard is not visible here.
strTag = strTag.substr(0, nPos);
276
strParams = m_strData.substr(nPos);
279
// Start tags are reported unless unescaped PRE content is allowed and the parser is inside PRE.
if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag)){
280
TRACE("HTML Parser: Tag start found (%s)[%s]\n", strTag.c_str(), strParams.c_str());
281
OnTagBegin(strTag.c_str(), strParams.c_str()); //trigger event
282
m_strData.erase(m_strData.begin(), m_strData.end());
283
m_nState = PARSER_STATE_BLANK;
284
if(0 == strcasecmp(strTag.c_str(), "PRE"))
285
m_bInsidePreTag = true;
288
// Presumably the else branch: inside PRE the tag is passed on as text.
m_nState = PARSER_STATE_BLANK;
289
TRACE("HTML Parser: Push text1 (%s)\n", m_strData.c_str());
290
OnText(m_strData.c_str()); //trigger event for previous contents
291
m_strData.erase(m_strData.begin(), m_strData.end());
296
if(PARSER_STATE_INSIDE_COMMENT != m_nState)
300
m_strData += szBuffer[i];
304
//check for the start of tag
305
// An opening angle bracket seen in the blank state flushes pending text through
// OnText and switches the machine to PARSER_STATE_INSIDE_TAG.
if(szBuffer[i] == '<' && m_nState == PARSER_STATE_BLANK)
307
if(!m_strData.empty())
309
TRACE("HTML Parser: Push text2 (%s)\n", m_strData.c_str());
310
OnText(m_strData.c_str()); //trigger event for previous contents
311
m_strData.erase(m_strData.begin(), m_strData.end());
313
m_nState = PARSER_STATE_INSIDE_TAG;
316
m_strData += szBuffer[i];
324
void HTMLParser::Finalize()
326
if(!m_strData.empty() && m_nState == PARSER_STATE_BLANK)
327
OnText(m_strData.c_str()); //trigger event for previous contents
328
m_strData.erase(m_strData.begin(), m_strData.end());
331
void HTMLParser::EscapeURI(std::string &data)
333
//TOFIX replace non-ASCII characters by converting each byte to %HH, where HH is the hexadecimal notation of the byte value
334
replaceall(data, " ", "%20");
335
replaceall(data, "&", "&");
338
// Reverses URI escaping in place. When built against a new enough GTK the work
// is handed to g_uri_unescape_string(); the remaining visible code scans for
// percent signs and turns a percent sign followed by two hex digits into one byte.
// NOTE(review): several lines are missing from this listing (the declarations
// of nStart, a and b, the handling of szRes, the preprocessor else/endif, the
// removal of the three escaped characters and the update of nStart), so only
// the visible statements are described.
void HTMLParser::UnescapeURI(std::string &data)
340
#if GTK_CHECK_VERSION(2,16,0)
341
char *szRes = g_uri_unescape_string(data.c_str(), NULL);
348
// nSize is read once here; nothing visible refreshes it after data is modified.
unsigned int nSize = data.size();
349
std::string::size_type nPos;
350
while ((nPos = data.find('%', nStart)) != std::string::npos)
352
if(nPos + 2 < nSize) // two chars after %
355
// hexVal() yields -1 for anything that is not a hexadecimal digit
if ((a = hexVal(data[nPos+1])) != -1)
357
if ((b = hexVal(data[nPos+2])) != -1)
359
// combine the two digits into one byte value (high digit first)
gunichar cChar = ((a * 16) + b);
361
//int nWritten = g_unichar_to_utf8(cChar, szText);
362
//szText[nWritten] = '\0';
364
//data.insert(nPos, szText);
365
// the decoded value is inserted as a single raw byte, not re-encoded as UTF-8
data.insert(data.begin()+nPos, (char)cChar);
367
//nStart = nPos + nWritten;
384
// NOTE(review): as written this replaces an ampersand with itself; the search
// text was presumably the entity form of the ampersand - confirm.
replaceall(data, "&", "&");
388
// Walks data one UTF-8 character at a time and replaces, in place, each
// character found in _table_char with the szEscape text of its row.
// NOTE(review): the declaration of nSkip, the test of nRes, the branch that
// advances past characters with no table row, and all braces are missing from
// this listing - confirm against the full file.
void HTMLParser::EscapeChars(std::string &data)
390
unsigned int nPos, nWidth;
391
const char *szStart = data.c_str();
392
const char *szString = szStart;
394
//using UTF-8 characters
395
while(NULL != szString && '\0' != *szString)
398
// decode the current character and locate the start of the next one
gunichar chLetter = g_utf8_get_char (szString);
399
const char *szNext = g_utf8_find_next_char(szString, NULL);
401
//TRACE("String to escape: %s\n", szString);
403
// row index in _table_char, or -1 when the character has no escape
int nRes = table_bin_search_char(chLetter);
406
//replace the special char with its escape sequence
407
nPos = szString - szStart;
408
nWidth = szNext - szString;
409
nSkip = strlen(_table_char[nRes].szEscape);
411
//TRACE("Escape: %d (width=%d) to %s\n", chLetter, nWidth, _table_char[nRes].szEscape);
413
//FIX: data = data.substr(0, nPos) + _table_char[nRes].szEscape + data.substr(nPos+nWidth, 1000000);
414
data.erase(nPos, nWidth);
415
data.insert(nPos, _table_char[nRes].szEscape);
417
//TRACE("Escaped line: %s\n", data.c_str());
419
// pointers into data are recomputed after the edit; scanning resumes just
// past the inserted escape text
szStart = data.c_str(); //in case string was reallocated
420
szString = szStart + nPos + nSkip;
427
// Replaces, in place, escape sequences of the form ampersand ... semicolon with
// the UTF-8 encoding of the matching character from g_lstTableSort2. Sequences
// with no table row are reported through TRACE.
// NOTE(review): the loop header, the checks on nPosStart / nPosEnd / nRes, the
// declaration of szBuffer, the update of nPos and all braces are missing from
// this listing - the control flow cannot be confirmed from here.
void HTMLParser::UnescapeChars(std::string &data)
429
unsigned int nPos = 0;
432
// find() results are stored in int, so a failed search shows up as a negative value
int nPosStart = data.find('&', nPos);
436
int nPosEnd = data.find(';', nPosStart+1);
439
//extract escape sequence
440
std::string strChar = data.substr(nPosStart, nPosEnd-nPosStart+1);
441
//TRACE("Escape sequence %s found!\n", strChar.c_str());
443
// index into g_lstTableSort2, or -1 when the sequence is unknown
int nRes = table_bin_search_escape(strChar.c_str());
446
//replace escape sequence with original UTF-8 character
448
// g_unichar_to_utf8 returns the number of bytes written; the buffer is then terminated by hand
int nBytes = g_unichar_to_utf8(g_lstTableSort2[nRes].cLetter, szBuffer);
449
szBuffer[nBytes] = '\0';
451
//FIX: data = data.substr(0, nPosStart) + szBuffer + data.substr(nPosEnd+1, 1000000);
452
data.erase(nPosStart, nPosEnd+1-nPosStart);
453
data.insert(nPosStart, szBuffer);
456
TRACE("ERROR: HTML escape sequence %s is not supported yet!\n", strChar.c_str());
459
break; //no sequence found
465
//use binary search to speed up convertion
466
int table_bin_search_char(gunichar chFind, int nLeft, int nRight)
468
if(nLeft > nRight) return -1; //no match found
470
//check middle of the range
471
int nMid = (nLeft + nRight)/2;
472
if(chFind == _table_char[nMid].cLetter)
473
return nMid; //match found
475
if(nLeft == nRight) return -1; //no match found
477
if(chFind < _table_char[nMid].cLetter)
478
return table_bin_search_char(chFind, nLeft, nMid-1); //search lower half
480
return table_bin_search_char(chFind, nMid+1, nRight); //search upper half
483
int table_bin_search_escape(const char *szFind, int nLeft, int nRight)
485
//TRACE("bin search [Escape:%s], l=%d, r=%d\n", szFind, nLeft, nRight);
488
//TRACE("bin search: no match found\n");
489
return -1; //no match found
492
//check middle of the range
493
int nMid = (nLeft + nRight)/2;
494
if(0 == strcmp(szFind, g_lstTableSort2[nMid].szEscape)){
495
//TRACE("bin search found [Escape:%s], m=%d\n", szFind, nMid);
496
return nMid; //match found
500
//TRACE("bin search: no match found\n");
501
return -1; //no match found
504
if(strcmp(szFind, g_lstTableSort2[nMid].szEscape) < 0)
506
//TRACE("Search lower half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
507
return table_bin_search_escape(szFind, nLeft, nMid-1); //search lower half
510
//TRACE("Search upper half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
511
return table_bin_search_escape(szFind, nMid+1, nRight); //search upper half
515
// Extracts the value of one parameter from a tag parameter string.
// Finds the search pattern built from szParam in data, then the next double
// quote after it, and stores the text between the two in resValue.
// The visible return statement reports false when nothing was found.
// NOTE(review): lines are missing from this listing - presumably the pattern
// is extended beyond szParam before the search, and true is returned once
// resValue has been set. Confirm against the full file.
bool HTMLParser::ExtractParam(const std::string &data, const char *szParam, std::string &resValue)
517
std::string strPattern = szParam;
520
std::string::size_type nPos = data.find(strPattern.c_str());
521
if(nPos != std::string::npos)
523
// the value ends at the first double quote after the pattern
std::string::size_type nEnd = data.find("\"", nPos+strPattern.size());
524
if(nEnd != std::string::npos){
525
resValue = data.substr(nPos+strPattern.size(), nEnd-nPos-strPattern.size());
529
return false; // not found