~ubuntu-branches/ubuntu/raring/notecase/raring

« back to all changes in this revision

Viewing changes to src/lib/HtmlParser.cpp

  • Committer: Bazaar Package Importer
  • Author(s): Nathan Handler
  • Date: 2008-12-21 13:09:58 UTC
  • mfrom: (1.1.6 upstream)
  • Revision ID: james.westby@ubuntu.com-20081221130958-0ri77h0x7j1dclkq
Tags: 1.9.8-0ubuntu1
New upstream release (LP: #307752)

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
////////////////////////////////////////////////////////////////////////////
2
 
// NoteCase notes manager project <http://notecase.sf.net>
3
 
//
4
 
// This code is licensed under BSD license. See "license.txt" for more details.
5
 
//
6
 
// File: Implements basic HTML parser class
7
 
////////////////////////////////////////////////////////////////////////////
8
 
 
9
 
#include "HtmlParser.h"
10
 
#include "debug.h"
11
 
#include <vector>
12
 
#include <algorithm>
13
 
#include <glib.h>
14
 
#include <gtk/gtk.h>
15
 
#include <string.h>
16
 
 
17
 
#ifndef _WIN32
18
 
 #include <strings.h>   //strcasecmp
19
 
#else
20
 
 #define strcasecmp stricmp
21
 
#endif
22
 
 
23
 
void replaceall(std::string &strData, const char *szFind, const char *szReplace);
24
 
 
25
 
// parser states
26
 
#define PARSER_STATE_BLANK      0
27
 
#define PARSER_STATE_INSIDE_TAG 1
28
 
#define PARSER_STATE_INSIDE_COMMENT 2
29
 
 
30
 
// Html escape sequences table
31
 
typedef struct {
32
 
        gunichar  cLetter;
33
 
        const char *szEscape;
34
 
} HtmlEscape;
35
 
 
36
 
static std::vector<HtmlEscape> g_lstTableSort2;
37
 
 
38
 
//table is sorted by first field to enable binary search
39
 
static const HtmlEscape _table_char[] =
40
 
{
41
 
        {'\"',  "&quot;"},  //=34
42
 
        {'&',   "&amp;"},   //=38
43
 
        {'<',   "&lt;"},    //=60
44
 
        {'>',   "&gt;"},    //=62
45
 
        {' ',   "&nbsp;"},  //non-breaking space
46
 
        {161,   "&iexcl;"}, //'¡' - inverted exclamation mark
47
 
        {162,   "&cent;"},  //'¢'
48
 
        {163,   "&pound;"}, //'£'
49
 
        {164,   "&curren;"},//'¤'
50
 
        {165,   "&yen;"},   //'¥'
51
 
        {166,   "&brvbar;"},//'¦' - broken (vertical) bar
52
 
        {167,   "&sect;"},  //'§' - section sign
53
 
        {168,   "&uml;"},   //'¨' - umlaut
54
 
        {169,   "&copy;"},  //'©' - copyright sign
55
 
        {170,   "&ordf;"},  //'ª' - feminine ordinal
56
 
        {171,   "&laquo;"}, //'«' - left guillemet
57
 
        {174,   "&reg;"},   //'®' - registered sign
58
 
 
59
 
        {176,   "&deg;"},   //'°' - degree sign
60
 
        {177,   "&plusmn;"},//'±' - plus or minus
61
 
        {178,   "&sup2;"},  //'²' - superscript two
62
 
        {179,   "&sup3;"},  //'³' - superscript three
63
 
 
64
 
        {187,   "&raquo;"}, //'»' - right guillemet
65
 
 
66
 
        {192,   "&Agrave;"},//'À'
67
 
        {193,   "&Aacute;"},//'Á'
68
 
        {194,   "&Acirc;"}, //'Â'
69
 
        {195,   "&Atilde;"},//'Ã'
70
 
        {196,   "&Auml;"},  //'Ä'
71
 
        {197,   "&Aring;"}, //'Å'
72
 
        {198,   "&AElig;"}, //'Æ'
73
 
        {199,   "&Ccedil;"},//'Ç'
74
 
        {200,   "&Egrave;"},//'È'
75
 
        {201,   "&Eacute;"},//'É'
76
 
        {202,   "&Ecirc;"}, //'Ê'
77
 
        {203,   "&Euml;"},  //'Ë'
78
 
        {204,   "&Igrave;"},//'Ì'
79
 
        {205,   "&Iacute;"},//'Í'
80
 
        {206,   "&Icirc;"}, //'Î'
81
 
        {207,   "&Iuml;"},  //'Ï'
82
 
        {208,   "&ETH;"},   //'Ð' - capital Eth, Icelandic
83
 
        {209,   "&Ntilde;"},//'Ñ'
84
 
        {210,   "&Ograve;"},//'Ò'
85
 
        {211,   "&Oacute;"},//'Ó'
86
 
        {212,   "&Ocirc;"}, //'Ô'
87
 
        {213,   "&Otilde;"},//'Õ'
88
 
        {214,   "&Ouml;"},  //'Ö'
89
 
        {215,   "&times;"}, //'×' - multiply sign
90
 
        {216,   "&Oslash;"},//'Ø'
91
 
        {217,   "&Ugrave;"},//'Ù'
92
 
        {218,   "&Uacute;"},//'Ú'
93
 
        {219,   "&Ucirc;"}, //'Û'
94
 
        {220,   "&Uuml;"},  //'Ü'
95
 
        {221,   "&Yacute;"},//'Ý'
96
 
        {222,   "&THORN;"}, //'Þ' - capital THORN, Icelandic
97
 
        {223,   "&szlig;"}, //'ß'
98
 
        {224,   "&agrave;"},//'à'
99
 
        {225,   "&aacute;"},//'á'
100
 
        {226,   "&acirc;"}, //'â'
101
 
        {227,   "&atilde;"},//'ã'
102
 
        {228,   "&auml;"},  //'ä'
103
 
        {229,   "&aring;"}, //'å'
104
 
 
105
 
        {230,   "&aelig;"}, //'æ'
106
 
        {231,   "&ccedil;"},//'ç'
107
 
        {232,   "&egrave;"},//'è'
108
 
        {233,   "&eacute;"},//'é'
109
 
        {234,   "&ecirc;"}, //'ê'
110
 
        {235,   "&euml;"},  //'ë'
111
 
        {236,   "&igrave;"},//'ì'
112
 
        {237,   "&iacute;"},//'í'
113
 
        {238,   "&icirc;"}, //'î'
114
 
        {239,   "&iuml;"},  //'ï'
115
 
        {240,   "&eth;"},   //'ð' - small eth, Icelandic
116
 
        {241,   "&ntilde;"},//'ñ'
117
 
        {242,   "&ograve;"},//'ò'
118
 
        {243,   "&oacute;"},//'ó'
119
 
        {244,   "&ocirc;"},     //'ô'
120
 
        {245,   "&otilde;"},//'õ'
121
 
        {246,   "&ouml;"},  //'ö'
122
 
 
123
 
        {248,   "&oslash;"},//'ø'
124
 
        {249,   "&ugrave;"},//'ù'
125
 
        {250,   "&uacute;"},//'ú'
126
 
        {251,   "&ucirc;"}, //'û'
127
 
        {252,   "&uuml;"},  //'ü'
128
 
        {253,   "&yacute;"},//'ý'
129
 
        {254,   "&thorn;"}, //'þ' - small thorn, Icelandic
130
 
        {255,   "&yuml;"},  //'ÿ'
131
 
 
132
 
        {338,   "&OElig;"}, //'Œ'
133
 
        {339,   "&oelig;"}, //'œ'
134
 
        {352,   "&Scaron;"},//'Š'
135
 
        {353,   "&scaron;"},//'š'
136
 
        {376,   "&Yuml;"},  //'Ÿ'
137
 
        {402,   "&fnof;"},  //'ƒ'
138
 
 
139
 
        {8211,  "&#8211;"}, //'–' - en dash (demi-cadratin)
140
 
        {8212,  "&#8212;"}, //'—' - em dash (cadratin)
141
 
 
142
 
        {8249,  "&#8249;"}, //'‹' - left single guillemet
143
 
        {8250,  "&#8250;"}, //'›' - right single guillemet
144
 
        {8364,  "&euro;"},  //'€'
145
 
        {8482,  "&trade;"}, //'™' - trademark
146
 
 
147
 
        //TOFIX add more chars if needed
148
 
};
149
 
 
150
 
// Converts one hexadecimal digit character to its numeric value.
// Accepts '0'-'9', 'A'-'F' and 'a'-'f'; returns -1 for any other character.
static int hexVal(char ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';

    if (ch >= 'A' && ch <= 'F')
        return 10 + (ch - 'A');

    if (ch >= 'a' && ch <= 'f')
        return 10 + (ch - 'a');

    return -1;  // not a hex digit
} // hexVal
160
 
 
161
 
#define SIZE_OF(x) (sizeof(x)/sizeof(x[0]))
162
 
 
163
 
static int table_bin_search_char(gunichar chFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
164
 
static int table_bin_search_escape(const char *szFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
165
 
 
166
 
class TblComparator{
167
 
public:
168
 
        bool operator()(const HtmlEscape &a, const HtmlEscape &b)
169
 
        {
170
 
                //operator < (is a<b ?)
171
 
                return (strcmp(a.szEscape, b.szEscape) < 0);
172
 
        };
173
 
};
174
 
 
175
 
HTMLParser::HTMLParser()
176
 
{
177
 
        Clear();
178
 
 
179
 
        m_bAllowUnescapedInPreTag = false;
180
 
        m_bInsidePreTag = false;
181
 
 
182
 
        //create new sort table (create only once - global object)
183
 
        if(g_lstTableSort2.empty())
184
 
        {
185
 
                for(unsigned int i=0; i<SIZE_OF(_table_char); i++)
186
 
                        g_lstTableSort2.push_back(_table_char[i]);
187
 
                TblComparator cmp;
188
 
                std::sort(g_lstTableSort2.begin(), g_lstTableSort2.end(), cmp);
189
 
        }
190
 
}
191
 
 
192
 
// Destructor: performs no explicit cleanup.
HTMLParser::~HTMLParser()
{
}
195
 
 
196
 
void HTMLParser::Clear()
197
 
{
198
 
        m_nState = PARSER_STATE_BLANK;
199
 
        m_strData.erase(m_strData.begin(), m_strData.end());
200
 
}
201
 
 
202
 
bool HTMLParser::Parse(const char *szBuffer, int len)
203
 
{
204
 
        if(len < 0)
205
 
                return false;
206
 
 
207
 
        for(int i=0; i<len; i++)
208
 
        {
209
 
                if(PARSER_STATE_INSIDE_COMMENT == m_nState)
210
 
                {
211
 
                        if( m_strData.size()>2 &&
212
 
                                0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2))    //tag ends as comment
213
 
                        {
214
 
                                TRACE("HTML Parser: Comment ended\n");
215
 
                                m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
216
 
                                OnComment(m_strData.c_str()+3); //trigger event
217
 
                                m_strData.erase(m_strData.begin(), m_strData.end());
218
 
                                m_nState = PARSER_STATE_BLANK;
219
 
                        }
220
 
                        else
221
 
                                m_strData += szBuffer[i];
222
 
                }
223
 
                else if(PARSER_STATE_INSIDE_TAG == m_nState)
224
 
                {
225
 
                        //check for the end of tag
226
 
                        if(szBuffer[i] == '>'){
227
 
                                if(0 == strncmp("!--", m_strData.c_str(), 3))   //is tag comment
228
 
                                {
229
 
                                        m_nState = PARSER_STATE_INSIDE_COMMENT;
230
 
 
231
 
                                        if(0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment
232
 
                                        {
233
 
                                                TRACE("HTML Parser: Comment ended\n");
234
 
                                                m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
235
 
                                                OnComment(m_strData.c_str()+3); //trigger event
236
 
                                                m_strData.erase(m_strData.begin(), m_strData.end());
237
 
                                                m_nState = PARSER_STATE_BLANK;
238
 
                                        }
239
 
                                        else
240
 
                                                m_strData += szBuffer[i];
241
 
                                }
242
 
                                else if(!m_strData.empty() && m_nState != PARSER_STATE_INSIDE_COMMENT)
243
 
                                {
244
 
                                        if(m_strData.at(0) == '/')      //is ending tag
245
 
                                        {
246
 
                                                //strip everything after first space within tag to get real tag name
247
 
                                                std::string strTag(m_strData.c_str()+1);
248
 
                                                int nPos = strTag.find_first_of(' ');
249
 
                                                if(nPos >= 0)
250
 
                                                        strTag.erase(strTag.begin()+nPos);
251
 
 
252
 
                                                if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag) || (0 == strcasecmp(strTag.c_str(), "PRE")))
253
 
                                                {
254
 
                                                        TRACE("HTML Parser: Tag end found (%s)\n", strTag.c_str());
255
 
                                                        OnTagEnd(strTag.c_str());       //trigger event
256
 
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
257
 
                                                        if(0 == strcasecmp(strTag.c_str(), "PRE"))
258
 
                                                                m_bInsidePreTag = false;
259
 
                                                }
260
 
                                                else
261
 
                                                {
262
 
                                                        TRACE("HTML Parser: Push text (%s)\n", m_strData.c_str());
263
 
                                                        m_nState = PARSER_STATE_BLANK;
264
 
                                                        OnText(m_strData.c_str());      //trigger event for previous contents
265
 
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
266
 
                                                }
267
 
                                        }
268
 
                                        else if(m_nState != PARSER_STATE_INSIDE_COMMENT)
269
 
                                        {
270
 
                                                std::string strTag(m_strData.c_str());
271
 
                                                std::string strParams;
272
 
 
273
 
                                                int nPos = strTag.find_first_of(' ');
274
 
                                                if(nPos >= 0){
275
 
                                                        strTag = strTag.substr(0, nPos);
276
 
                                                        strParams = m_strData.substr(nPos);
277
 
                                                }
278
 
 
279
 
                                                if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag)){
280
 
                                                        TRACE("HTML Parser: Tag start found (%s)[%s]\n", strTag.c_str(), strParams.c_str());
281
 
                                                        OnTagBegin(strTag.c_str(), strParams.c_str());  //trigger event
282
 
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
283
 
                                                        m_nState = PARSER_STATE_BLANK;
284
 
                                                        if(0 == strcasecmp(strTag.c_str(), "PRE"))
285
 
                                                                m_bInsidePreTag = true;
286
 
                                                }
287
 
                                                else{
288
 
                                                        m_nState = PARSER_STATE_BLANK;
289
 
                                                        TRACE("HTML Parser: Push text1 (%s)\n", m_strData.c_str());
290
 
                                                        OnText(m_strData.c_str());      //trigger event for previous contents
291
 
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
292
 
                                                }
293
 
                                        }
294
 
                                }
295
 
 
296
 
                                if(PARSER_STATE_INSIDE_COMMENT != m_nState)
297
 
                                        Clear();
298
 
                        }
299
 
                        else
300
 
                                m_strData += szBuffer[i];
301
 
                }
302
 
                else
303
 
                {
304
 
                        //check for the start of tag
305
 
                        if(szBuffer[i] == '<' && m_nState == PARSER_STATE_BLANK)
306
 
                        {
307
 
                                if(!m_strData.empty())
308
 
                                {
309
 
                                        TRACE("HTML Parser: Push text2 (%s)\n", m_strData.c_str());
310
 
                                        OnText(m_strData.c_str());      //trigger event for previous contents
311
 
                                        m_strData.erase(m_strData.begin(), m_strData.end());
312
 
                                }
313
 
                                m_nState = PARSER_STATE_INSIDE_TAG;
314
 
                        }
315
 
                        else{
316
 
                                m_strData += szBuffer[i];
317
 
                        }
318
 
                }
319
 
        }
320
 
 
321
 
        return true;
322
 
}
323
 
 
324
 
void HTMLParser::Finalize()
325
 
{
326
 
        if(!m_strData.empty() && m_nState == PARSER_STATE_BLANK)
327
 
                OnText(m_strData.c_str());      //trigger event for previous contents
328
 
        m_strData.erase(m_strData.begin(), m_strData.end());
329
 
}
330
 
 
331
 
void HTMLParser::EscapeURI(std::string &data)
332
 
{
333
 
        //TOFIX replace non-ASCII characters by converting each byte to %HH, where HH is the hexadecimal notation of the byte value
334
 
        replaceall(data, " ", "%20");
335
 
        replaceall(data, "&", "&amp;");
336
 
}
337
 
 
338
 
void HTMLParser::UnescapeURI(std::string &data)
339
 
{
340
 
#if GTK_CHECK_VERSION(2,16,0)
341
 
        char *szRes = g_uri_unescape_string(data.c_str(), NULL);
342
 
        if(szRes){
343
 
                data = szRes;
344
 
                g_free(szRes);
345
 
        }
346
 
#else
347
 
        int nStart = 0;
348
 
        unsigned int nSize = data.size();
349
 
        std::string::size_type nPos;
350
 
        while ((nPos = data.find('%', nStart)) != std::string::npos)
351
 
        {
352
 
                if(nPos + 2 < nSize) // two chars after %
353
 
                {
354
 
                        int a, b;
355
 
                        if ((a = hexVal(data[nPos+1])) != -1)
356
 
                        {
357
 
                                if ((b = hexVal(data[nPos+2])) != -1)
358
 
                                {
359
 
                                        gunichar cChar = ((a * 16) + b);
360
 
                                        //gchar szText[10];
361
 
                                        //int nWritten = g_unichar_to_utf8(cChar, szText);
362
 
                                        //szText[nWritten] = '\0';
363
 
                                        data.erase(nPos, 3);
364
 
                                        //data.insert(nPos, szText);
365
 
                                        data.insert(data.begin()+nPos, (char)cChar);
366
 
 
367
 
                                        //nStart = nPos + nWritten;
368
 
                                        nStart = nPos + 1;
369
 
                                        nSize -= 3;
370
 
                                        //nSize += nWritten;
371
 
                                        nSize += 1;
372
 
                                }
373
 
                                else
374
 
                                        nStart = nPos + 3;
375
 
                        }
376
 
                        else
377
 
                                nStart = nPos + 3;
378
 
                }
379
 
                else
380
 
                        break;
381
 
        }
382
 
 
383
 
        //TOFIX
384
 
        replaceall(data, "&amp;", "&");
385
 
#endif
386
 
}
387
 
 
388
 
void HTMLParser::EscapeChars(std::string &data)
389
 
{
390
 
        unsigned int nPos, nWidth;
391
 
        const char *szStart  = data.c_str();
392
 
        const char *szString = szStart;
393
 
 
394
 
        //using UTF-8 characters
395
 
        while(NULL != szString && '\0' != *szString)
396
 
        {
397
 
                int nSkip = 0;
398
 
                gunichar chLetter = g_utf8_get_char (szString);
399
 
                const char *szNext = g_utf8_find_next_char(szString, NULL);
400
 
 
401
 
                //TRACE("String to escape: %s\n", szString);
402
 
 
403
 
                int nRes = table_bin_search_char(chLetter);
404
 
                if(nRes >= 0)
405
 
                {
406
 
                        //replace escape sequence with original special char
407
 
                        nPos   = szString - szStart;
408
 
                        nWidth = szNext - szString;
409
 
                        nSkip  = strlen(_table_char[nRes].szEscape);
410
 
 
411
 
                        //TRACE("Escape: %d (width=%d) to %s\n", chLetter, nWidth, _table_char[nRes].szEscape);
412
 
 
413
 
                        //FIX: data = data.substr(0, nPos) + _table_char[nRes].szEscape + data.substr(nPos+nWidth, 1000000);
414
 
                        data.erase(nPos, nWidth);
415
 
                        data.insert(nPos, _table_char[nRes].szEscape);
416
 
 
417
 
                        //TRACE("Escaped line: %s\n", data.c_str());
418
 
 
419
 
                        szStart  = data.c_str();  //in case string was reallocated
420
 
                        szString = szStart + nPos + nSkip;
421
 
                }
422
 
                else
423
 
                        szString = szNext;
424
 
        }
425
 
}
426
 
 
427
 
void HTMLParser::UnescapeChars(std::string &data)
428
 
{
429
 
        unsigned int nPos = 0;
430
 
        while(1)
431
 
        {
432
 
                int nPosStart = data.find('&', nPos);
433
 
                if(nPosStart < 0)
434
 
                        break;
435
 
 
436
 
                int nPosEnd = data.find(';', nPosStart+1);
437
 
                if(nPosEnd >= 0)
438
 
                {
439
 
                        //extract escape sequence
440
 
                        std::string strChar = data.substr(nPosStart, nPosEnd-nPosStart+1);
441
 
                        //TRACE("Escape sequence %s found!\n", strChar.c_str());
442
 
 
443
 
                        int nRes = table_bin_search_escape(strChar.c_str());
444
 
                        if(nRes >= 0)
445
 
                        {
446
 
                                //replace escape sequence with original UTF-8 character
447
 
                                char szBuffer[20];
448
 
                                int nBytes = g_unichar_to_utf8(g_lstTableSort2[nRes].cLetter, szBuffer);
449
 
                                szBuffer[nBytes] = '\0';
450
 
 
451
 
                                //FIX: data = data.substr(0, nPosStart) + szBuffer + data.substr(nPosEnd+1, 1000000);
452
 
                                data.erase(nPosStart, nPosEnd+1-nPosStart);
453
 
                                data.insert(nPosStart, szBuffer);
454
 
                        }
455
 
                        else
456
 
                                TRACE("ERROR: HTML escape sequence %s is not supported yet!\n", strChar.c_str());
457
 
                }
458
 
                else
459
 
                        break;  //no sequence found
460
 
 
461
 
                nPos = nPosStart+1;
462
 
        }
463
 
}
464
 
 
465
 
//use binary search to speed up convertion
466
 
int table_bin_search_char(gunichar chFind, int nLeft, int nRight)
467
 
{
468
 
        if(nLeft > nRight) return -1;   //no match found
469
 
 
470
 
        //check middle of the range
471
 
        int nMid = (nLeft + nRight)/2;
472
 
        if(chFind == _table_char[nMid].cLetter)
473
 
                return nMid;    //match found
474
 
 
475
 
        if(nLeft == nRight) return -1;  //no match found
476
 
 
477
 
        if(chFind < _table_char[nMid].cLetter)
478
 
                return table_bin_search_char(chFind, nLeft, nMid-1);    //search lower half
479
 
        else
480
 
                return table_bin_search_char(chFind, nMid+1, nRight);   //search upper half
481
 
}
482
 
 
483
 
int table_bin_search_escape(const char *szFind, int nLeft, int nRight)
484
 
{
485
 
        //TRACE("bin search [Escape:%s], l=%d, r=%d\n", szFind, nLeft, nRight);
486
 
 
487
 
        if(nLeft > nRight) {
488
 
                //TRACE("bin search: no match found\n");
489
 
                return -1;      //no match found
490
 
        }
491
 
 
492
 
        //check middle of the range
493
 
        int nMid = (nLeft + nRight)/2;
494
 
        if(0 == strcmp(szFind, g_lstTableSort2[nMid].szEscape)){
495
 
                //TRACE("bin search found [Escape:%s], m=%d\n", szFind, nMid);
496
 
                return nMid;    //match found
497
 
        }
498
 
 
499
 
        if(nLeft == nRight){
500
 
                //TRACE("bin search: no match found\n");
501
 
                return -1;      //no match found
502
 
        }
503
 
 
504
 
        if(strcmp(szFind, g_lstTableSort2[nMid].szEscape) < 0)
505
 
        {
506
 
                //TRACE("Search lower half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
507
 
                return table_bin_search_escape(szFind, nLeft, nMid-1);  //search lower half
508
 
        }
509
 
        else{
510
 
                //TRACE("Search upper half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
511
 
                return table_bin_search_escape(szFind, nMid+1, nRight); //search upper half
512
 
        }
513
 
}
514
 
 
515
 
bool HTMLParser::ExtractParam(const std::string &data, const char *szParam, std::string &resValue)
516
 
{
517
 
        std::string strPattern = szParam;
518
 
        strPattern += "=\"";
519
 
 
520
 
        std::string::size_type nPos = data.find(strPattern.c_str());
521
 
        if(nPos != std::string::npos)
522
 
        {
523
 
                std::string::size_type nEnd = data.find("\"", nPos+strPattern.size());
524
 
                if(nEnd != std::string::npos){
525
 
                        resValue = data.substr(nPos+strPattern.size(), nEnd-nPos-strPattern.size());
526
 
                        return true;
527
 
                }
528
 
        }
529
 
        return false; // not found
530
 
}
 
1
////////////////////////////////////////////////////////////////////////////
 
2
// NoteCase notes manager project <http://notecase.sf.net>
 
3
//
 
4
// This code is licensed under BSD license. See "license.txt" for more details.
 
5
//
 
6
// File: Implements basic HTML parser class
 
7
////////////////////////////////////////////////////////////////////////////
 
8
 
 
9
#include "HtmlParser.h"
 
10
#include "debug.h"
 
11
#include <vector>
 
12
#include <algorithm>
 
13
#include <glib.h>
 
14
#include <gtk/gtk.h>
 
15
#include <string.h>
 
16
 
 
17
#ifndef _WIN32
 
18
 #include <strings.h>   //strcasecmp
 
19
#else
 
20
 #define strcasecmp stricmp
 
21
#endif
 
22
 
 
23
void replaceall(std::string &strData, const char *szFind, const char *szReplace);
 
24
 
 
25
// parser states
 
26
#define PARSER_STATE_BLANK      0
 
27
#define PARSER_STATE_INSIDE_TAG 1
 
28
#define PARSER_STATE_INSIDE_COMMENT 2
 
29
 
 
30
// Html escape sequences table
 
31
typedef struct {
 
32
        gunichar  cLetter;
 
33
        const char *szEscape;
 
34
} HtmlEscape;
 
35
 
 
36
static std::vector<HtmlEscape> g_lstTableSort2;
 
37
 
 
38
//table is sorted by first field to enable binary search
 
39
static const HtmlEscape _table_char[] =
 
40
{
 
41
        {'\"',  "&quot;"},  //=34
 
42
        {'&',   "&amp;"},   //=38
 
43
        {'<',   "&lt;"},    //=60
 
44
        {'>',   "&gt;"},    //=62
 
45
        {' ',   "&nbsp;"},  //non-breaking space
 
46
        {161,   "&iexcl;"}, //'¡' - inverted exclamation mark
 
47
        {162,   "&cent;"},  //'¢'
 
48
        {163,   "&pound;"}, //'£'
 
49
        {164,   "&curren;"},//'¤'
 
50
        {165,   "&yen;"},   //'¥'
 
51
        {166,   "&brvbar;"},//'¦' - broken (vertical) bar
 
52
        {167,   "&sect;"},  //'§' - section sign
 
53
        {168,   "&uml;"},   //'¨' - umlaut
 
54
        {169,   "&copy;"},  //'©' - copyright sign
 
55
        {170,   "&ordf;"},  //'ª' - feminine ordinal
 
56
        {171,   "&laquo;"}, //'«' - left guillemet
 
57
        {174,   "&reg;"},   //'®' - registered sign
 
58
 
 
59
        {176,   "&deg;"},   //'°' - degree sign
 
60
        {177,   "&plusmn;"},//'±' - plus or minus
 
61
        {178,   "&sup2;"},  //'²' - superscript two
 
62
        {179,   "&sup3;"},  //'³' - superscript three
 
63
 
 
64
        {187,   "&raquo;"}, //'»' - right guillemet
 
65
 
 
66
        {192,   "&Agrave;"},//'À'
 
67
        {193,   "&Aacute;"},//'Á'
 
68
        {194,   "&Acirc;"}, //'Â'
 
69
        {195,   "&Atilde;"},//'Ã'
 
70
        {196,   "&Auml;"},  //'Ä'
 
71
        {197,   "&Aring;"}, //'Å'
 
72
        {198,   "&AElig;"}, //'Æ'
 
73
        {199,   "&Ccedil;"},//'Ç'
 
74
        {200,   "&Egrave;"},//'È'
 
75
        {201,   "&Eacute;"},//'É'
 
76
        {202,   "&Ecirc;"}, //'Ê'
 
77
        {203,   "&Euml;"},  //'Ë'
 
78
        {204,   "&Igrave;"},//'Ì'
 
79
        {205,   "&Iacute;"},//'Í'
 
80
        {206,   "&Icirc;"}, //'Î'
 
81
        {207,   "&Iuml;"},  //'Ï'
 
82
        {208,   "&ETH;"},   //'Ð' - capital Eth, Icelandic
 
83
        {209,   "&Ntilde;"},//'Ñ'
 
84
        {210,   "&Ograve;"},//'Ò'
 
85
        {211,   "&Oacute;"},//'Ó'
 
86
        {212,   "&Ocirc;"}, //'Ô'
 
87
        {213,   "&Otilde;"},//'Õ'
 
88
        {214,   "&Ouml;"},  //'Ö'
 
89
        {215,   "&times;"}, //'×' - multiply sign
 
90
        {216,   "&Oslash;"},//'Ø'
 
91
        {217,   "&Ugrave;"},//'Ù'
 
92
        {218,   "&Uacute;"},//'Ú'
 
93
        {219,   "&Ucirc;"}, //'Û'
 
94
        {220,   "&Uuml;"},  //'Ü'
 
95
        {221,   "&Yacute;"},//'Ý'
 
96
        {222,   "&THORN;"}, //'Þ' - capital THORN, Icelandic
 
97
        {223,   "&szlig;"}, //'ß'
 
98
        {224,   "&agrave;"},//'à'
 
99
        {225,   "&aacute;"},//'á'
 
100
        {226,   "&acirc;"}, //'â'
 
101
        {227,   "&atilde;"},//'ã'
 
102
        {228,   "&auml;"},  //'ä'
 
103
        {229,   "&aring;"}, //'å'
 
104
 
 
105
        {230,   "&aelig;"}, //'æ'
 
106
        {231,   "&ccedil;"},//'ç'
 
107
        {232,   "&egrave;"},//'è'
 
108
        {233,   "&eacute;"},//'é'
 
109
        {234,   "&ecirc;"}, //'ê'
 
110
        {235,   "&euml;"},  //'ë'
 
111
        {236,   "&igrave;"},//'ì'
 
112
        {237,   "&iacute;"},//'í'
 
113
        {238,   "&icirc;"}, //'î'
 
114
        {239,   "&iuml;"},  //'ï'
 
115
        {240,   "&eth;"},   //'ð' - small eth, Icelandic
 
116
        {241,   "&ntilde;"},//'ñ'
 
117
        {242,   "&ograve;"},//'ò'
 
118
        {243,   "&oacute;"},//'ó'
 
119
        {244,   "&ocirc;"},     //'ô'
 
120
        {245,   "&otilde;"},//'õ'
 
121
        {246,   "&ouml;"},  //'ö'
 
122
 
 
123
        {248,   "&oslash;"},//'ø'
 
124
        {249,   "&ugrave;"},//'ù'
 
125
        {250,   "&uacute;"},//'ú'
 
126
        {251,   "&ucirc;"}, //'û'
 
127
        {252,   "&uuml;"},  //'ü'
 
128
        {253,   "&yacute;"},//'ý'
 
129
        {254,   "&thorn;"}, //'þ' - small thorn, Icelandic
 
130
        {255,   "&yuml;"},  //'ÿ'
 
131
 
 
132
        {338,   "&OElig;"}, //'Œ'
 
133
        {339,   "&oelig;"}, //'œ'
 
134
        {352,   "&Scaron;"},//'Š'
 
135
        {353,   "&scaron;"},//'š'
 
136
        {376,   "&Yuml;"},  //'Ÿ'
 
137
        {402,   "&fnof;"},  //'ƒ'
 
138
 
 
139
        {8211,  "&#8211;"}, //'–' - en dash (demi-cadratin)
 
140
        {8212,  "&#8212;"}, //'—' - em dash (cadratin)
 
141
 
 
142
        {8249,  "&#8249;"}, //'‹' - left single guillemet
 
143
        {8250,  "&#8250;"}, //'›' - right single guillemet
 
144
        {8364,  "&euro;"},  //'€'
 
145
        {8482,  "&trade;"}, //'™' - trademark
 
146
 
 
147
        //TOFIX add more chars if needed
 
148
};
 
149
 
 
150
// Converts one hexadecimal digit character to its numeric value.
// Accepts '0'-'9', 'A'-'F' and 'a'-'f'; returns -1 for any other character.
static int hexVal(char ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';

    if (ch >= 'A' && ch <= 'F')
        return 10 + (ch - 'A');

    if (ch >= 'a' && ch <= 'f')
        return 10 + (ch - 'a');

    return -1;  // not a hex digit
} // hexVal
 
160
 
 
161
// Number of elements in a statically sized array (do not use on pointers).
// The argument is parenthesized before indexing so that any expression
// yielding an array can be passed safely.
#define SIZE_OF(x) (sizeof(x)/sizeof((x)[0]))
 
162
 
 
163
// Binary search in _table_char (sorted by cLetter) for the character chFind.
// nLeft/nRight are the inclusive index bounds of the range to search; the
// defaults cover the whole table. Returns the matching index or -1.
static int table_bin_search_char(gunichar chFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
 
164
// Binary search in g_lstTableSort2 (the table sorted by szEscape) for the
// escape sequence szFind, compared with strcmp. nLeft/nRight are the inclusive
// index bounds of the range to search. Returns the matching index or -1.
static int table_bin_search_escape(const char *szFind, int nLeft = 0, int nRight = SIZE_OF(_table_char)-1);
 
165
 
 
166
class TblComparator{
 
167
public:
 
168
        bool operator()(const HtmlEscape &a, const HtmlEscape &b)
 
169
        {
 
170
                //operator < (is a<b ?)
 
171
                return (strcmp(a.szEscape, b.szEscape) < 0);
 
172
        };
 
173
};
 
174
 
 
175
HTMLParser::HTMLParser()
 
176
{
 
177
        Clear();
 
178
 
 
179
        m_bAllowUnescapedInPreTag = false;
 
180
        m_bInsidePreTag = false;
 
181
 
 
182
        //create new sort table (create only once - global object)
 
183
        if(g_lstTableSort2.empty())
 
184
        {
 
185
                for(unsigned int i=0; i<SIZE_OF(_table_char); i++)
 
186
                        g_lstTableSort2.push_back(_table_char[i]);
 
187
                TblComparator cmp;
 
188
                std::sort(g_lstTableSort2.begin(), g_lstTableSort2.end(), cmp);
 
189
        }
 
190
}
 
191
 
 
192
// Destructor: performs no explicit cleanup.
HTMLParser::~HTMLParser()
{
	//nothing to do
}
 
195
 
 
196
void HTMLParser::Clear()
 
197
{
 
198
        m_nState = PARSER_STATE_BLANK;
 
199
        m_strData.erase(m_strData.begin(), m_strData.end());
 
200
}
 
201
 
 
202
bool HTMLParser::Parse(const char *szBuffer, int len)
 
203
{
 
204
        if(len < 0)
 
205
                return false;
 
206
 
 
207
        for(int i=0; i<len; i++)
 
208
        {
 
209
                if(PARSER_STATE_INSIDE_COMMENT == m_nState)
 
210
                {
 
211
                        if( m_strData.size()>2 &&
 
212
                                0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2))    //tag ends as comment
 
213
                        {
 
214
                                TRACE("HTML Parser: Comment ended\n");
 
215
                                m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
 
216
                                OnComment(m_strData.c_str()+3); //trigger event
 
217
                                m_strData.erase(m_strData.begin(), m_strData.end());
 
218
                                m_nState = PARSER_STATE_BLANK;
 
219
                        }
 
220
                        else
 
221
                                m_strData += szBuffer[i];
 
222
                }
 
223
                else if(PARSER_STATE_INSIDE_TAG == m_nState)
 
224
                {
 
225
                        //check for the end of tag
 
226
                        if(szBuffer[i] == '>'){
 
227
                                if(0 == strncmp("!--", m_strData.c_str(), 3))   //is tag comment
 
228
                                {
 
229
                                        m_nState = PARSER_STATE_INSIDE_COMMENT;
 
230
 
 
231
                                        if(0 == strncmp("--", m_strData.substr(m_strData.size()-2).c_str(), 2)) //tag ends as comment
 
232
                                        {
 
233
                                                TRACE("HTML Parser: Comment ended\n");
 
234
                                                m_strData.erase(m_strData.size()-2, 2); //remove "--" ending
 
235
                                                OnComment(m_strData.c_str()+3); //trigger event
 
236
                                                m_strData.erase(m_strData.begin(), m_strData.end());
 
237
                                                m_nState = PARSER_STATE_BLANK;
 
238
                                        }
 
239
                                        else
 
240
                                                m_strData += szBuffer[i];
 
241
                                }
 
242
                                else if(!m_strData.empty() && m_nState != PARSER_STATE_INSIDE_COMMENT)
 
243
                                {
 
244
                                        if(m_strData.at(0) == '/')      //is ending tag
 
245
                                        {
 
246
                                                //strip everything after first space within tag to get real tag name
 
247
                                                std::string strTag(m_strData.c_str()+1);
 
248
                                                int nPos = strTag.find_first_of(' ');
 
249
                                                if(nPos >= 0)
 
250
                                                        strTag.erase(strTag.begin()+nPos);
 
251
 
 
252
                                                if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag) || (0 == strcasecmp(strTag.c_str(), "PRE")))
 
253
                                                {
 
254
                                                        TRACE("HTML Parser: Tag end found (%s)\n", strTag.c_str());
 
255
                                                        OnTagEnd(strTag.c_str());       //trigger event
 
256
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
 
257
                                                        if(0 == strcasecmp(strTag.c_str(), "PRE"))
 
258
                                                                m_bInsidePreTag = false;
 
259
                                                }
 
260
                                                else
 
261
                                                {
 
262
                                                        TRACE("HTML Parser: Push text (%s)\n", m_strData.c_str());
 
263
                                                        m_nState = PARSER_STATE_BLANK;
 
264
                                                        OnText(m_strData.c_str());      //trigger event for previous contents
 
265
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
 
266
                                                }
 
267
                                        }
 
268
                                        else if(m_nState != PARSER_STATE_INSIDE_COMMENT)
 
269
                                        {
 
270
                                                std::string strTag(m_strData.c_str());
 
271
                                                std::string strParams;
 
272
 
 
273
                                                int nPos = strTag.find_first_of(' ');
 
274
                                                if(nPos >= 0){
 
275
                                                        strTag = strTag.substr(0, nPos);
 
276
                                                        strParams = m_strData.substr(nPos);
 
277
                                                }
 
278
 
 
279
                                                if(!m_bAllowUnescapedInPreTag || (m_bAllowUnescapedInPreTag && !m_bInsidePreTag)){
 
280
                                                        TRACE("HTML Parser: Tag start found (%s)[%s]\n", strTag.c_str(), strParams.c_str());
 
281
                                                        OnTagBegin(strTag.c_str(), strParams.c_str());  //trigger event
 
282
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
 
283
                                                        m_nState = PARSER_STATE_BLANK;
 
284
                                                        if(0 == strcasecmp(strTag.c_str(), "PRE"))
 
285
                                                                m_bInsidePreTag = true;
 
286
                                                }
 
287
                                                else{
 
288
                                                        m_nState = PARSER_STATE_BLANK;
 
289
                                                        TRACE("HTML Parser: Push text1 (%s)\n", m_strData.c_str());
 
290
                                                        OnText(m_strData.c_str());      //trigger event for previous contents
 
291
                                                        m_strData.erase(m_strData.begin(), m_strData.end());
 
292
                                                }
 
293
                                        }
 
294
                                }
 
295
 
 
296
                                if(PARSER_STATE_INSIDE_COMMENT != m_nState)
 
297
                                        Clear();
 
298
                        }
 
299
                        else
 
300
                                m_strData += szBuffer[i];
 
301
                }
 
302
                else
 
303
                {
 
304
                        //check for the start of tag
 
305
                        if(szBuffer[i] == '<' && m_nState == PARSER_STATE_BLANK)
 
306
                        {
 
307
                                if(!m_strData.empty())
 
308
                                {
 
309
                                        TRACE("HTML Parser: Push text2 (%s)\n", m_strData.c_str());
 
310
                                        OnText(m_strData.c_str());      //trigger event for previous contents
 
311
                                        m_strData.erase(m_strData.begin(), m_strData.end());
 
312
                                }
 
313
                                m_nState = PARSER_STATE_INSIDE_TAG;
 
314
                        }
 
315
                        else{
 
316
                                m_strData += szBuffer[i];
 
317
                        }
 
318
                }
 
319
        }
 
320
 
 
321
        return true;
 
322
}
 
323
 
 
324
void HTMLParser::Finalize()
 
325
{
 
326
        if(!m_strData.empty() && m_nState == PARSER_STATE_BLANK)
 
327
                OnText(m_strData.c_str());      //trigger event for previous contents
 
328
        m_strData.erase(m_strData.begin(), m_strData.end());
 
329
}
 
330
 
 
331
void HTMLParser::EscapeURI(std::string &data)
 
332
{
 
333
        //TOFIX replace non-ASCII characters by converting each byte to %HH, where HH is the hexadecimal notation of the byte value
 
334
        replaceall(data, " ", "%20");
 
335
        replaceall(data, "&", "&amp;");
 
336
}
 
337
 
 
338
void HTMLParser::UnescapeURI(std::string &data)
 
339
{
 
340
#if GTK_CHECK_VERSION(2,16,0)
 
341
        char *szRes = g_uri_unescape_string(data.c_str(), NULL);
 
342
        if(szRes){
 
343
                data = szRes;
 
344
                g_free(szRes);
 
345
        }
 
346
#else
 
347
        int nStart = 0;
 
348
        unsigned int nSize = data.size();
 
349
        std::string::size_type nPos;
 
350
        while ((nPos = data.find('%', nStart)) != std::string::npos)
 
351
        {
 
352
                if(nPos + 2 < nSize) // two chars after %
 
353
                {
 
354
                        int a, b;
 
355
                        if ((a = hexVal(data[nPos+1])) != -1)
 
356
                        {
 
357
                                if ((b = hexVal(data[nPos+2])) != -1)
 
358
                                {
 
359
                                        gunichar cChar = ((a * 16) + b);
 
360
                                        //gchar szText[10];
 
361
                                        //int nWritten = g_unichar_to_utf8(cChar, szText);
 
362
                                        //szText[nWritten] = '\0';
 
363
                                        data.erase(nPos, 3);
 
364
                                        //data.insert(nPos, szText);
 
365
                                        data.insert(data.begin()+nPos, (char)cChar);
 
366
 
 
367
                                        //nStart = nPos + nWritten;
 
368
                                        nStart = nPos + 1;
 
369
                                        nSize -= 3;
 
370
                                        //nSize += nWritten;
 
371
                                        nSize += 1;
 
372
                                }
 
373
                                else
 
374
                                        nStart = nPos + 3;
 
375
                        }
 
376
                        else
 
377
                                nStart = nPos + 3;
 
378
                }
 
379
                else
 
380
                        break;
 
381
        }
 
382
 
 
383
        //TOFIX
 
384
        replaceall(data, "&amp;", "&");
 
385
#endif
 
386
}
 
387
 
 
388
void HTMLParser::EscapeChars(std::string &data)
 
389
{
 
390
        unsigned int nPos, nWidth;
 
391
        const char *szStart  = data.c_str();
 
392
        const char *szString = szStart;
 
393
 
 
394
        //using UTF-8 characters
 
395
        while(NULL != szString && '\0' != *szString)
 
396
        {
 
397
                int nSkip = 0;
 
398
                gunichar chLetter = g_utf8_get_char (szString);
 
399
                const char *szNext = g_utf8_find_next_char(szString, NULL);
 
400
 
 
401
                //TRACE("String to escape: %s\n", szString);
 
402
 
 
403
                int nRes = table_bin_search_char(chLetter);
 
404
                if(nRes >= 0)
 
405
                {
 
406
                        //replace escape sequence with original special char
 
407
                        nPos   = szString - szStart;
 
408
                        nWidth = szNext - szString;
 
409
                        nSkip  = strlen(_table_char[nRes].szEscape);
 
410
 
 
411
                        //TRACE("Escape: %d (width=%d) to %s\n", chLetter, nWidth, _table_char[nRes].szEscape);
 
412
 
 
413
                        //FIX: data = data.substr(0, nPos) + _table_char[nRes].szEscape + data.substr(nPos+nWidth, 1000000);
 
414
                        data.erase(nPos, nWidth);
 
415
                        data.insert(nPos, _table_char[nRes].szEscape);
 
416
 
 
417
                        //TRACE("Escaped line: %s\n", data.c_str());
 
418
 
 
419
                        szStart  = data.c_str();  //in case string was reallocated
 
420
                        szString = szStart + nPos + nSkip;
 
421
                }
 
422
                else
 
423
                        szString = szNext;
 
424
        }
 
425
}
 
426
 
 
427
void HTMLParser::UnescapeChars(std::string &data)
 
428
{
 
429
        unsigned int nPos = 0;
 
430
        while(1)
 
431
        {
 
432
                int nPosStart = data.find('&', nPos);
 
433
                if(nPosStart < 0)
 
434
                        break;
 
435
 
 
436
                int nPosEnd = data.find(';', nPosStart+1);
 
437
                if(nPosEnd >= 0)
 
438
                {
 
439
                        //extract escape sequence
 
440
                        std::string strChar = data.substr(nPosStart, nPosEnd-nPosStart+1);
 
441
                        //TRACE("Escape sequence %s found!\n", strChar.c_str());
 
442
 
 
443
                        int nRes = table_bin_search_escape(strChar.c_str());
 
444
                        if(nRes >= 0)
 
445
                        {
 
446
                                //replace escape sequence with original UTF-8 character
 
447
                                char szBuffer[20];
 
448
                                int nBytes = g_unichar_to_utf8(g_lstTableSort2[nRes].cLetter, szBuffer);
 
449
                                szBuffer[nBytes] = '\0';
 
450
 
 
451
                                //FIX: data = data.substr(0, nPosStart) + szBuffer + data.substr(nPosEnd+1, 1000000);
 
452
                                data.erase(nPosStart, nPosEnd+1-nPosStart);
 
453
                                data.insert(nPosStart, szBuffer);
 
454
                        }
 
455
                        else
 
456
                                TRACE("ERROR: HTML escape sequence %s is not supported yet!\n", strChar.c_str());
 
457
                }
 
458
                else
 
459
                        break;  //no sequence found
 
460
 
 
461
                nPos = nPosStart+1;
 
462
        }
 
463
}
 
464
 
 
465
//use binary search to speed up convertion
 
466
int table_bin_search_char(gunichar chFind, int nLeft, int nRight)
 
467
{
 
468
        if(nLeft > nRight) return -1;   //no match found
 
469
 
 
470
        //check middle of the range
 
471
        int nMid = (nLeft + nRight)/2;
 
472
        if(chFind == _table_char[nMid].cLetter)
 
473
                return nMid;    //match found
 
474
 
 
475
        if(nLeft == nRight) return -1;  //no match found
 
476
 
 
477
        if(chFind < _table_char[nMid].cLetter)
 
478
                return table_bin_search_char(chFind, nLeft, nMid-1);    //search lower half
 
479
        else
 
480
                return table_bin_search_char(chFind, nMid+1, nRight);   //search upper half
 
481
}
 
482
 
 
483
int table_bin_search_escape(const char *szFind, int nLeft, int nRight)
 
484
{
 
485
        //TRACE("bin search [Escape:%s], l=%d, r=%d\n", szFind, nLeft, nRight);
 
486
 
 
487
        if(nLeft > nRight) {
 
488
                //TRACE("bin search: no match found\n");
 
489
                return -1;      //no match found
 
490
        }
 
491
 
 
492
        //check middle of the range
 
493
        int nMid = (nLeft + nRight)/2;
 
494
        if(0 == strcmp(szFind, g_lstTableSort2[nMid].szEscape)){
 
495
                //TRACE("bin search found [Escape:%s], m=%d\n", szFind, nMid);
 
496
                return nMid;    //match found
 
497
        }
 
498
 
 
499
        if(nLeft == nRight){
 
500
                //TRACE("bin search: no match found\n");
 
501
                return -1;      //no match found
 
502
        }
 
503
 
 
504
        if(strcmp(szFind, g_lstTableSort2[nMid].szEscape) < 0)
 
505
        {
 
506
                //TRACE("Search lower half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
 
507
                return table_bin_search_escape(szFind, nLeft, nMid-1);  //search lower half
 
508
        }
 
509
        else{
 
510
                //TRACE("Search upper half, mid[%d]=%s\n", nMid, g_lstTableSort2[nMid].szEscape);
 
511
                return table_bin_search_escape(szFind, nMid+1, nRight); //search upper half
 
512
        }
 
513
}
 
514
 
 
515
bool HTMLParser::ExtractParam(const std::string &data, const char *szParam, std::string &resValue)
 
516
{
 
517
        std::string strPattern = szParam;
 
518
        strPattern += "=\"";
 
519
 
 
520
        std::string::size_type nPos = data.find(strPattern.c_str());
 
521
        if(nPos != std::string::npos)
 
522
        {
 
523
                std::string::size_type nEnd = data.find("\"", nPos+strPattern.size());
 
524
                if(nEnd != std::string::npos){
 
525
                        resValue = data.substr(nPos+strPattern.size(), nEnd-nPos-strPattern.size());
 
526
                        return true;
 
527
                }
 
528
        }
 
529
        return false; // not found
 
530
}