1
/***************************************************************************
3
* Project ___| | | | _ \| |
5
* | (__| |_| | _ <| |___
6
* \___|\___/|_| \_\_____|
8
* Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al.
10
* This software is licensed as described in the file COPYING, which
11
* you should have received as part of this distribution. The terms
12
* are also available at http://curl.haxx.se/docs/copyright.html.
14
* You may opt to use, copy, modify, merge, publish, distribute and/or sell
15
* copies of the Software, and permit persons to whom the Software is
16
* furnished to do so, under the terms of the COPYING file.
18
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19
* KIND, either express or implied.
21
***************************************************************************/
22
// Get a web page, parse it with libxml.
24
// Written by Lars Nilsson
26
// GNU C++ compile command line suggestion (edit paths accordingly):
28
// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
29
// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
35
#include <curl/curl.h>
36
#include <libxml/HTMLparser.h>
39
// Case-insensitive string comparison.
// COMPARE(a, b) negates the result of stricmp()/strcasecmp(), so it is true
// exactly when the comparison function returns 0.
// NOTE(review): two definitions of COMPARE are visible with no preprocessor
// conditional between them; presumably a platform #ifdef/#else/#endif that
// selects one of them is missing from this view -- confirm.
43
#define COMPARE(a, b) (!stricmp((a), (b)))
45
#define COMPARE(a, b) (!strcasecmp((a), (b)))
49
// libxml callback context structure
// The SAX callbacks in this file receive a pointer to a Context; they set and
// clear its addTitle flag and append text to its title member.
// NOTE(review): the struct header and the member declarations are not
// visible in this view; only the constructor line is.
54
// Constructor: addTitle starts out false.
Context(): addTitle(false) { }
61
// libcurl variables for error strings and returned data
63
// Registered with CURLOPT_ERRORBUFFER in init() and printed in the failure
// messages of init() and main().
static char errorBuffer[CURL_ERROR_SIZE];
64
// Its address is registered as CURLOPT_WRITEDATA in init(); main() passes it
// to parseHtml() after the transfer.
static std::string buffer;
67
// libcurl write callback function
// Appends the size*nmemb bytes at data to the std::string passed as
// writerData (init() registers &buffer for that role via CURLOPT_WRITEDATA).
// NOTE(review): the return statements are not visible in this view. The
// declared return type is int, while libcurl documents write callbacks as
// returning size_t -- confirm against the CURLOPT_WRITEFUNCTION documentation.
70
static int writer(char *data, size_t size, size_t nmemb,
71
std::string *writerData)
73
// Null check on the destination string.
// NOTE(review): the statement controlled by this check is not visible here.
if (writerData == NULL)
76
// size*nmemb is the number of bytes available at data for this call.
writerData->append(data, size*nmemb);
82
// libcurl connection initialization
// Creates an easy handle in conn and configures it in this order: error
// buffer, URL, redirect following, write callback, write target. Each step
// has a matching failure message written to stderr.
// NOTE(review): the local declaration of code, the return statements and some
// of the condition lines are not visible in this view, so the exact
// success/failure result cannot be confirmed from here.
85
static bool init(CURL *&conn, char *url)
89
// conn is a reference parameter, so the caller's handle variable is assigned.
conn = curl_easy_init();
93
fprintf(stderr, "Failed to create CURL connection\n");
98
// Ask libcurl to store error text in errorBuffer.
code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
101
// This message prints the numeric code; the later ones print errorBuffer.
fprintf(stderr, "Failed to set error buffer [%d]\n", code);
106
code = curl_easy_setopt(conn, CURLOPT_URL, url);
107
if (code != CURLE_OK)
109
fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
114
// Enable CURLOPT_FOLLOWLOCATION (value 1L) so redirects are followed.
code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
115
if (code != CURLE_OK)
117
fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
122
// Received data is delivered through writer() ...
code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
123
if (code != CURLE_OK)
125
fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
130
// ... which is handed &buffer as its writerData argument.
code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
131
if (code != CURLE_OK)
133
fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
142
// libxml start element callback function
// Sets context->addTitle when the element name equals "TITLE", compared
// case-insensitively through COMPARE.
// NOTE(review): the declaration of the name parameter and any other
// statements of the body are not visible in this view.
145
static void StartElement(void *voidContext,
147
const xmlChar **attributes)
149
// The opaque callback pointer is treated as the Context.
Context *context = (Context *)voidContext;
151
if (COMPARE((char *)name, "TITLE"))
154
context->addTitle = true;
160
// libxml end element callback function
// Clears context->addTitle when the element name equals "TITLE", compared
// case-insensitively through COMPARE.
// NOTE(review): the declaration of the name parameter is not visible in this
// view.
163
static void EndElement(void *voidContext,
166
// The opaque callback pointer is treated as the Context.
Context *context = (Context *)voidContext;
168
if (COMPARE((char *)name, "TITLE"))
169
context->addTitle = false;
173
// Text handling helper function
// Appends length bytes starting at chars to context->title, but only while
// context->addTitle is set. Called by both Characters() and cdata().
// NOTE(review): the declaration of the length parameter is not visible in
// this view.
176
static void handleCharacters(Context *context,
177
const xmlChar *chars,
180
if (context->addTitle)
181
// The (pointer, length) form of append is used; no terminator is relied on.
context->title.append((char *)chars, length);
185
// libxml PCDATA callback function
// Casts the opaque callback pointer to Context and forwards the text to
// handleCharacters().
// NOTE(review): the declaration of the length parameter is not visible in
// this view.
188
static void Characters(void *voidContext,
189
const xmlChar *chars,
192
Context *context = (Context *)voidContext;
194
handleCharacters(context, chars, length);
198
// libxml CDATA callback function
// Same handling as Characters(): casts the opaque callback pointer to Context
// and forwards the text to handleCharacters().
// NOTE(review): the declaration of the length parameter is not visible in
// this view.
201
static void cdata(void *voidContext,
202
const xmlChar *chars,
205
Context *context = (Context *)voidContext;
207
handleCharacters(context, chars, length);
211
// libxml SAX callback structure
// parseHtml() passes the address of this handler table to
// htmlCreatePushParserCtxt().
// NOTE(review): the initializer is not visible in this view; presumably it
// wires StartElement, EndElement, Characters and cdata into their slots --
// confirm against the full file.
214
static htmlSAXHandler saxHandler =
246
// Parse given (assumed to be) HTML text and return the title
// Feeds html to a libxml push parser that uses saxHandler with &context as
// callback data, then copies context.title into title.
// NOTE(review): the title parameter declaration and the declaration of
// context are not visible in this view.
249
static void parseHtml(const std::string &html,
252
htmlParserCtxtPtr ctxt;
255
// Create the push parser with an empty initial chunk.
// NOTE(review): no NULL check on ctxt is visible before it is used below.
ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
256
XML_CHAR_ENCODING_NONE);
258
// Feed the whole document in one call, then make a second, empty call with
// the last argument set to 1.
// NOTE(review): html.size() is a size_t passed as the chunk length; check the
// parameter type expected by htmlParseChunk for very large inputs.
htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
259
htmlParseChunk(ctxt, "", 0, 1);
261
htmlFreeParserCtxt(ctxt);
263
// Hand the collected text back through the output parameter.
title = context.title;
266
// Program entry point: sets up a libcurl handle for argv[1] through init(),
// performs the transfer, runs parseHtml() on the downloaded buffer and prints
// the resulting title.
// NOTE(review): the local declarations (conn, code, title), the condition of
// the argument check and the exit paths are not visible in this view.
int main(int argc, char *argv[])
272
// Ensure one argument is given
276
fprintf(stderr, "Usage: %s <url>\n", argv[0]);
281
// NOTE(review): no matching curl_global_cleanup() call is visible in this
// view -- confirm against the full file.
curl_global_init(CURL_GLOBAL_DEFAULT);
283
// Initialize CURL connection
285
if (!init(conn, argv[1]))
287
// NOTE(review): "initializion" in the message below is a typo for
// "initialization".
fprintf(stderr, "Connection initializion failed\n");
292
// Retrieve content for the URL
294
code = curl_easy_perform(conn);
295
// The handle is released before the result is checked; errorBuffer is a
// file-level array, so it can still be printed afterwards.
curl_easy_cleanup(conn);
297
if (code != CURLE_OK)
299
fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
304
// Parse the (assumed) HTML code
306
parseHtml(buffer, title);
308
// Display the extracted title
310
printf("Title: %s\n", title.c_str());