1
/* This file is part of Strigi Desktop Search
3
* Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4
* 2007 Tobias G. Pfeiffer <tgpfeiffer@web.de>
5
* 2009 Evgeny Egorochkin <phreedom.stdin@gmail.com>
7
* This library is free software; you can redistribute it and/or
8
* modify it under the terms of the GNU Library General Public
9
* License as published by the Free Software Foundation; either
10
* version 2 of the License, or (at your option) any later version.
12
* This library is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
* Library General Public License for more details.
17
* You should have received a copy of the GNU Library General Public License
18
* along with this library; see the file COPYING.LIB. If not, write to
19
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20
* Boston, MA 02110-1301, USA.
22
#ifndef STRIGI_SAXHELPERANALYZER_H
23
#define STRIGI_SAXHELPERANALYZER_H
25
#include <strigi/strigiconfig.h>
26
#include <libxml/SAX2.h>
27
#include "streambase.h"
28
#include "analysisresult.h"
32
* This class is well suited for analyzing XML based substreams
35
class SaxHelperAnalyzer {
37
// libxml2 parser context handle (type comes from <libxml/SAX2.h>).
// Where it is created and released is not visible in this header.
xmlParserCtxtPtr ctxt;
38
// libxml2 SAX handler table; presumably filled with the static *SAXFunc
// trampolines declared in this class -- TODO confirm in the implementation.
xmlSAXHandler handler;
42
// Static trampoline for character data: casts ctx to SaxHelperAnalyzer* and
// forwards the text run to the virtual characters() hook.
// The xmlChar buffer is reinterpreted as const char*, and the int len is
// implicitly converted to the uint32_t length parameter of characters().
// NOTE(review): assumes the parser was set up with the analyzer object as its
// user data -- the registration is not visible in this header.
static void charactersSAXFunc(void* ctx, const xmlChar * ch, int len) {
43
((SaxHelperAnalyzer*)ctx)->characters((const char *)ch, len);
45
// Static variadic error trampoline: the message and its arguments are ignored;
// the only effect is setting the analyzer's `error` flag to true.
// (The declaration of `error` is not visible in this part of the header.)
static void errorSAXFunc(void* ctx, const char * msg, ...) {
46
((SaxHelperAnalyzer*)ctx)->error = true;
48
// Static trampoline for element starts: casts ctx to SaxHelperAnalyzer* and
// forwards every argument, in the same order, to the virtual startElement()
// hook. xmlChar strings and string arrays are reinterpreted as const char
// pointers; the integer counts are passed through unchanged.
// NOTE(review): the name and parameter list mirror libxml2's SAX2
// start-element callback; presumably installed in `handler` -- confirm.
static void startElementNsSAX2Func(void * ctx,
49
const xmlChar* localname, const xmlChar* prefix, const xmlChar* URI,
50
int nb_namespaces, const xmlChar ** namespaces, int nb_attributes,
51
int nb_defaulted, const xmlChar ** attributes) {
53
((SaxHelperAnalyzer*)ctx)->startElement((const char *)localname, (const char *)prefix, (const char *)URI,
54
nb_namespaces, (const char **)namespaces,
55
nb_attributes, nb_defaulted, (const char **)attributes);
57
// Static trampoline for element ends: casts ctx to SaxHelperAnalyzer* and
// forwards localname, prefix and URI (reinterpreted as const char*) to the
// virtual endElement() hook.
static void endElementNsSAX2Func(void *ctx,
58
const xmlChar *localname, const xmlChar *prefix, const xmlChar *URI){
60
((SaxHelperAnalyzer*)ctx)->endElement((const char *)localname, (const char *)prefix, (const char *)URI);
62
// Non-virtual helper taking a character buffer and an unsigned 32-bit size.
// Declared only; its behaviour is defined outside this header.
void handleData(const char*, uint32_t);
64
// Non-virtual helpers taking a data buffer and a signed 32-bit length.
// NOTE(review): presumably init() sets up the libxml2 parser with the first
// chunk and push() feeds later chunks -- confirm in the implementation.
void push(const char* data, int32_t len);
65
void init(const char* data, int32_t len);
68
// Pointer to an AnalysisResult; presumably the result object of the stream
// currently being analyzed -- TODO confirm where it is assigned.
AnalysisResult *result;
71
* Call this function to analyze a stream
73
// Non-virtual entry point: takes the AnalysisResult by reference and the
// input as an InputStream pointer. Only declared in this header.
void analyze(AnalysisResult& idx, InputStream* in);
76
* Constructor. Should register fields if necessary
80
* Destructor. Clean up your room, if dirty :-)
82
// Virtual destructor (this class declares pure virtual hooks and is meant to
// be subclassed). Only declared in this header.
virtual ~SaxHelperAnalyzer();
84
* Returns the name of this analyzer. Taking the class name is fine
87
// Pure virtual, const. Returns a C string; the lifetime and ownership of the
// returned pointer are not specified in this header.
virtual const char* name() const = 0;
89
* Is called to signal the beginning of a stream analysis. This is the place
90
* to initialize variables that need to be set again for every stream.
91
* If there are single objects that will be needed again for every analyzed
92
* stream, they should be created in the constructor.
93
* \param result reference to the AnalysisResult to write your results to
95
// Pure virtual hook; receives the AnalysisResult by reference (the parameter
// is unnamed in this declaration).
virtual void startAnalysis(AnalysisResult&) = 0;
97
* Is called when the analysis of a stream is finished. You can do
98
* cleanups here, if necessary. Note: This is also called if, while
99
* parsing the XML document, non-well-formedness is detected.
101
// Pure virtual hook.
// NOTE(review): `complete` presumably tells whether the stream was processed
// to its end without a parse error -- its meaning is not documented here;
// confirm against the caller.
virtual void endAnalysis(bool complete) = 0;
103
* This is called when an opening XML tag was detected. For documentation
104
* purposes, let's suppose we have
105
* <xsl:template match="author:*" xmlns:xdc="http://www.xml.com/books">
106
* The parameter descriptions will have in parentheses the value that would be
107
* passed for this example.
108
* \param localname pointer to local name of the element ("template")
109
* \param prefix pointer to element namespace prefix, if available ("xsl"),
111
* \param uri pointer to element namespace URI, if available, i.e. if it was
112
* declared in the document element (could be
113
* "http://www.w3.org/1999/XSL/Transform"), to 0 otherwise
114
* \param nb_namespaces number of namespace <i>definitions</i> on that node (1)
115
* \param namespaces pointer to the array (of length 2*nb_namespaces) of
116
* prefix/URI pairs of namespace <i>definitions</i> (in this case:
117
* ["xdc", "http://www.xml.com/books"])
118
* \param nb_attributes number of attributes on that node (1)
119
* \param nb_defaulted number of defaulted attributes (0)
120
* \param attributes pointer to the array (of length 5*nb_attributes) of
121
* attributes with the following content:<br />
122
* 1. items with index i mod 5 == 0 point to the local attribute name ("match")<br />
123
* 2. items with index i mod 5 == 1 point to the namespace prefix of the attribute,
124
* if existing, to 0 otherwise (0)<br />
125
* 3. items with index i mod 5 == 2 point to the namespace URI of the attribute,
126
* if existing, to 0 otherwise (0)<br />
127
* 4. items with index i mod 5 == 3 point to the beginning of the attribute value
128
* in the XML char array. (i.e. to the 'a' of "author:*")<br />
129
* 5. items with index i mod 5 == 4 point to the character <b>after the end</b> of
130
* the attribute value in the XML char array. (i.e. to '"') So if you want
131
* to get the actual attribute value, start reading at the pointer in
132
* (4.) and stop at this one.
134
// Pure virtual hook. The static startElementNsSAX2Func trampoline in this
// class calls it, passing the callback's arguments through with the xmlChar
// pointers cast to const char pointers.
virtual void startElement(const char* localname, const char* prefix,
135
const char* uri, int nb_namespaces, const char** namespaces,
136
int nb_attributes,int nb_defaulted, const char** attributes) = 0;
138
* This is called when a closing XML tag was detected. For documentation
139
* purposes, let's suppose we have
140
* </xsl:template>
141
* The parameter descriptions will have in parentheses the value that would be
142
* passed for this example.
143
* \param localname pointer to local name of the element ("template")
144
* \param prefix pointer to element namespace prefix, if existing ("xsl"),
146
* \param uri pointer to element namespace name if available, i.e. if it
147
* was declared in the document element (could be
148
* "http://www.w3.org/1999/XSL/Transform"), to 0 otherwise
150
// Pure virtual hook. The static endElementNsSAX2Func trampoline in this class
// calls it with localname, prefix and URI cast to const char*.
virtual void endElement(const char* localname, const char* prefix,
151
const char* uri) = 0;
153
* Is called to pass some XML data to the analyzer. No assumptions like
154
* "all data until the next opening or closing tag" etc. can be made!
155
* \param data character data of the line
156
* \param length number of characters in that line
158
// Pure virtual hook. The static charactersSAXFunc trampoline in this class
// calls it; the int length supplied there is converted to uint32_t.
virtual void characters(const char* data, uint32_t length) = 0;
160
* Tells the caller whether you are finished with your analysis or not.
161
* If this returns true, this Analyzer will receive no more data from
163
* If this analyzer notices it is not able to use the given data (for
164
* example, this is a completely different file format), have this
165
* function return true ASAP.
166
* \return true if you are finished with this stream, false otherwise
168
// Pure virtual, non-const query returning a bool. Who polls it, and when, is
// not visible in this header.
virtual bool isReadyWithStream() = 0;