1
1
# Parsers for XML and HTML
3
from lxml.includes cimport xmlparser
4
from lxml.includes cimport htmlparser
6
6
cdef class _ParserContext(_ResolverContext)
7
7
cdef class _SaxParserContext(_ParserContext)
183
185
__GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
184
186
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
186
cdef int _checkThreadDict(tree.xmlDict* c_dict):
187
u"""Check that c_dict is either the local thread dictionary or the global
190
#if __GLOBAL_PARSER_CONTEXT._c_dict is c_dict:
191
# return 1 # main thread
192
if __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL) is c_dict:
193
return 1 # local thread dict
196
188
############################################################
197
189
## support for Python unicode I/O
198
190
############################################################
200
192
# name of Python unicode encoding as known to libxml2
201
cdef char* _UNICODE_ENCODING = NULL
193
cdef const_char* _UNICODE_ENCODING = NULL
203
195
cdef int _setupPythonUnicode() except -1:
204
196
u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
209
201
cdef tree.xmlCharEncodingHandler* enchandler
210
202
cdef Py_ssize_t l
203
cdef const_char* buffer
213
205
utext = python.PyUnicode_DecodeUTF8("<test/>", 7, NULL)
214
206
l = python.PyUnicode_GET_DATA_SIZE(utext)
215
207
buffer = python.PyUnicode_AS_DATA(utext)
234
226
_UNICODE_ENCODING = enc
237
cdef char* _findEncodingName(char* buffer, int size):
229
cdef const_char* _findEncodingName(const_char* buffer, int size):
238
230
u"Work around bug in libxml2: find iconv name of encoding on our own."
239
231
cdef tree.xmlCharEncoding enc
240
enc = tree.xmlDetectCharEncoding(buffer, size)
232
enc = tree.xmlDetectCharEncoding(<const_xmlChar*>buffer, size)
241
233
if enc == tree.XML_CHAR_ENCODING_UTF16LE:
242
234
return "UTF-16LE"
243
235
elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
297
291
cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
298
cdef cstd.FILE* c_stream
292
cdef stdio.FILE* c_stream
299
293
cdef xmlparser.xmlParserInputBuffer* c_buffer
300
294
c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
301
295
c_stream = python.PyFile_AsFile(self._filelike)
322
316
cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
323
317
cdef xmlDoc* result
324
318
cdef char* c_encoding
325
cdef cstd.FILE* c_stream
319
cdef stdio.FILE* c_stream
326
320
cdef xmlparser.xmlInputReadCallback c_read_callback
327
321
cdef xmlparser.xmlInputCloseCallback c_close_callback
328
322
cdef void* c_callback_context
368
362
remaining = byte_count - self._bytes_read
369
363
while c_requested > remaining:
370
364
c_start = _cstr(self._bytes) + self._bytes_read
371
cstd.memcpy(c_buffer, c_start, remaining)
365
cstring_h.memcpy(c_buffer, c_start, remaining)
372
366
c_byte_count += remaining
373
367
c_buffer += remaining
374
368
c_requested -= remaining
396
390
if c_requested > 0:
397
391
c_start = _cstr(self._bytes) + self._bytes_read
398
cstd.memcpy(c_buffer, c_start, c_requested)
392
cstring_h.memcpy(c_buffer, c_start, c_requested)
399
393
c_byte_count += c_requested
400
394
self._bytes_read += c_requested
401
395
return c_byte_count
408
402
return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
410
404
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
411
return cstd.fread(c_buffer, 1, c_size, <cstd.FILE*>ctxt)
405
return stdio.fread(c_buffer, 1, c_size, <stdio.FILE*>ctxt)
413
407
############################################################
414
408
## support for custom document loaders
415
409
############################################################
417
cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid,
411
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
418
412
xmlparser.xmlParserCtxt* c_context) with gil:
419
413
cdef _ResolverContext context
420
414
cdef xmlparser.xmlParserInput* c_input
440
434
# parsing a related document (DTD etc.) => UTF-8 encoded URL?
441
url = _decodeFilename(c_url)
435
url = _decodeFilename(<const_xmlChar*>c_url)
442
436
if c_pubid is NULL:
445
pubid = funicode(c_pubid) # always UTF-8
439
pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8
447
441
doc_ref = context._resolvers.resolve(url, pubid, context)
454
448
data = doc_ref._data_bytes
455
449
c_input = xmlparser.xmlNewInputStream(c_context)
456
450
if c_input is not NULL:
457
c_input.base = _cstr(data)
451
c_input.base = _xcstr(data)
458
452
c_input.length = python.PyBytes_GET_SIZE(data)
459
453
c_input.cur = c_input.base
460
c_input.end = &c_input.base[c_input.length]
454
c_input.end = c_input.base + c_input.length
461
455
elif doc_ref._type == PARSER_DATA_FILENAME:
463
457
c_input = xmlparser.xmlNewInputFromFile(
483
477
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
484
478
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
486
xmlparser.xmlSetExternalEntityLoader(_local_resolver)
480
xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
488
482
############################################################
490
484
############################################################
492
487
cdef class _ParserContext(_ResolverContext):
493
488
cdef _ErrorLog _error_log
494
489
cdef _ParserSchemaValidationContext _validator
538
533
self._lock, python.WAIT_LOCK)
540
535
raise ParserError, u"parser locking failed"
541
self._error_log.connect()
536
self._error_log.clear()
537
self._c_ctxt.sax.serror = _receiveParserError
542
538
if self._validator is not None:
543
self._validator.connect(self._c_ctxt)
539
self._validator.connect(self._c_ctxt, self._error_log)
546
542
cdef int cleanup(self) except -1:
548
544
self._validator.disconnect()
549
545
self._resetParserContext()
551
self._error_log.disconnect()
547
self._c_ctxt.sax.serror = NULL
552
548
if config.ENABLE_THREADING and self._lock is not NULL:
553
549
python.PyThread_release_lock(self._lock)
576
572
if c_ctxt is not NULL:
577
573
context._initParserContext(c_ctxt)
575
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
576
(<_ParserContext>_parser_context._private)._error_log._receive(error)
578
cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
580
if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
581
_forwardError(NULL, error)
583
_forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
579
585
cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
580
586
_ErrorLog error_log) except 0:
581
587
if filename is not None and \
618
624
if context._validator is not None and \
619
625
not context._validator.isvalid():
620
626
well_formed = 0 # actually not 'valid', but anyway ...
621
elif recover or (c_ctxt.wellFormed and \
622
c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
627
elif recover or (c_ctxt.wellFormed and
628
c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
624
630
elif not c_ctxt.replaceEntities and not c_ctxt.validate \
625
631
and context is not None:
652
658
_raiseParseError(c_ctxt, filename, None)
654
660
if result.URL is NULL and filename is not None:
655
result.URL = tree.xmlStrdup(_cstr(filename))
661
result.URL = tree.xmlStrdup(_xcstr(filename))
656
662
if result.encoding is NULL:
657
result.encoding = tree.xmlStrdup("UTF-8")
663
result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
659
665
if context._validator is not None and \
660
666
context._validator._add_default_attributes:
680
686
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
681
687
xmlNode* c_node) nogil:
682
688
cdef xmlNode* c_attr
684
689
c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
685
690
if c_name is NULL:
687
692
if c_name is not c_node.name:
688
tree.xmlFree(c_node.name)
693
tree.xmlFree(<char*>c_node.name)
689
694
c_node.name = c_name
690
695
c_attr = <xmlNode*>c_node.properties
691
696
while c_attr is not NULL:
754
759
self._parse_options & xmlparser.XML_PARSE_DTDATTR)
755
760
pctxt = self._newParserCtxt()
756
761
if pctxt is NULL:
757
python.PyErr_NoMemory()
758
763
_initParserContext(self._parser_context, self._resolvers, pctxt)
759
764
if self._remove_comments:
760
765
pctxt.sax.comment = NULL
775
780
self._parse_options & xmlparser.XML_PARSE_DTDATTR)
776
781
pctxt = self._newPushParserCtxt()
777
782
if pctxt is NULL:
778
python.PyErr_NoMemory()
779
784
_initParserContext(
780
785
self._push_parser_context, self._resolvers, pctxt)
781
786
if self._remove_comments:
795
800
context._setTarget(target)
803
cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
804
cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
805
if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
806
# need to extend SAX1 context to SAX2 to get proper error reports
807
if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
808
sax = <xmlparser.xmlSAXHandler*> stdlib.malloc(sizeof(xmlparser.xmlSAXHandler))
811
cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
812
sizeof(htmlparser.htmlDefaultSAXHandler))
814
sax.initialized = xmlparser.XML_SAX2_MAGIC
815
sax.serror = _receiveParserError
816
sax.startElementNs = NULL
817
sax.endElementNs = NULL
798
821
cdef xmlparser.xmlParserCtxt* _newParserCtxt(self):
822
cdef xmlparser.xmlParserCtxt* c_ctxt
799
823
if self._for_html:
800
return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
824
c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
825
self._registerHtmlErrorHandler(c_ctxt)
802
return xmlparser.xmlNewParserCtxt()
827
c_ctxt = xmlparser.xmlNewParserCtxt()
804
830
cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self):
805
831
cdef xmlparser.xmlParserCtxt* c_ctxt
806
cdef char* c_filename
807
if self._filename is not None:
808
c_filename = _cstr(self._filename)
832
cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
811
833
if self._for_html:
812
834
c_ctxt = htmlparser.htmlCreatePushParserCtxt(
813
835
NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
814
836
if c_ctxt is not NULL:
837
self._registerHtmlErrorHandler(c_ctxt)
815
838
htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
817
840
c_ctxt = xmlparser.xmlCreatePushParserCtxt(
893
916
cdef xmlparser.xmlParserCtxt* pctxt
894
917
cdef Py_ssize_t py_buffer_len
895
918
cdef int buffer_len
919
cdef const_char* c_text
897
920
py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
898
if py_buffer_len > python.INT_MAX or _UNICODE_ENCODING is NULL:
921
if py_buffer_len > limits.INT_MAX or _UNICODE_ENCODING is NULL:
899
922
text_utf = python.PyUnicode_AsUTF8String(utext)
900
923
py_buffer_len = python.PyBytes_GET_SIZE(text_utf)
901
924
return self._parseDoc(_cstr(text_utf), py_buffer_len, c_filename)
934
957
cdef xmlDoc* result
935
958
cdef xmlparser.xmlParserCtxt* pctxt
936
959
cdef char* c_encoding
937
if c_len > python.INT_MAX:
960
if c_len > limits.INT_MAX:
938
961
raise ParserError, u"string is too long to parse it with libxml2"
940
963
context = self._getParserContext()
1065
1088
cdef _ParserContext context
1066
1089
cdef xmlparser.xmlParserCtxt* pctxt
1067
1090
cdef Py_ssize_t py_buffer_len
1069
cdef char* c_encoding
1091
cdef const_char* c_data
1092
cdef const_char* c_encoding
1070
1093
cdef int buffer_len
1072
1095
cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
1094
1117
context.prepare()
1095
1118
self._feed_parser_running = 1
1096
1119
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1098
if py_buffer_len > python.INT_MAX:
1099
buffer_len = python.INT_MAX
1101
buffer_len = <int>py_buffer_len
1102
1120
if self._for_html:
1103
error = _htmlCtxtResetPush(pctxt, c_data, buffer_len,
1104
c_encoding, self._parse_options)
1121
error = _htmlCtxtResetPush(
1122
pctxt, NULL, 0, c_encoding, self._parse_options)
1106
1124
xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
1107
1125
error = xmlparser.xmlCtxtResetPush(
1108
pctxt, c_data, buffer_len, NULL, c_encoding)
1109
py_buffer_len -= buffer_len
1110
c_data += buffer_len
1126
pctxt, NULL, 0, NULL, c_encoding)
1112
1128
#print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding
1114
1130
while py_buffer_len > 0 and (error == 0 or recover):
1116
if py_buffer_len > python.INT_MAX:
1117
buffer_len = python.INT_MAX
1132
if py_buffer_len > limits.INT_MAX:
1133
buffer_len = limits.INT_MAX
1119
1135
buffer_len = <int>py_buffer_len
1120
1136
if self._for_html:
1180
1196
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
1181
char* c_data, int buffer_len,
1182
char* c_encoding, int parse_options) except -1:
1197
const_char* c_data, int buffer_len,
1198
const_char* c_encoding, int parse_options) except -1:
1183
1199
cdef xmlparser.xmlParserInput* c_input_stream
1184
1200
# libxml2 crashes if spaceTab is not initialised
1185
1201
if _LIBXML_VERSION_INT < 20629 and c_ctxt.spaceTab is NULL:
1444
1460
c_filename = _cstr(filename_utf)
1445
1461
if python.PyUnicode_Check(text):
1446
1462
c_len = python.PyUnicode_GET_DATA_SIZE(text)
1447
if c_len > python.INT_MAX:
1463
if c_len > limits.INT_MAX:
1448
1464
return (<_BaseParser>parser)._parseDocFromFilelike(
1449
1465
StringIO(text), filename)
1450
1466
return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
1452
1468
c_len = python.PyBytes_GET_SIZE(text)
1453
if c_len > python.INT_MAX:
1469
if c_len > limits.INT_MAX:
1454
1470
return (<_BaseParser>parser)._parseDocFromFilelike(
1455
1471
BytesIO(text), filename)
1456
1472
c_text = _cstr(text)
1471
1487
cdef xmlDoc* result
1472
1488
result = tree.xmlNewDoc(NULL)
1473
1489
if result is NULL:
1474
python.PyErr_NoMemory()
1475
1491
if result.encoding is NULL:
1476
result.encoding = tree.xmlStrdup("UTF-8")
1492
result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
1477
1493
__GLOBAL_PARSER_CONTEXT.initDocDict(result)
1507
1523
c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
1508
1524
if c_node is NULL:
1509
python.PyErr_NoMemory()
1510
1526
tree.xmlDocSetRootElement(result, c_node)
1511
1527
_copyTail(c_new_root.next, c_node)
1535
1551
if base_url is not None:
1536
1552
base_url = _encodeFilenameUTF8(base_url)
1537
1553
if doc._c_doc.URL is not NULL:
1538
tree.xmlFree(doc._c_doc.URL)
1539
doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
1554
tree.xmlFree(<char*>doc._c_doc.URL)
1555
doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
1542
1558
if base_url is not None:
1555
1571
return _parseFilelikeDocument(
1556
1572
source, _encodeFilenameUTF8(url), parser)
1558
raise TypeError, u"cannot parse from '%s'" % funicode(python._fqtypename(source))
1574
raise TypeError, u"cannot parse from '%s'" % python._fqtypename(source).decode('UTF-8')
1560
1576
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
1561
1577
cdef xmlDoc* c_doc