1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
|
# Proxy functions and low level node allocation stuff
# Proxies represent elements, their reference is stored in the C
# structure of the respective node to avoid multiple instantiation of
# the Python class.
@cython.linetrace(False)
cdef inline _Element getProxy(xmlNode* c_node):
    u"""Return the Python proxy stored on ``c_node``.

    The result is ``None`` when ``c_node`` is NULL or when its
    ``_private`` slot does not hold a proxy pointer.
    """
    if c_node is NULL or c_node._private is NULL:
        return None
    return <_Element>c_node._private
@cython.linetrace(False)
cdef inline bint hasProxy(xmlNode* c_node):
    # A proxy is registered exactly when the node's ``_private`` slot is set.
    # ``c_node`` is dereferenced directly, so it must not be NULL.
    return c_node._private is not NULL
@cython.linetrace(False)
cdef inline int _registerProxy(_Element proxy, _Document doc,
                               xmlNode* c_node) except -1:
    u"""Bind ``proxy`` to ``c_node`` and to the document ``doc``.

    The proxy remembers its document and node, and the node's
    ``_private`` slot receives the proxy as a plain ``void*``.
    Asserts that the node has no proxy yet.  Returns 0.
    """
    assert c_node._private is NULL, u"double registering proxy!"
    proxy._doc = doc
    proxy._c_node = c_node
    c_node._private = <void*>proxy
    return 0
@cython.linetrace(False)
cdef inline int _unregisterProxy(_Element proxy) except -1:
    u"""Detach ``proxy`` from the node it is registered on.

    Asserts that the node's ``_private`` slot really points at this
    proxy, then clears the slot.  Returns 0.
    """
    assert proxy._c_node._private is <void*>proxy, u"Tried to unregister unknown proxy"
    proxy._c_node._private = NULL
    return 0
################################################################################
# temporarily make a node the root node of its document
cdef xmlDoc* _fakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node) except NULL:
    # Shorthand for _plainFakeRootDoc() with the "with_siblings" flag set.
    # The result must be released through _destroyFakeDoc().
    cdef bint with_siblings = True
    return _plainFakeRootDoc(c_base_doc, c_node, with_siblings)
cdef xmlDoc* _plainFakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node,
                               bint with_siblings) except NULL:
    u"""Build a temporary document that has ``c_node`` as its root node.

    If ``c_node`` already is the root element of ``c_base_doc`` (checked
    when ``with_siblings`` is set or the node has no siblings),
    ``c_base_doc`` itself is returned.  Otherwise a non-recursive copy of
    the document and of the node is created, and the children of
    ``c_node`` are temporarily re-parented to the copied root.  The
    original node is remembered in the ``_private`` slot of the new
    document.

    Note that copy and original must not be modified during the lifetime
    of the fake document!  Always call _destroyFakeDoc() after use!

    Raises MemoryError if libxml2 fails to copy the node.
    """
    cdef xmlNode* c_child
    cdef xmlNode* c_root
    cdef xmlNode* c_new_root
    cdef xmlDoc* c_doc
    if with_siblings or (c_node.prev is NULL and c_node.next is NULL):
        c_root = tree.xmlDocGetRootElement(c_base_doc)
        if c_root is c_node:
            # already the root node, no siblings
            return c_base_doc

    c_doc = _copyDoc(c_base_doc, 0)  # non recursive!
    c_new_root = tree.xmlDocCopyNode(c_node, c_doc, 2)  # non recursive!
    if c_new_root is NULL:
        # the copy failed: release the still empty document copy instead
        # of dereferencing a NULL pointer below
        tree.xmlFreeDoc(c_doc)
        raise MemoryError()
    tree.xmlDocSetRootElement(c_doc, c_new_root)
    _copyParentNamespaces(c_node, c_new_root)

    # let the copied root borrow the children of the original node
    c_new_root.children = c_node.children
    c_new_root.last = c_node.last
    c_new_root.next = c_new_root.prev = NULL

    # store original node
    c_doc._private = c_node

    # divert parent pointers of children
    c_child = c_new_root.children
    while c_child is not NULL:
        c_child.parent = c_new_root
        c_child = c_child.next

    c_doc.children = c_new_root
    return c_doc
cdef void _destroyFakeDoc(xmlDoc* c_base_doc, xmlDoc* c_doc):
    # Dispose of a temporary document created by _plainFakeRootDoc().
    # Does nothing when c_doc is the base document itself.
    cdef xmlNode* c_fake_root
    cdef xmlNode* c_orig_node
    cdef xmlNode* c_cur
    if c_doc is c_base_doc:
        return
    c_fake_root = tree.xmlDocGetRootElement(c_doc)
    # the original node was stashed in the document's _private slot
    c_orig_node = <xmlNode*>c_doc._private
    # give the children back to the node they were taken from
    c_cur = c_fake_root.children
    while c_cur is not NULL:
        c_cur.parent = c_orig_node
        c_cur = c_cur.next
    # unlink the children so that freeing the document leaves them alone
    c_fake_root.last = NULL
    c_fake_root.children = NULL
    tree.xmlFreeDoc(c_doc)
cdef _Element _fakeDocElementFactory(_Document doc, xmlNode* c_element):
    u"""Element factory that is aware of fake root documents.

    A node that lives in a different libxml2 document than ``doc``,
    whose document carries a ``_private`` pointer and which is the first
    child of that document, is taken to be a fake root node.  In that
    case the node stored in the document's ``_private`` slot (the
    original node) is instantiated in its place, because a proxy for the
    fake root would break once the fake document is destroyed.
    """
    if (c_element.doc is not doc._c_doc
            and c_element.doc._private is not NULL
            and c_element is c_element.doc.children):
        c_element = <xmlNode*>c_element.doc._private
    return _elementFactory(doc, c_element)
################################################################################
# support for freeing tree elements when proxy objects are destroyed
cdef int attemptDeallocation(xmlNode* c_node):
    u"""Try to free ``c_node`` or the subtree top above it.

    Returns 1 if a tree was freed, 0 otherwise (including the case that
    ``c_node`` is NULL).
    """
    cdef xmlNode* c_top
    if c_node is NULL:
        # there may be no tree behind the reference at all
        return 0
    c_top = getDeallocationTop(c_node)
    if c_top is NULL:
        return 0
    _removeText(c_top.next)  # tail
    tree.xmlFreeNode(c_top)
    return 1
cdef xmlNode* getDeallocationTop(xmlNode* c_node):
    u"""Find the topmost node above ``c_node`` that may be freed.

    Returns NULL if the node or one of its ancestors has a proxy, if an
    ancestor is a document node, or if the top node, its element
    siblings or any of their descendants still have a proxy.
    """
    cdef xmlNode* c_sibling
    if hasProxy(c_node):
        return NULL
    # climb up to the top of the subtree
    while c_node.parent is not NULL:
        c_node = c_node.parent
        if c_node.type in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE):
            # still attached to a document => keep the tree
            return NULL
        if hasProxy(c_node):
            return NULL
    # the subtree below the top node must be free of proxies
    if not canDeallocateChildNodes(c_node):
        return NULL
    # the same holds for all element siblings, backwards ...
    c_sibling = c_node.prev
    while c_sibling is not NULL:
        if _isElement(c_sibling) and (
                hasProxy(c_sibling) or not canDeallocateChildNodes(c_sibling)):
            return NULL
        c_sibling = c_sibling.prev
    # ... and forwards
    c_sibling = c_node.next
    while c_sibling is not NULL:
        if _isElement(c_sibling) and (
                hasProxy(c_sibling) or not canDeallocateChildNodes(c_sibling)):
            return NULL
        c_sibling = c_sibling.next
    return c_node
cdef int canDeallocateChildNodes(xmlNode* c_parent):
    # Return 0 as soon as a visited node has a registered proxy, else 1.
    cdef xmlNode* c_node
    c_node = c_parent.children
    # NOTE(review): BEGIN/END_FOR_EACH_ELEMENT_FROM are defined outside this
    # chunk; presumably a macro pair that expands to a loop over the elements
    # below c_parent, starting at c_node -- confirm against the tree module.
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_parent, c_node, 1)
    if hasProxy(c_node):
        return 0
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 1
################################################################################
# fix _Document references and namespaces when a node changes documents
cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node) nogil:
    u"""Declare the namespaces of the ancestors of c_from_node on c_to_node.

    Walks up the parent chain while the ancestor is an element/XInclude
    node or an XML document node and passes each of its namespace
    definitions to xmlNewNs() for c_to_node.
    """
    cdef xmlNode* c_ancestor = c_from_node.parent
    cdef xmlNs* c_ns_def
    while c_ancestor is not NULL:
        if not (tree._isElementOrXInclude(c_ancestor) or
                c_ancestor.type == tree.XML_DOCUMENT_NODE):
            break
        c_ns_def = c_ancestor.nsDef
        while c_ns_def is not NULL:
            # libxml2 will check if the prefix is already defined
            tree.xmlNewNs(c_to_node, c_ns_def.href, c_ns_def.prefix)
            c_ns_def = c_ns_def.next
        c_ancestor = c_ancestor.parent
# One namespace cache entry: pairs a namespace pointer that nodes of a moved
# subtree may reference ("old") with the pointer to use instead ("new").
# Both fields are the same pointer when a declaration is kept as is.
ctypedef struct _ns_update_map:
    xmlNs* old
    xmlNs* new
# Growable array of _ns_update_map entries, (re)allocated by _growNsCache()
# and filled by _appendToNsCache().
ctypedef struct _nscache:
    # entry storage
    _ns_update_map* ns_map
    # number of entries allocated in ns_map
    size_t size
    # number of entries in use, i.e. the index of the next free entry
    size_t last
cdef int _growNsCache(_nscache* c_ns_cache) except -1:
    # Enlarge the entry array of the cache: 20 entries on first use, twice
    # the previous capacity afterwards.  If the reallocation fails, the old
    # array is freed, the pointer is reset and MemoryError is raised.
    cdef _ns_update_map* c_resized
    if c_ns_cache.size:
        c_ns_cache.size *= 2
    else:
        c_ns_cache.size = 20
    c_resized = <_ns_update_map*> python.lxml_realloc(
        c_ns_cache.ns_map, c_ns_cache.size, sizeof(_ns_update_map))
    if c_resized is NULL:
        python.lxml_free(c_ns_cache.ns_map)
        c_ns_cache.ns_map = NULL
        raise MemoryError()
    c_ns_cache.ns_map = c_resized
    return 0
cdef inline int _appendToNsCache(_nscache* c_ns_cache,
                                 xmlNs* c_old_ns, xmlNs* c_new_ns) except -1:
    # Store the pair (c_old_ns, c_new_ns) in the next free cache entry,
    # growing the cache first when it is full.
    cdef _ns_update_map* c_slot
    if c_ns_cache.last >= c_ns_cache.size:
        _growNsCache(c_ns_cache)
    # take the address only after growing, the array may have moved
    c_slot = &c_ns_cache.ns_map[c_ns_cache.last]
    c_slot.old = c_old_ns
    c_slot.new = c_new_ns
    c_ns_cache.last += 1
    return 0
cdef int _stripRedundantNamespaceDeclarations(xmlNode* c_element, _nscache* c_ns_cache,
                                              xmlNs** c_del_ns_list) except -1:
    u"""Removes namespace declarations from an element that are already
    defined in its parents.  Does not free the xmlNs's, just prepends
    them to the c_del_ns_list.

    Every declaration of the element is recorded in c_ns_cache: kept
    declarations map to themselves, removed ones map to the declaration
    that xmlSearchNsByHref() found for the same href starting at the
    parent.  Returns 0; errors from appending to the cache propagate.
    """
    cdef xmlNs* c_ns
    cdef xmlNs* c_ns_next
    cdef xmlNs** c_nsdef
    # use a xmlNs** to handle assignments to "c_element.nsDef" correctly
    c_nsdef = &c_element.nsDef
    while c_nsdef[0] is not NULL:
        c_ns = tree.xmlSearchNsByHref(
            c_element.doc, c_element.parent, c_nsdef[0].href)
        if c_ns is NULL:
            # new namespace href => keep and cache the ns declaration
            _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0])
            # advance to the link field of the declaration that was kept
            c_nsdef = &c_nsdef[0].next
        else:
            # known namespace href => cache mapping and strip old ns
            _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns)
            # cut out the current declaration (c_nsdef[0]) and prepend it
            # to the garbage chain; c_nsdef itself stays in place and then
            # refers to the successor
            c_ns_next = c_nsdef[0].next
            c_nsdef[0].next = c_del_ns_list[0]
            c_del_ns_list[0] = c_nsdef[0]
            c_nsdef[0] = c_ns_next
    return 0
cdef void _cleanUpFromNamespaceAdaptation(xmlNode* c_start_node,
                                          _nscache* c_ns_cache, xmlNs* c_del_ns_list):
    # Recovery path for an exception raised in the middle of the namespace
    # adaptation (most likely an out-of-memory condition).  The cache array
    # is released and the namespace declarations that were already cut out
    # are put back at the end of the start node's declaration list, because
    # parts of the tree might still refer to them.
    cdef xmlNs* c_tail
    if c_ns_cache.ns_map is not NULL:
        python.lxml_free(c_ns_cache.ns_map)
    if c_del_ns_list is NULL:
        return
    if c_start_node.nsDef is NULL:
        c_start_node.nsDef = c_del_ns_list
        return
    # find the last declaration and chain the removed ones behind it
    c_tail = c_start_node.nsDef
    while c_tail.next is not NULL:
        c_tail = c_tail.next
    c_tail.next = c_del_ns_list
cdef int moveNodeToDocument(_Document doc, xmlDoc* c_source_doc,
                            xmlNode* c_element) except -1:
    u"""Fix the xmlNs pointers of a node and its subtree that were moved.

    Originally copied from libxml2's xmlReconciliateNs().  Expects
    libxml2 doc pointers of node to be correct already, but fixes
    _Document references.

    For each node in the subtree, we do this:

    1) Remove redundant declarations of namespace that are already
       defined in its parents.

    2) Replace namespaces that are *not* defined on the node or its
       parents by the equivalent namespace declarations that *are*
       defined on the node or its parents (possibly using a different
       prefix).  If a namespace is unknown, declare a new one on the
       node.

    3) Reassign the names of tags and attribute from the dict of the
       target document *iff* it is different from the dict used in the
       source subtree.

    4) Set the Document reference to the new Document (if different).
       This is done on backtracking to keep the original Document
       alive as long as possible, until all its elements are updated.

    Note that the namespace declarations are removed from the tree in
    step 1), but freed only after the complete subtree was traversed
    and all occurrences were replaced by tree-internal pointers.

    Returns 0.  Does nothing if c_element is not an element or XInclude
    node.  If an exception occurs during steps 1) or 2), the namespace
    cache is released, the removed declarations are re-attached to the
    start node and the exception is re-raised.
    """
    cdef xmlNode* c_start_node
    cdef xmlNode* c_node
    # NOTE(review): c_name, c_ns_next and c_nsdef are declared but not used
    # anywhere in this function.
    cdef char* c_name
    cdef _nscache c_ns_cache = [NULL, 0, 0]
    cdef xmlNs* c_ns
    cdef xmlNs* c_ns_next
    cdef xmlNs* c_nsdef
    # namespace declarations cut out in step 1), freed after the traversal
    cdef xmlNs* c_del_ns_list = NULL
    cdef size_t i, proxy_count = 0
    cdef bint is_prefixed_attr

    if not tree._isElementOrXInclude(c_element):
        return 0

    c_start_node = c_element

    # NOTE(review): BEGIN/END_FOR_EACH_FROM are defined outside this chunk;
    # presumably a macro pair that walks the subtree below c_start_node while
    # advancing c_element -- confirm against the tree module.
    tree.BEGIN_FOR_EACH_FROM(c_element, c_element, 1)
    if tree._isElementOrXInclude(c_element):
        # count the proxies so that step 4) knows how many to update
        if hasProxy(c_element):
            proxy_count += 1

        # 1) cut out namespaces defined here that are already known by
        #    the ancestors
        if c_element.nsDef is not NULL:
            try:
                _stripRedundantNamespaceDeclarations(c_element, &c_ns_cache, &c_del_ns_list)
            except:
                _cleanUpFromNamespaceAdaptation(c_start_node, &c_ns_cache, c_del_ns_list)
                raise

        # 2) make sure the namespaces of an element and its attributes
        #    are declared in this document (i.e. on the node or its parents)
        c_node = c_element
        while c_node is not NULL:
            if c_node.ns is not NULL:
                c_ns = NULL
                is_prefixed_attr = (c_node.type == tree.XML_ATTRIBUTE_NODE and c_node.ns.prefix)

                # linear search of the cache for the namespace of this node
                for i in range(c_ns_cache.last):
                    if c_node.ns is c_ns_cache.ns_map[i].old:
                        if is_prefixed_attr and not c_ns_cache.ns_map[i].new.prefix:
                            # avoid dropping prefix from attributes
                            continue
                        c_ns = c_ns_cache.ns_map[i].new
                        break

                if c_ns:
                    c_node.ns = c_ns
                else:
                    # not in cache or not acceptable
                    # => find a replacement from this document
                    try:
                        c_ns = doc._findOrBuildNodeNs(
                            c_start_node, c_node.ns.href, c_node.ns.prefix,
                            c_node.type == tree.XML_ATTRIBUTE_NODE)
                        # NOTE(review): c_node.ns is overwritten before it is
                        # passed to the cache, so the entry appended below
                        # maps the replacement namespace to itself rather
                        # than the previous namespace to its replacement.
                        # Verify whether caching the previous pointer was
                        # intended.
                        c_node.ns = c_ns
                        _appendToNsCache(&c_ns_cache, c_node.ns, c_ns)
                    except:
                        _cleanUpFromNamespaceAdaptation(c_start_node, &c_ns_cache, c_del_ns_list)
                        raise

            if c_node is c_element:
                # after the element, continue with its attributes
                c_node = <xmlNode*>c_element.properties
            else:
                c_node = c_node.next

    tree.END_FOR_EACH_FROM(c_element)

    # free now unused namespace declarations
    if c_del_ns_list is not NULL:
        tree.xmlFreeNsList(c_del_ns_list)

    # cleanup
    if c_ns_cache.ns_map is not NULL:
        python.lxml_free(c_ns_cache.ns_map)

    # 3) fix the names in the tree if we moved it from a different thread
    if doc._c_doc.dict is not c_source_doc.dict:
        fixThreadDictNames(c_start_node, c_source_doc.dict, doc._c_doc.dict)

    # 4) fix _Document references
    #    (and potentially deallocate the source document)
    if proxy_count > 0:
        if proxy_count == 1 and c_start_node._private is not NULL:
            # shortcut: try the start node's own proxy before walking the tree
            proxy = getProxy(c_start_node)
            if proxy is not None:
                if proxy._doc is not doc:
                    proxy._doc = doc
            else:
                fixElementDocument(c_start_node, doc, proxy_count)
        else:
            fixElementDocument(c_start_node, doc, proxy_count)

    return 0
cdef void fixElementDocument(xmlNode* c_element, _Document doc,
                             size_t proxy_count):
    # Set the _doc reference of the proxies found in the tree below c_element
    # to "doc".  proxy_count is the number of proxies to look for: the
    # function returns as soon as that many proxies have been visited.
    cdef xmlNode* c_node = c_element
    cdef _Element proxy = None # init-to-None required due to fake-loop below
    # NOTE(review): BEGIN/END_FOR_EACH_FROM are defined outside this chunk;
    # presumably a macro pair that walks the subtree with c_node as cursor.
    tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1)
    if c_node._private is not NULL:
        proxy = getProxy(c_node)
        if proxy is not None:
            if proxy._doc is not doc:
                proxy._doc = doc
            proxy_count -= 1
            if proxy_count == 0:
                return
    tree.END_FOR_EACH_FROM(c_node)
cdef void fixThreadDictNames(xmlNode* c_element,
                             tree.xmlDict* c_src_dict,
                             tree.xmlDict* c_dict) nogil:
    # Re-assign the names of tags and attributes to the dict "c_dict".
    #
    # Only meant to be called when the element is based on a different
    # libxml2 tag name dictionary ("c_src_dict").
    #
    # For a document node, its namespace definitions, both DTD subsets and
    # all of its children are processed.  For an element or XInclude node,
    # only that subtree is processed.  Other node types are ignored.
    cdef xmlNode* c_child
    if c_element.type in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE):
        # may define "xml" namespace
        fixThreadDictNsForNode(c_element, c_src_dict, c_dict)
        if c_element.doc.extSubset is not NULL:
            fixThreadDictNamesForDtd(c_element.doc.extSubset, c_src_dict, c_dict)
        if c_element.doc.intSubset is not NULL:
            fixThreadDictNamesForDtd(c_element.doc.intSubset, c_src_dict, c_dict)
        c_child = c_element.children
        while c_child is not NULL:
            fixThreadDictNamesForNode(c_child, c_src_dict, c_dict)
            c_child = c_child.next
    elif tree._isElementOrXInclude(c_element):
        fixThreadDictNamesForNode(c_element, c_src_dict, c_dict)
cdef inline void _fixThreadDictPtr(const_xmlChar** c_ptr,
                                   tree.xmlDict* c_src_dict,
                                   tree.xmlDict* c_dict) nogil:
    # Replace the string at c_ptr[0] by its counterpart from c_dict, but
    # only if the string is non-NULL and owned by the (non-NULL) source dict.
    c_current = c_ptr[0]
    if not c_current or not c_src_dict:
        return
    if not tree.xmlDictOwns(c_src_dict, c_current):
        return
    # the lookup can return NULL on memory error, which is not handled here:
    # the pointer is simply left unchanged in that case
    c_interned = tree.xmlDictLookup(c_dict, c_current, -1)
    if c_interned:
        c_ptr[0] = c_interned
cdef void fixThreadDictNamesForNode(xmlNode* c_element,
                                    tree.xmlDict* c_src_dict,
                                    tree.xmlDict* c_dict) nogil:
    # Move the dict-owned strings of the nodes in the tree below c_element
    # from c_src_dict to c_dict, depending on the node type:
    # - element / XInclude start: attributes, namespace definitions, tag name
    # - text: the content
    # - comment: nothing
    # - anything else: the node name
    cdef xmlNode* c_node = c_element
    # NOTE(review): BEGIN/END_FOR_EACH_FROM are defined outside this chunk;
    # presumably a macro pair that walks the subtree with c_node as cursor.
    tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1)
    if c_node.type in (tree.XML_ELEMENT_NODE, tree.XML_XINCLUDE_START):
        fixThreadDictNamesForAttributes(
            c_node.properties, c_src_dict, c_dict)
        fixThreadDictNsForNode(c_node, c_src_dict, c_dict)
        _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
    elif c_node.type == tree.XML_TEXT_NODE:
        # libxml2's SAX2 parser interns some indentation space
        fixThreadDictContentForNode(c_node, c_src_dict, c_dict)
    elif c_node.type == tree.XML_COMMENT_NODE:
        pass  # don't touch c_node.name
    else:
        _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
    tree.END_FOR_EACH_FROM(c_node)
cdef inline void fixThreadDictNamesForAttributes(tree.xmlAttr* c_attr,
                                                 tree.xmlDict* c_src_dict,
                                                 tree.xmlDict* c_dict) nogil:
    # Walk the attribute list starting at c_attr and move attribute names and
    # the content of the attribute value nodes from c_src_dict to c_dict.
    cdef xmlNode* c_child
    # attributes are traversed through the generic node struct
    cdef xmlNode* c_node = <xmlNode*>c_attr
    while c_node is not NULL:
        if c_node.type not in (tree.XML_TEXT_NODE, tree.XML_COMMENT_NODE):
            _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
        # libxml2 keeps some (!) attribute values in the dict
        c_child = c_node.children
        while c_child is not NULL:
            fixThreadDictContentForNode(c_child, c_src_dict, c_dict)
            c_child = c_child.next
        c_node = c_node.next
cdef inline void fixThreadDictContentForNode(xmlNode* c_node,
                                             tree.xmlDict* c_src_dict,
                                             tree.xmlDict* c_dict) nogil:
    # Replace the content of c_node by its counterpart from c_dict if the
    # content string is owned by the source dict c_src_dict.
    #
    # Nothing is done when there is no source dict.  xmlDictOwns() reports an
    # error (a non-zero value) for a NULL dict, which would otherwise be
    # mistaken for "owned" and replace content that is not a dict string.
    # This mirrors the guard in _fixThreadDictPtr().
    if c_src_dict is NULL:
        return
    # content that is NULL or that points at the node's own "properties"
    # field is never looked up
    if c_node.content is NULL or \
            c_node.content is <xmlChar*>&c_node.properties:
        return
    if tree.xmlDictOwns(c_src_dict, c_node.content):
        # result can be NULL on memory error, but we don't handle that here
        c_node.content = <xmlChar*>tree.xmlDictLookup(c_dict, c_node.content, -1)
cdef inline void fixThreadDictNsForNode(xmlNode* c_node,
                                        tree.xmlDict* c_src_dict,
                                        tree.xmlDict* c_dict) nogil:
    # Move href and prefix of every namespace definition of c_node from
    # c_src_dict to c_dict.
    cdef xmlNs* c_ns_def
    c_ns_def = c_node.nsDef
    while c_ns_def is not NULL:
        _fixThreadDictPtr(&c_ns_def.href, c_src_dict, c_dict)
        _fixThreadDictPtr(&c_ns_def.prefix, c_src_dict, c_dict)
        c_ns_def = c_ns_def.next
cdef void fixThreadDictNamesForDtd(tree.xmlDtd* c_dtd,
                                   tree.xmlDict* c_src_dict,
                                   tree.xmlDict* c_dict) nogil:
    # Move the strings of the element and entity declarations that are
    # direct children of c_dtd from c_src_dict to c_dict.  Other kinds of
    # DTD children are skipped.
    cdef xmlNode* c_decl = c_dtd.children
    cdef tree.xmlElement* c_elem_decl
    cdef tree.xmlAttribute* c_attr_decl
    cdef tree.xmlEntity* c_ent_decl
    while c_decl is not NULL:
        if c_decl.type == tree.XML_ENTITY_DECL:
            c_ent_decl = <tree.xmlEntity*>c_decl
            _fixThreadDictPtr(&c_ent_decl.name, c_src_dict, c_dict)
            _fixThreadDictPtr(&c_ent_decl.ExternalID, c_src_dict, c_dict)
            _fixThreadDictPtr(&c_ent_decl.SystemID, c_src_dict, c_dict)
            _fixThreadDictPtr(<const_xmlChar**>&c_ent_decl.content, c_src_dict, c_dict)
        elif c_decl.type == tree.XML_ELEMENT_DECL:
            c_elem_decl = <tree.xmlElement*>c_decl
            if c_elem_decl.content is not NULL:
                _fixThreadDictPtr(&c_elem_decl.content.name, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_elem_decl.content.prefix, c_src_dict, c_dict)
            # the attribute declarations of an element are chained via "nexth"
            c_attr_decl = c_elem_decl.attributes
            while c_attr_decl is not NULL:
                _fixThreadDictPtr(&c_attr_decl.defaultValue, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_attr_decl.name, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_attr_decl.prefix, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_attr_decl.elem, c_src_dict, c_dict)
                c_attr_decl = c_attr_decl.nexth
        c_decl = c_decl.next
################################################################################
# adopt an xmlDoc from an external libxml2 document source
cdef _Document _adoptForeignDoc(xmlDoc* c_doc, _BaseParser parser=None, bint is_owned=True):
    """Convert and wrap an externally produced xmlDoc for use in lxml.

    Assures that all '_private' pointers are NULL to prevent accidental
    dereference into lxml proxy objects.

    If ``is_owned`` is true, the document itself is wrapped after its
    '_private' pointers were reset.  Otherwise, a recursive copy made by
    xmlCopyDoc() is wrapped and the original is left alone.

    Raises ValueError if ``c_doc`` is NULL or not an XML/HTML document
    node (an owned document is freed before raising in the latter case),
    and MemoryError if copying the document fails.
    """
    if c_doc is NULL:
        raise ValueError("Illegal document provided: NULL")
    if c_doc.type not in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE):
        # read the type before the document is potentially freed
        doc_type = c_doc.type
        if is_owned:
            tree.xmlFreeDoc(c_doc)
        raise ValueError(f"Illegal document provided: expected XML or HTML, found {doc_type}")

    cdef xmlNode* c_node = <xmlNode*>c_doc

    if is_owned:
        # NOTE(review): BEGIN/END_FOR_EACH_FROM are defined outside this
        # chunk; presumably a macro pair that walks the whole tree with
        # c_node as cursor.
        tree.BEGIN_FOR_EACH_FROM(<xmlNode*>c_doc, c_node, 1)
        c_node._private = NULL
        tree.END_FOR_EACH_FROM(c_node)
    else:
        # create a fresh copy that lxml owns
        c_doc = tree.xmlCopyDoc(c_doc, 1)
        if c_doc is NULL:
            raise MemoryError()

    return _documentFactory(c_doc, parser)
|