usr/lib/python3.6/site-packages/html5lib/treebuilders/etree_lxml.py 0000644 00000033521 14720533754 0021354 0 ustar 00 """Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Docypes with no name
When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml
import lxml.etree as etree
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
comment_type = etree.Comment("asd").tag
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
# Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "" % element.docinfo.root_name
else:
dtd_str = """""" % (
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent + 2)
next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment
assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent + 2)
elif element.tag == comment_type:
rv.append("|%s" % (' ' * indent, element.text))
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
else:
assert isinstance(element, etree._Element)
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
infosetFilter.fromXmlName(tag)))
else:
rv.append("|%s<%s>" % (' ' * indent,
infosetFilter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = infosetFilter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s" % (prefix, name)
else:
attr_string = infosetFilter.fromXmlName(name)
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "" % element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif element.tag == comment_type:
rv.append("" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (element.tag,))
else:
attr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("%s>" % (element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
return "".join(rv)
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
implementation = etree
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = infosetFilter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = infosetFilter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return infosetFilter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = infosetFilter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = infosetFilter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(list(element))
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
"""Create the document root"""
# Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype:
assert self.doctype.name
docStr += "= 0 and sysid.find('"') >= 0:
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
sysid = sysid.replace("'", 'U00027')
if sysid.find("'") >= 0:
docStr += '"%s"' % sysid
else:
docStr += "'%s'" % sysid
else:
docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
docStr += ""
root = etree.fromstring(docStr)
# Append the initial comments:
for comment_token in self.initial_comments:
comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s" % (namespace, name)
root.tag = etree_tag
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
# Reset to the default insert comment function
self.insertComment = self.insertCommentMain
usr/lib/python3.6/site-packages/html5lib/treewalkers/etree_lxml.py 0000644 00000014231 14720544373 0021207 0 ustar 00 from __future__ import absolute_import, division, unicode_literals
from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
from . import base
from .. import _ihatexml
def ensure_str(s):
if s is None:
return None
elif isinstance(s, text_type):
return s
else:
return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
try:
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
except AttributeError:
pass
try:
node = et.getroot()
except AttributeError:
node = et
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = ensure_str(self.obj.text)
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = ensure_str(self.obj.tail)
else:
self.tail = None
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __bool__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __unicode__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
# pylint:disable=redefined-variable-type
if isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
else:
self.fragmentChildren = set()
tree = Root(tree)
base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (base.DOCUMENT,)
elif isinstance(node, Doctype):
return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
match = tag_regexp.match(ensure_str(node.tag))
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = ensure_str(node.tag)
attrs = {}
for name, value in list(node.attrib.items()):
name = ensure_str(name)
value = ensure_str(value)
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return (node, "tail") if node.tail else node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None
return node.getparent()