D7net
Home
Console
Upload
information
Create File
Create Folder
About
Tools
:
/
usr
/
lib
/
python2.7
/
site-packages
/
pip
/
_vendor
/
html5lib
/
treewalkers
/
Filename :
__init__.py
back
Copy
"""A collection of modules for iterating through different kinds of tree, generating tokens identical to those produced by the tokenizer module. To create a tree walker for a new type of tree, you need to do implement a tree walker object (called TreeWalker by convention) that implements a 'serialize' method taking a tree as sole argument and returning an iterator generating tokens. """ from __future__ import absolute_import, division, unicode_literals __all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree", "pulldom"] import sys from .. import constants from ..utils import default_etree treeWalkerCache = {} def getTreeWalker(treeType, implementation=None, **kwargs): """Get a TreeWalker class for various types of tree with built-in support treeType - the name of the tree type required (case-insensitive). Supported values are: "dom" - The xml.dom.minidom DOM implementation "pulldom" - The xml.dom.pulldom event stream "etree" - A generic walker for tree implementations exposing an elementtree-like interface (known to work with ElementTree, cElementTree and lxml.etree). "lxml" - Optimized walker for lxml.etree "genshi" - a Genshi stream implementation - (Currently applies to the "etree" tree type only). A module implementing the tree type e.g. xml.etree.ElementTree or cElementTree.""" treeType = treeType.lower() if treeType not in treeWalkerCache: if treeType in ("dom", "pulldom"): name = "%s.%s" % (__name__, treeType) __import__(name) mod = sys.modules[name] treeWalkerCache[treeType] = mod.TreeWalker elif treeType == "genshi": from . import genshistream treeWalkerCache[treeType] = genshistream.TreeWalker elif treeType == "lxml": from . import lxmletree treeWalkerCache[treeType] = lxmletree.TreeWalker elif treeType == "etree": from . import etree if implementation is None: implementation = default_etree # XXX: NEVER cache here, caching is done in the etree submodule return etree.getETreeModule(implementation, **kwargs).TreeWalker return treeWalkerCache.get(treeType) def concatenateCharacterTokens(tokens): pendingCharacters = [] for token in tokens: type = token["type"] if type in ("Characters", "SpaceCharacters"): pendingCharacters.append(token["data"]) else: if pendingCharacters: yield {"type": "Characters", "data": "".join(pendingCharacters)} pendingCharacters = [] yield token if pendingCharacters: yield {"type": "Characters", "data": "".join(pendingCharacters)} def pprint(walker): """Pretty printer for tree walkers""" output = [] indent = 0 for token in concatenateCharacterTokens(walker): type = token["type"] if type in ("StartTag", "EmptyTag"): # tag name if token["namespace"] and token["namespace"] != constants.namespaces["html"]: if token["namespace"] in constants.prefixes: ns = constants.prefixes[token["namespace"]] else: ns = token["namespace"] name = "%s %s" % (ns, token["name"]) else: name = token["name"] output.append("%s<%s>" % (" " * indent, name)) indent += 2 # attributes (sorted for consistent ordering) attrs = token["data"] for (namespace, localname), value in sorted(attrs.items()): if namespace: if namespace in constants.prefixes: ns = constants.prefixes[namespace] else: ns = namespace name = "%s %s" % (ns, localname) else: name = localname output.append("%s%s=\"%s\"" % (" " * indent, name, value)) # self-closing if type == "EmptyTag": indent -= 2 elif type == "EndTag": indent -= 2 elif type == "Comment": output.append("%s<!-- %s -->" % (" " * indent, token["data"])) elif type == "Doctype": if token["name"]: if token["publicId"]: output.append("""%s<!DOCTYPE %s "%s" "%s">""" % (" " * indent, token["name"], token["publicId"], token["systemId"] if token["systemId"] else "")) elif token["systemId"]: output.append("""%s<!DOCTYPE %s "" "%s">""" % (" " * indent, token["name"], token["systemId"])) else: output.append("%s<!DOCTYPE %s>" % (" " * indent, token["name"])) else: output.append("%s<!DOCTYPE >" % (" " * indent,)) elif type == "Characters": output.append("%s\"%s\"" % (" " * indent, token["data"])) elif type == "SpaceCharacters": assert False, "concatenateCharacterTokens should have got rid of all Space tokens" else: raise ValueError("Unknown token type, %s" % type) return "\n".join(output)