"""
HTML processing utilities
"""
from html import escape as html_escape
import lxml.html
from lxml.html.clean import Cleaner
from formasaurus.text import normalize_whitespaces
[docs]
def remove_by_xpath(tree, xpath):
    """
    Remove all HTML elements which match a given XPath expression.

    The tree is modified in place. Text which follows a removed element
    (its ``tail``) is not part of the element itself, so it is preserved:
    it is appended to the tail of the previous sibling or, when the removed
    element is a first child, to the text of its parent.

    Matches which have no parent (e.g. the root of the tree) cannot be
    detached from anything and are left untouched.
    """
    for bad in tree.xpath(xpath):
        parent = bad.getparent()
        if parent is None:
            # nothing to remove the element from
            continue
        if bad.tail:
            # parent.remove() drops the tail together with the element,
            # so re-attach it to the preceding node first
            previous = bad.getprevious()
            if previous is None:
                parent.text = (parent.text or "") + bad.tail
            else:
                previous.tail = (previous.tail or "") + bad.tail
        parent.remove(bad)
# Module-level HTML parser passed to ``lxml.html.fromstring`` by load_html();
# it is created with encoding="utf8", matching load_html's requirement that
# input is unicode or utf8-encoded bytes.
parser = lxml.html.HTMLParser(encoding="utf8")
[docs]
def load_html(tree_or_html, base_url=None):
    """
    Turn HTML source into a lxml tree.

    ``tree_or_html`` is expected to be unicode text or utf8-encoded bytes
    (even when the original page declares another encoding). Text is
    encoded to utf8 before it is handed to the module-level ``parser``.

    Any value which is neither ``str`` nor ``bytes`` is returned unchanged.
    """
    if isinstance(tree_or_html, (str, bytes)):
        if isinstance(tree_or_html, str):
            markup = tree_or_html.encode("utf8")
        else:
            markup = tree_or_html
        return lxml.html.fromstring(markup, base_url=base_url, parser=parser)
    return tree_or_html
def html_tostring(tree):
    """
    Serialize ``tree`` with ``lxml.html.tostring`` using
    ``pretty_print=True`` and ``encoding="unicode"``.
    """
    serialize_options = {"pretty_print": True, "encoding": "unicode"}
    return lxml.html.tostring(tree, **serialize_options)
def get_forms(tree):
    """Return the result of the ``//form`` XPath query run on ``tree``."""
    forms = tree.xpath("//form")
    return forms
[docs]
def get_field_names(elems):
    """
    Return the ``name`` attributes of ``elems`` without duplicates.

    Names are listed in order of first appearance; elements which have no
    ``name`` attribute, or an empty one, are ignored.
    """
    candidate_names = (getattr(el, "name", None) for el in elems)
    # dict keys keep insertion order, which gives an ordered de-duplication
    return list(dict.fromkeys(name for name in candidate_names if name))
[docs]
def get_visible_fields(form):
    """
    Return visible form fields (the ones users should fill).

    All ``textarea``, ``select`` and ``button`` descendants of ``form`` are
    returned, together with every ``input`` descendant whose ``type`` is not
    ``hidden``. The ``type`` value is compared ignoring letter case, so
    spellings such as ``hIdDeN`` are excluded too; inputs without a ``type``
    attribute are kept.

    Elements are returned in the order produced by the XPath query.
    """
    # FIXME: don't suggest readonly fields
    candidates = form.xpath(
        "descendant::textarea"
        "|descendant::select"
        "|descendant::button"
        "|descendant::input"
    )
    # XPath 1.0 has no case-insensitive comparison, so hidden inputs are
    # filtered out here instead of enumerating spellings in the expression.
    return [
        el
        for el in candidates
        if el.tag != "input" or (el.get("type") or "").lower() != "hidden"
    ]
[docs]
def get_fields_to_annotate(form):
    """
    Return the fields of ``form`` which should be annotated.

    A field qualifies when it is

    1. visible to the user (as reported by ``get_visible_fields``), and
    2. has a non-empty name (i.e. affects the form submission result).
    """

    def has_name(field):
        return bool(getattr(field, "name", None))

    return list(filter(has_name, get_visible_fields(form)))
[docs]
def escaped_with_field_highlighted(form_html, field_name):
    """
    Return escaped HTML source code suitable for displaying;
    fields with name==field_name are highlighted.

    ``form_html`` is loaded with ``load_html``; every descendant element
    whose ``name`` attribute equals ``field_name`` is surrounded with marker
    strings, the serialized tree is HTML-escaped, and the markers are then
    replaced by an opening/closing ``<span>``. If ``form_html`` is already a
    parsed tree, the markers are inserted into that tree itself.
    """
    form = load_html(form_html)
    # The name is passed as an XPath variable rather than formatted into the
    # expression: a name containing a double quote would otherwise produce
    # an invalid (or different) query.
    for elem in form.xpath(".//*[@name=$field_name]", field_name=field_name):
        add_text_before(elem, "__START__")
        add_text_after(elem, "__END__")
    text = html_tostring(form)
    # Markers are plain text, so they survive escaping and can be swapped
    # for real (unescaped) markup afterwards.
    text = (
        html_escape(text)
        .replace("__START__", '<span style="font-size:large;color:#000">')
        .replace("__END__", "</span>")
    )
    return text
[docs]
def highlight_fields(html, field_name):
    """
    Return HTML source code with all fields with name==field_name
    highlighted by adding ``formasaurus-field-highlighted`` CSS class.

    ``html`` is loaded with ``load_html``; the class is appended to the
    existing ``class`` attribute of every matching descendant element and
    the tree is serialized with ``html_tostring``. If ``html`` is already a
    parsed tree, that tree itself is modified.
    """
    tree = load_html(html)
    # The name is passed as an XPath variable rather than formatted into the
    # expression: a name containing a double quote would otherwise produce
    # an invalid (or different) query.
    for elem in tree.xpath(".//*[@name=$field_name]", field_name=field_name):
        elem.set("class", elem.get("class", "") + " formasaurus-field-highlighted")
    return html_tostring(tree)
[docs]
def add_text_after(elem, text):
    """
    Insert ``text`` immediately after ``elem``.

    The text is placed in front of whatever ``elem.tail`` already holds
    (a missing tail is treated as an empty string).
    """
    elem.tail = text + (elem.tail or "")
[docs]
def add_text_before(elem, text):
    """
    Insert ``text`` immediately before ``elem``.

    When ``elem`` has a previous sibling, the text is appended to that
    sibling's ``tail``; otherwise ``elem`` is a first child and the text is
    appended to the parent's ``text``.
    """
    sibling = elem.getprevious()
    if sibling is None:
        # first child: whatever precedes it lives in the parent's .text
        container = elem.getparent()
        container.text = (container.text or "") + text
        return
    # not a first child: whatever precedes it lives in the sibling's .tail
    sibling.tail = (sibling.tail or "") + text
# def assert_html_equal(want, got):
# """ Assert that 2 HTML documents are equal """
# checker = LXMLOutputChecker()
# if not checker.check_output(want, got, PARSE_HTML):
# message = checker.output_difference(Example("", want), got, 0)
# raise AssertionError(message)
[docs]
def get_text_around_elems(tree, elems):
    """
    Return a ``(before, after)`` tuple of ``{elem: text}`` dicts holding the
    text found before and after each of the given lxml DOM elements.

    ``tree`` is walked depth-first, collecting ``text`` and ``tail`` values.
    Whenever one of ``elems`` is reached, the text collected so far becomes
    its "before" text and collection starts over; the element itself is not
    descended into. The "after" text of an element is the "before" text of
    the element which follows it in ``elems``; for the last element it is
    whatever was collected after the walk passed the last of ``elems`` it
    reached. Collected pieces are stripped, passed through
    ``normalize_whitespaces`` and joined with single spaces.

    For empty ``elems`` a pair of empty dicts is returned.
    """
    if not elems:
        return {}, {}

    pending = []
    before = dict.fromkeys(elems, "")
    after = dict.fromkeys(elems, "")

    def drain():
        # join everything collected so far and start collecting afresh
        stripped = (piece.strip() for piece in pending if piece)
        joined = " ".join(
            normalize_whitespaces(piece) for piece in stripped if piece
        )
        pending.clear()
        return joined

    def walk(node):
        if node not in before:
            pending.append(node.text)
            for child in node:
                walk(child)
        else:
            # a requested element: record its preceding text, skip its content
            before[node] = drain()
        pending.append(node.tail)

    walk(tree)

    for earlier, later in zip(elems, elems[1:]):
        after[earlier] = before[later]
    after[elems[-1]] = drain()

    return before, after