# Source code for formasaurus.html

"""
HTML processing utilities
"""

from html import escape as html_escape

import lxml.html
from lxml.html.clean import Cleaner

from formasaurus.text import normalize_whitespaces


def remove_by_xpath(tree, xpath):
    """
    Remove every element of ``tree`` matched by the ``xpath`` expression.

    Each matched element is detached from its own parent, so the tree
    is modified in place and nothing is returned.
    """
    matched = tree.xpath(xpath)
    for node in matched:
        owner = node.getparent()
        owner.remove(node)
# Module-level HTML parser configured with a fixed "utf8" encoding;
# ``load_html`` passes it to ``lxml.html.fromstring`` for every parse.
parser = lxml.html.HTMLParser(encoding="utf8")
def load_html(tree_or_html, base_url=None):
    """
    Parse HTML data to a lxml tree.

    ``tree_or_html`` must be either unicode or utf8-encoded bytes (even
    if the original page declares a different encoding): text is
    encoded to UTF-8 and the bytes are handed to the module-level
    ``parser``. ``base_url`` is forwarded to ``lxml.html.fromstring``.

    Anything that is neither ``str`` nor ``bytes`` is returned as-is.
    """
    if isinstance(tree_or_html, str):
        markup = tree_or_html.encode("utf8")
    elif isinstance(tree_or_html, bytes):
        markup = tree_or_html
    else:
        # Not a string: hand the object back untouched.
        return tree_or_html
    return lxml.html.fromstring(markup, base_url=base_url, parser=parser)
def html_tostring(tree):
    """Serialize ``tree`` to a pretty-printed unicode HTML string."""
    serialized = lxml.html.tostring(tree, encoding="unicode", pretty_print=True)
    return serialized


def get_forms(tree):
    """Return the result of the ``//form`` XPath query on ``tree``."""
    forms = tree.xpath("//form")
    return forms
def get_cleaned_form_html(form, human_readable=True):
    """
    Return a cleaned up version of <form> HTML contents.

    If ``human_readable`` is true, the HTML is cleaned to make the
    source code easier for humans to read: the cleaner runs with
    ``style=True`` and an ``allow_tags`` set of form-related tags, and
    the result is re-joined from its stripped, non-empty lines.
    Otherwise the cleaner runs with ``style=False`` and its output is
    returned unchanged; this mode is meant to make the form safer to
    render.
    """
    options = {
        "forms": False,
        "javascript": True,
        "scripts": True,
        "remove_unknown_tags": False,
    }
    if human_readable:
        options["style"] = True
        options["allow_tags"] = {
            "form",
            "input",
            "textarea",
            "label",
            "option",
            "select",
            "submit",
            "a",
        }
    else:
        options["style"] = False

    markup = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    cleaned = Cleaner(**options).clean_html(markup)
    if not human_readable:
        return cleaned

    # Drop blank lines and surrounding whitespace from every line.
    stripped_lines = (line.strip() for line in cleaned.splitlines())
    return "\n".join(line for line in stripped_lines if line)
def get_field_names(elems):
    """
    Return unique, non-empty ``name`` attributes of ``elems``.

    Names are listed in order of first appearance; elements with a
    missing or falsy ``name`` are skipped.
    """
    names = (getattr(el, "name", None) for el in elems)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(name for name in names if name))
# XPath selecting the form controls a user can interact with. The ``type``
# attribute is lower-cased with translate() before comparison, so every
# spelling of "hidden" ("HIDDEN", "Hidden", "hIdden", ...) is excluded.
_VISIBLE_FIELDS_XPATH = (
    "descendant::textarea"
    "|descendant::select"
    "|descendant::button"
    "|(descendant::input["
    'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'
    '!="hidden" or not(@type)])'
)


def get_visible_fields(form):
    """
    Return visible form fields (the ones users should fill).

    The result of the XPath query contains ``textarea``, ``select`` and
    ``button`` descendants of ``form``, plus every ``input`` descendant
    whose ``type`` attribute is absent or is not ``hidden`` (compared
    case-insensitively for ASCII letters).
    """
    # FIXME: don't suggest readonly fields
    return form.xpath(_VISIBLE_FIELDS_XPATH)
def get_fields_to_annotate(form):
    """
    Return the fields of ``form`` which should be annotated.

    A field qualifies when

    1. it is returned by ``get_visible_fields`` (visible to the user), and
    2. it has a non-empty name (i.e. affects the form submission result).
    """
    annotatable = []
    for field in get_visible_fields(form):
        if getattr(field, "name", None):
            annotatable.append(field)
    return annotatable
def escaped_with_field_highlighted(form_html, field_name):
    """
    Return escaped HTML source code suitable for displaying;
    fields with name==field_name are highlighted.

    Matching elements are surrounded with ``__START__`` / ``__END__``
    text markers, the whole tree is serialized and HTML-escaped, and the
    markers are then replaced by an opening and closing ``<span>``.

    ``field_name`` is passed to the query as an XPath variable rather
    than interpolated into the expression, so names containing quotes
    cannot break or alter the query.
    """
    form = load_html(form_html)
    for elem in form.xpath(".//*[@name=$field_name]", field_name=field_name):
        add_text_before(elem, "__START__")
        add_text_after(elem, "__END__")
    text = html_tostring(form)
    # Escape first, then swap the plain-text markers for real markup so
    # only the highlighting spans survive as live HTML.
    text = (
        html_escape(text)
        .replace("__START__", '<span style="font-size:large;color:#000">')
        .replace("__END__", "</span>")
    )
    return text
def highlight_fields(html, field_name):
    """
    Return HTML source code with all fields with name==field_name
    highlighted by adding ``formasaurus-field-highlighted`` CSS class.

    The class is appended to any existing ``class`` attribute value.
    ``field_name`` is passed to the query as an XPath variable rather
    than interpolated into the expression, so names containing quotes
    cannot break or alter the query.
    """
    tree = load_html(html)
    for elem in tree.xpath(".//*[@name=$field_name]", field_name=field_name):
        elem.set("class", elem.get("class", "") + " formasaurus-field-highlighted")
    return html_tostring(tree)
def add_text_after(elem, text):
    """Insert ``text`` right after ``elem`` by prepending it to ``elem.tail``."""
    existing_tail = elem.tail if elem.tail else ""
    elem.tail = f"{text}{existing_tail}"
def add_text_before(elem, text):
    """
    Insert ``text`` right before ``elem``.

    For a first child the text is appended to the parent's ``text``;
    otherwise it is appended to the ``tail`` of the previous sibling.
    """
    sibling = elem.getprevious()
    if sibling is None:
        # First child: the preceding text lives on the parent.
        container = elem.getparent()
        container.text = (container.text or "") + text
        return
    # Not a first child: the preceding text is the sibling's tail.
    sibling.tail = (sibling.tail or "") + text
# def assert_html_equal(want, got):
#     """ Assert that 2 HTML documents are equal """
#     checker = LXMLOutputChecker()
#     if not checker.check_output(want, got, PARSE_HTML):
#         message = checker.output_difference(Example("", want), got, 0)
#         raise AssertionError(message)
def get_text_around_elems(tree, elems):
    """
    Return (before, after) tuple with {elem: text} dicts
    containing text before a specified lxml DOM Element
    and after it.

    ``tree`` is walked depth-first while text and tail strings are
    collected in a buffer. Reaching one of ``elems`` flushes the buffer
    into that element's "before" text; the element's own text and its
    children are skipped, only its tail is collected. The "after" text
    of an element is the "before" text of the next one in ``elems``;
    the last element gets whatever is left in the buffer.

    When ``elems`` is empty, two empty dicts are returned.
    """
    if not elems:
        return {}, {}

    pending = []
    before = {el: "" for el in elems}
    after = {el: "" for el in elems}

    def drain():
        # Join the collected non-blank chunks and reset the buffer.
        pieces = []
        for chunk in pending:
            if chunk and chunk.strip():
                pieces.append(normalize_whitespaces(chunk.strip()))
        pending.clear()
        return " ".join(pieces)

    def walk(node):
        if node in before:
            # Target element: record preceding text, do not descend.
            before[node] = drain()
            pending.append(node.tail)
            return
        pending.append(node.text)
        for child in node:
            walk(child)
        pending.append(node.tail)

    walk(tree)

    for current, following in zip(elems[:-1], elems[1:]):
        after[current] = before[following]
    after[elems[-1]] = drain()
    return before, after