Source code for formasaurus.classifiers

from __future__ import annotations

import json
import os

import joblib
from lxml.html import HtmlElement
from platformdirs import user_data_path

from formasaurus import fieldtype_model, formtype_model
from formasaurus.html import get_fields_to_annotate, get_forms, load_html
from formasaurus.storage import Storage
from formasaurus.utils import at_root, thresholded

DEFAULT_DATA_PATH = at_root("data")



[docs]
def extract_forms(
    tree_or_html: HtmlElement | str | bytes,
    proba: bool = False,
    threshold: float = 0.05,
    fields: bool = True,
):
    """Return a list of ``(form_elem, form_info)`` tuples, one tuple for each
    form found on *tree_or_html*.

    ``form_info`` are :class:`dict` objects with the results of
    :meth:`classify` or :meth:`classify_proba`` calls, depending on *proba*.

    *tree_or_html* is the HTML document from which form data should be
    extracted, either an lxml tree or HTML source code as a string or bytes.

    *proba* determines whether *form_info* values in the result include
    probability data (``True``) or not (``False``, default).

    *threshold* is the minimum probability, in the [0, 1] range, for data to be
    included in the result.

    *fields* determines whether field type data is computed and included into
    the *form_info* values in the result (``True``, default) or not
    (``False``).
    """
    return get_instance().extract_forms(
        tree_or_html=tree_or_html,
        proba=proba,
        threshold=threshold,
        fields=fields,
    )




[docs]
def classify(form, fields=True):
    """
    Return ``{'form': 'type', 'fields': {'name': 'type', ...}}``
    dict with form type and types of its visible submittable fields.

    If ``fields`` argument is False, only information about form type is
    returned: ``{'form': 'type'}``.
    """
    return get_instance().classify(form, fields=fields)




[docs]
def classify_proba(form, threshold=0.0, fields=True):
    """
    Return dict with probabilities of ``form`` and its fields belonging
    to various form and field classes::

        {
            'form': {'type1': prob1, 'type2': prob2, ...},
            'fields': {
                'name': {'type1': prob1, 'type2': prob2, ...},
                ...
            }
        }

    ``form`` should be an lxml HTML <form> element.
    Only classes with probability >= ``threshold`` are preserved.

    If ``fields`` is False, only information about the form is returned::

        {
            'form': {'type1': prob1, 'type2': prob2, ...}
        }

    """
    return get_instance().classify_proba(
        form=form,
        threshold=threshold,
        fields=fields,
    )




[docs]
class FormFieldClassifier:
    """
    FormFieldClassifier detects HTML form and field types.
    """

    def __init__(self, form_classifier=None, field_model=None):
        self.form_classifier = form_classifier
        self._field_model = field_model

    @staticmethod
    def _field_filename(filename):
        return f"{filename}-field.joblib"

    @staticmethod
    def _form_filename(filename):
        return f"{filename}-form.json"


[docs]
    @classmethod
    def load(cls, filename=None, autocreate=True, rebuild=False):
        """
        Load extractor from file ``filename``.

        If the file is missing and ``autocreate`` option is True (default),
        the model is created using default parameters and training data.
        If ``filename`` is None then default model file name is used.

        Example - load the default extractor::

            ffc = FormFieldClassifier.load()

        """
        if filename is None:
            if env_path := os.environ.get("FORMASAURUS_MODEL"):
                filename = os.path.expanduser(env_path)
            else:
                filename = at_root("data", "model")
                if rebuild or not os.path.exists(cls._form_filename(filename)):
                    writable_folder = user_data_path(
                        appname="Formasaurus",
                        appauthor="Zyte",
                        roaming=True,
                        ensure_exists=True,
                    )
                    filename = str(writable_folder / "model")

        if rebuild or (autocreate and not os.path.exists(cls._form_filename(filename))):
            ex = cls.trained_on(DEFAULT_DATA_PATH)
            ex.save(filename)
            return ex

        with open(cls._form_filename(filename)) as fp:
            form_classifier = FormClassifier.from_dict(json.load(fp))
        field_model = joblib.load(cls._field_filename(filename))
        return cls(
            form_classifier=form_classifier,
            field_model=field_model,
        )



[docs]
    @classmethod
    def trained_on(cls, data_folder):
        """Return Formasaurus object trained on data from data_folder"""
        store = Storage(data_folder)
        print("Loading training data...")
        annotations = list(
            store.iter_annotations(
                simplify_form_types=True,
                simplify_field_types=True,
                verbose=True,
                leave=True,
            )
        )
        ex = cls()
        ex.train(annotations)
        return ex


    def save(self, filename):
        if self.form_classifier is None or self._field_model is None:
            raise ValueError("FormFieldExtractor is not trained")
        # Using joblib here is fine because we have control over
        # sklearn-cfrsuite, used for the field model.
        joblib.dump(self._field_model, self._field_filename(filename), compress=3)
        # For the form classifier we use a custom serialization implementation,
        # as using joblib could lead to breakages when mixing different
        # scikit-learn versions.
        with open(self._form_filename(filename), "w") as fp:
            json.dump(self.form_classifier.to_dict(), fp, ensure_ascii=False)


[docs]
    def train(self, annotations):
        """Train FormFieldExtractor on a list of FormAnnotation objects."""
        print("Training form type detector on %d example(s)..." % len(annotations))
        self.form_classifier = FormClassifier(full_type_names=True)
        self.form_classifier.train(annotations)

        print("Training field type detector...")
        self._field_model = fieldtype_model.train(
            annotations=annotations,
            use_precise_form_types=True,
            full_field_type_names=True,
            full_form_type_names=self.form_classifier.full_type_names,
            verbose=True,
        )



[docs]
    def classify(self, form, fields=True):
        """
        Return ``{'form': 'type', 'fields': {'name': 'type', ...}}``
        dict with form type and types of its visible submittable fields.

        If ``fields`` argument is False, only information about form type is
        returned: ``{'form': 'type'}``.
        """
        form_type = self.form_classifier.classify(form)
        res = {"form": form_type}
        if fields:
            field_elems = get_fields_to_annotate(form)
            xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
            yseq = self._field_model.predict_single(xseq)
            res["fields"] = {elem.name: cls for elem, cls in zip(field_elems, yseq)}
        return res



[docs]
    def classify_proba(self, form, threshold=0.0, fields=True):
        """
        Return dict with probabilities of ``form`` and its fields belonging
        to various form and field classes::

            {
                'form': {'type1': prob1, 'type2': prob2, ...},
                'fields': {
                    'name': {'type1': prob1, 'type2': prob2, ...},
                    ...
                }
            }

        ``form`` should be an lxml HTML <form> element.
        Only classes with probability >= ``threshold`` are preserved.

        If ``fields`` is False, only information about the form is returned::

            {
                'form': {'type1': prob1, 'type2': prob2, ...}
            }

        """
        form_types_proba = self.form_classifier.classify_proba(form, threshold)
        res = {"form": form_types_proba}

        if fields:
            form_type = max(form_types_proba, key=lambda p: form_types_proba[p])
            field_elems = get_fields_to_annotate(form)
            xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
            yseq = self._field_model.predict_marginals_single(xseq)
            res["fields"] = {
                elem.name: thresholded(probs, threshold)
                for elem, probs in zip(field_elems, yseq)
            }

        return res



[docs]
    def extract_forms(self, tree_or_html, proba=False, threshold=0.05, fields=True):
        """
        Given a lxml tree or HTML source code, return a list of
        ``(form_elem, form_info)`` tuples.

        ``form_info`` dicts contain results of :meth:`classify` or
        :meth:`classify_proba`` calls, depending on ``proba`` parameter.

        When ``fields`` is False, field type information is not computed.
        """
        if isinstance(tree_or_html, (str, bytes)):
            tree = load_html(tree_or_html)
        else:
            tree = tree_or_html
        forms = get_forms(tree)
        if proba:
            return [
                (form, self.classify_proba(form, threshold, fields)) for form in forms
            ]
        else:
            return [(form, self.classify(form, fields)) for form in forms]


    @property
    def form_classes(self):
        """Possible form classes"""
        return self.form_classifier.classes

    @property
    def field_classes(self):
        """Possible field classes"""
        return self._field_model.classes_




[docs]
class FormClassifier:
    """
    Convenience wrapper for scikit-learn based form type detection model.
    """

    def __init__(self, form_model=None, full_type_names=True):
        self.model = form_model
        self.full_type_names = full_type_names

    @classmethod
    def from_dict(cls, obj):
        return cls(
            form_model=formtype_model.from_dict(obj["model"]),
            full_type_names=obj["full_type_names"],
        )

    def to_dict(self):
        return {
            "model": formtype_model.to_dict(self.model),
            "full_type_names": self.full_type_names,
        }


[docs]
    def classify(self, form):
        """
        Return form class.
        ``form`` should be an lxml HTML <form> element.
        """
        return self.model.predict([form])[0]



[docs]
    def classify_proba(self, form, threshold=0.0):
        """
        Return form class.
        ``form`` should be an lxml HTML <form> element.
        """
        probs = self.model.predict_proba([form])[0]
        return self._probs2dict(probs, threshold)



[docs]
    def train(self, annotations):
        """Train formtype_model on a list of FormAnnotation objects."""
        self.model = formtype_model.train(
            annotations=annotations,
            full_type_names=self.full_type_names,
        )



[docs]
    def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
        """
        Given a lxml tree or HTML source code, return a list of
        ``(form_elem, form_info)`` tuples. ``form_info`` dicts contain results
        of :meth:`classify` or :meth:`classify_proba`` calls, depending on
        ``proba`` parameter.
        """
        forms = get_forms(load_html(tree_or_html))
        if proba:
            return [(form, self.classify_proba(form, threshold)) for form in forms]
        else:
            return [(form, self.classify(form)) for form in forms]


    @property
    def classes(self):
        if self.model is None:
            raise ValueError("formtype_model is not trained")
        return self.model.steps[-1][1].classes_

    def _probs2dict(self, probs, threshold):
        return thresholded(dict(zip(self.classes, probs)), threshold)



_form_field_classifier = None



[docs]
def get_instance():
    """Return a shared FormFieldClassifier instance"""
    global _form_field_classifier
    if _form_field_classifier is None:
        _form_field_classifier = FormFieldClassifier.load()
    return _form_field_classifier