# Source code for formasaurus.classifiers

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os

import six
from sklearn.externals import joblib

from formasaurus import formtype_model, fieldtype_model
from formasaurus.html import get_forms, get_fields_to_annotate, load_html
from formasaurus.storage import Storage
from formasaurus.utils import dependencies_string, at_root, thresholded

DEFAULT_DATA_PATH = at_root('data')


def extract_forms(tree_or_html, proba=False, threshold=0.05):
    """
    Extract and classify forms using the shared classifier instance.

    Given a lxml tree or HTML source code, return a list of
    ``(form_elem, form_info)`` tuples. ``form_info`` dicts contain results
    of :meth:`FormFieldClassifier.classify` or
    :meth:`FormFieldClassifier.classify_proba` calls, depending on
    ``proba`` parameter.

    ``threshold`` is passed through to
    :meth:`FormFieldClassifier.extract_forms`, which uses it only
    when ``proba`` is true.
    """
    return instance().extract_forms(tree_or_html,
                                    proba=proba, threshold=threshold)


def classify(form):
    """
    Classify ``form`` using the shared classifier instance.

    Return ``{'form': 'type', 'fields': {'name': 'type', ...}}``
    dict with form type and types of its visible submittable fields.
    """
    return instance().classify(form)


def classify_proba(form, threshold=0.0):
    """
    Return dict with probabilities of ``form`` and its fields belonging
    to various form and field classes::

        {
            'form': {'type1': prob1, 'type2': prob2, ...},
            'fields': {
                'name': {'type1': prob1, 'type2': prob2, ...},
                ...
            }
        }

    ``form`` should be an lxml HTML <form> element.
    Only classes with probability >= ``threshold`` are preserved.

    The work is delegated to the shared classifier instance.
    """
    return instance().classify_proba(
        form=form,
        threshold=threshold,
    )


class FormFieldClassifier(object):
    """
    FormFieldClassifier detects HTML form and field types.

    It holds a form type classifier (``form_classifier``) and a field
    type model (``_field_model``); both are set by :meth:`train` or
    passed to the constructor.
    """
    def __init__(self, form_classifier=None, field_model=None):
        self.form_classifier = form_classifier
        self._field_model = field_model

    @classmethod
    def load(cls, filename=None, autocreate=True, rebuild=False):
        """
        Load extractor from file ``filename``.

        If the file is missing and ``autocreate`` option is True (default),
        the model is created using default parameters and training data.
        If ``rebuild`` is True the model is trained and saved to
        ``filename`` even if the file already exists.
        If ``filename`` is None then default model file name is used.

        Example - load the default extractor::

            ffc = FormFieldClassifier.load()

        """
        if filename is None:
            filename = cls._cached_model_path()

        if rebuild or (autocreate and not os.path.exists(filename)):
            ex = cls.trained_on(DEFAULT_DATA_PATH)
            ex.save(filename)
            return ex

        # NOTE(review): joblib files are pickle-based, so loading one can
        # execute arbitrary code. Only load model files you trust.
        return joblib.load(filename)

    @classmethod
    def trained_on(cls, data_folder):
        """
        Return a new classifier (an instance of ``cls``) trained on
        annotations read from ``data_folder``.
        """
        store = Storage(data_folder)
        print("Loading training data...")
        annotations = list(store.iter_annotations(
            simplify_form_types=True,
            simplify_field_types=True,
            verbose=True,
            leave=True,
        ))
        ex = cls()
        ex.train(annotations)
        return ex

    def save(self, filename):
        """
        Dump this classifier to ``filename`` using joblib.

        Raise ValueError if the classifier is not trained, i.e. the form
        classifier or the field model is missing.
        """
        if self.form_classifier is None or self._field_model is None:
            raise ValueError("FormFieldClassifier is not trained")
        joblib.dump(self, filename, compress=3)

    def train(self, annotations):
        """ Train FormFieldClassifier on a list of FormAnnotation objects. """
        print("Training form type detector on %d example(s)..." % len(annotations))
        self.form_classifier = FormClassifier(full_type_names=True)
        self.form_classifier.train(annotations)

        print("Training field type detector...")
        self._field_model = fieldtype_model.train(
            annotations=annotations,
            use_precise_form_types=True,
            full_field_type_names=True,
            full_form_type_names=self.form_classifier.full_type_names,
            verbose=True,
        )

    def classify(self, form):
        """
        Return ``{'form': 'type', 'fields': {'name': 'type', ...}}``
        dict with form type and types of its visible submittable fields.
        """
        form_type = self.form_classifier.classify(form)
        field_elems = get_fields_to_annotate(form)
        xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
        yseq = self._field_model.predict_single(xseq)
        return {
            'form': form_type,
            'fields': {
                elem.name: field_type
                for elem, field_type in zip(field_elems, yseq)
            }
        }

    def classify_proba(self, form, threshold=0.0):
        """
        Return dict with probabilities of ``form`` and its fields belonging
        to various form and field classes::

            {
                'form': {'type1': prob1, 'type2': prob2, ...},
                'fields': {
                    'name': {'type1': prob1, 'type2': prob2, ...},
                    ...
                }
            }

        ``form`` should be an lxml HTML <form> element.
        Only classes with probability >= ``threshold`` are preserved.
        """
        # The most probable form type is a feature for the field model, so
        # it must be picked from the unfiltered probabilities: with a high
        # ``threshold`` the filtered dict can be empty, and ``max`` of
        # an empty dict raises ValueError.
        all_form_types_proba = self.form_classifier.classify_proba(form, 0.0)
        form_type = max(all_form_types_proba, key=all_form_types_proba.get)
        field_elems = get_fields_to_annotate(form)
        xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
        yseq = self._field_model.predict_marginals_single(xseq)

        return {
            'form': thresholded(all_form_types_proba, threshold),
            'fields': {
                elem.name: thresholded(probs, threshold)
                for elem, probs in zip(field_elems, yseq)
            },
        }

    def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
        """
        Given a lxml tree or HTML source code, return a list of
        ``(form_elem, form_info)`` tuples. ``form_info`` dicts contain results
        of :meth:`classify` or :meth:`classify_proba` calls, depending on
        ``proba`` parameter. ``threshold`` is only used when ``proba``
        is true.
        """
        # Only strings/bytes are parsed; anything else is used as a tree.
        if isinstance(tree_or_html, (six.string_types, bytes)):
            tree = load_html(tree_or_html)
        else:
            tree = tree_or_html
        forms = get_forms(tree)
        if proba:
            return [(form, self.classify_proba(form, threshold))
                    for form in forms]
        else:
            return [(form, self.classify(form)) for form in forms]

    @classmethod
    def _cached_model_path(cls):
        """
        Return the default model file path.

        A non-empty ``FORMASAURUS_MODEL`` environment variable takes
        precedence (``~`` is expanded); otherwise the file name is built
        from ``dependencies_string()`` and resolved with ``at_root``.
        """
        env_path = os.environ.get("FORMASAURUS_MODEL")
        if env_path:
            return os.path.expanduser(env_path)
        path = "formasaurus-%s.joblib" % dependencies_string()
        return at_root(path)


class FormClassifier(object):
    """
    Convenience wrapper for scikit-learn based form type detection model.
    """
    def __init__(self, form_model=None, full_type_names=True):
        self.model = form_model
        self.full_type_names = full_type_names

    def classify(self, form):
        """
        Return form class.
        ``form`` should be an lxml HTML <form> element.
        """
        return self.model.predict([form])[0]

    def classify_proba(self, form, threshold=0.0):
        """
        Return a ``{form_class: probability}`` dict for ``form``.
        ``form`` should be an lxml HTML <form> element.
        Only classes with probability >= ``threshold`` are preserved.
        """
        probs = self.model.predict_proba([form])[0]
        return self._probs2dict(probs, threshold)

    def train(self, annotations):
        """ Train FormClassifier on a list of FormAnnotation objects. """
        self.model = formtype_model.train(
            annotations=annotations,
            full_type_names=self.full_type_names,
        )

    def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
        """
        Given a lxml tree or HTML source code, return a list of
        ``(form_elem, form_info)`` tuples. ``form_info`` contains results
        of :meth:`classify` or :meth:`classify_proba` calls, depending on
        ``proba`` parameter. ``threshold`` is only used when ``proba``
        is true.
        """
        # Parse only strings/bytes; an already parsed tree is used as-is,
        # the same way FormFieldClassifier.extract_forms handles its input.
        if isinstance(tree_or_html, (six.string_types, bytes)):
            tree = load_html(tree_or_html)
        else:
            tree = tree_or_html
        forms = get_forms(tree)
        if proba:
            return [(form, self.classify_proba(form, threshold))
                    for form in forms]
        else:
            return [(form, self.classify(form)) for form in forms]

    @property
    def classes(self):
        """
        Class labels (``classes_``) of the last step of the trained model.

        Raise ValueError if the model is not set.
        """
        if self.model is None:
            raise ValueError("FormClassifier is not trained")
        return self.model.steps[-1][1].classes_

    def _probs2dict(self, probs, threshold):
        """
        Pair ``probs`` with :attr:`classes` and drop entries
        below ``threshold``.
        """
        return thresholded(dict(zip(self.classes, probs)), threshold)



# Module-level cache for the classifier returned by instance().
_form_field_classifier = None


def instance():
    """
    Return a shared FormFieldClassifier instance.

    The classifier is loaded with ``FormFieldClassifier.load()`` on the
    first call and the same object is returned on later calls.
    """
    global _form_field_classifier
    if _form_field_classifier is None:
        _form_field_classifier = FormFieldClassifier.load()
    return _form_field_classifier