# Source code for formasaurus.annotation

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function
import collections

from sklearn.cross_validation import LabelKFold

from formasaurus.html import get_fields_to_annotate
from formasaurus.utils import get_domain


# Immutable description of an annotation vocabulary.
# Within this module only two attributes are read:
#   * ``na_value`` - the marker meaning "not annotated yet";
#   * ``types_inv`` - a mapping from a stored type value to its full name.
# ``types``, ``skip_value`` and ``simplify_map`` are not used in this module
# (presumably they are consumed by the storage / training code - TODO confirm).
AnnotationSchema = collections.namedtuple(
    'AnnotationSchema',
    ['types', 'types_inv', 'na_value', 'skip_value', 'simplify_map'],
)


# Plain tuple base for FormAnnotation; the subclass below only adds
# read-only helper properties and a custom ``__repr__``.
_FormAnnotation = collections.namedtuple(
    'FormAnnotation',
    'form type index info key form_schema field_schema'
)


class FormAnnotation(_FormAnnotation):
    """
    Annotated HTML form.

    Tuple fields, as they are used by the code of this class:

    * ``form`` - the form; it is passed to ``get_fields_to_annotate``;
    * ``type`` - form type value; it is compared with
      ``form_schema.na_value`` and looked up in ``form_schema.types_inv``;
    * ``index`` - position of this form in ``info['visible_html_fields']``;
    * ``info`` - a dict which has ``'url'`` and ``'visible_html_fields'``
      keys;
    * ``key`` - not interpreted here, only shown in ``repr``
      (presumably an identifier of the annotated page - TODO confirm);
    * ``form_schema``, ``field_schema`` - objects with ``na_value`` and
      ``types_inv`` attributes, for form types and field types respectively.
    """
    @property
    def url(self):
        """ URL of the annotated page, as stored in ``info['url']``. """
        return self.info['url']

    @property
    def fields(self):
        """
        {"field name": "field type"} dict.
        """
        return self.info['visible_html_fields'][self.index]

    @property
    def fields_annotated(self):
        """ True if form has fields and all fields are annotated. """
        fields = self.fields
        if not fields:
            return False
        na_value = self.field_schema.na_value
        return all(tp != na_value for tp in fields.values())

    @property
    def form_annotated(self):
        """ True if form type is not the "not annotated" value. """
        return self.type != self.form_schema.na_value

    @property
    def fields_partially_annotated(self):
        """
        True when some fields are annotated and some are not annotated.
        """
        fields = self.fields
        if not fields:
            return False
        na_value = self.field_schema.na_value
        # A list (not a generator) is built because it is scanned twice.
        is_na = [tp == na_value for tp in fields.values()]
        return any(is_na) and not all(is_na)

    @property
    def field_elems(self):
        """
        Return a list of lxml Elements for fields which are annotated.
        Fields are returned in the order they appear in form;
        only visible submittable fields are considered.
        """
        return get_fields_to_annotate(self.form)

    @property
    def field_types(self):
        """
        A list of field types, in order they appear in form.
        Only visible submittable fields are considered.

        Raises KeyError if an element name is missing from ``fields``.
        """
        fields = self.fields
        return [fields[elem.name] for elem in self.field_elems]

    @property
    def field_types_full(self):
        """
        A list of long field type names, in order they appear in form.
        Only visible submittable fields are considered.
        """
        full_names = self.field_schema.types_inv
        return [full_names[tp] for tp in self.field_types]

    @property
    def type_full(self):
        """ Full form type name """
        return self.form_schema.types_inv[self.type]

    def __repr__(self):
        # ``info`` and both schemas are left out on purpose: only ``url``
        # and this form's own ``fields`` are shown instead of them.
        template = (
            "FormAnnotation(form={!r}, type={!r}, index={!r}, "
            "url={!r}, key={!r}, fields={!r})"
        )
        return template.format(
            self.form, self.type, self.index, self.url, self.key, self.fields
        )


def get_annotation_folds(annotations, n_folds):
    """
    Return (train_indices, test_indices) folds iterator.
    It is guaranteed forms from the same website can't be both in
    train and test parts.

    We must be careful when splitting the dataset into training and
    evaluation parts: forms from the same domain should be in the same
    "bin". There could be several pages from the same domain, and these
    pages may have duplicate or similar forms (e.g. a search form on each
    page). If we put one such form in the training dataset and another in
    the evaluation dataset then the metrics will be too optimistic, and
    they can make us choose wrong features/models. For example,
    train_test_split from scikit-learn shouldn't be used here. To fix it
    LabelKFold from scikit-learn is used.

    ``annotations`` is an iterable of objects with an ``url`` attribute;
    ``n_folds`` is passed to LabelKFold as is.
    """
    # The domain of the page URL is the grouping label: all forms which
    # share a domain get the same label.
    domains = [get_domain(ann.url) for ann in annotations]
    # NOTE(review): sklearn.cross_validation.LabelKFold appears to be
    # deprecated in newer scikit-learn releases in favour of
    # sklearn.model_selection.GroupKFold - confirm against the pinned
    # scikit-learn version before upgrading.
    return LabelKFold(labels=domains, n_folds=n_folds)
# Source code for formasaurus.annotation
#
# Formasaurus
#
# Navigation