Source code for formasaurus.annotation

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function
import collections

from sklearn.model_selection import GroupKFold

from formasaurus.html import get_fields_to_annotate
from formasaurus.utils import get_domain


# Immutable description of an annotation vocabulary.
# Within this module only two fields are read (by FormAnnotation):
#   * ``types_inv`` - indexed with a type value to obtain its full name;
#   * ``na_value``  - the type value compared against to detect
#     "not annotated yet".
# NOTE(review): ``types``, ``skip_value`` and ``simplify_map`` are not used
# in this part of the file; their exact meaning should be confirmed
# against the code which builds the schema.
AnnotationSchema = collections.namedtuple(
    'AnnotationSchema',
    ['types', 'types_inv', 'na_value', 'skip_value', 'simplify_map'],
)


# Plain tuple base which stores the raw annotation data; the public
# FormAnnotation class below adds computed properties on top of it.
_FormAnnotation = collections.namedtuple(
    'FormAnnotation',
    'form type index info key form_schema field_schema'
)


class FormAnnotation(_FormAnnotation):
    """
    Annotated HTML form.

    Tuple fields:

    * ``form`` - the form object; it is passed to
      ``get_fields_to_annotate`` to find the fields
      (presumably an lxml form element - confirm against the loader);
    * ``type`` - form type value; it is looked up in
      ``form_schema.types_inv`` and compared with ``form_schema.na_value``;
    * ``index`` - position of this form's entry in
      ``info['visible_html_fields']``;
    * ``info`` - mapping with at least ``'url'`` and
      ``'visible_html_fields'`` keys;
    * ``key`` - stored as-is and shown in ``repr``; not interpreted here;
    * ``form_schema``, ``field_schema`` - objects providing ``na_value``
      and ``types_inv`` (presumably ``AnnotationSchema`` instances).
    """

    @property
    def url(self):
        """ Value stored under the ``'url'`` key of ``info``. """
        return self.info['url']

    @property
    def fields(self):
        """
        {"field name": "field type"} dict.
        """
        return self.info['visible_html_fields'][self.index]

    @property
    def fields_annotated(self):
        """
        True if form has fields and all fields are annotated.
        """
        if not self.fields:
            return False
        return all(
            v != self.field_schema.na_value
            for v in self.fields.values()
        )

    @property
    def form_annotated(self):
        """ True if the form type is not the "not annotated" value. """
        return self.type != self.form_schema.na_value

    @property
    def fields_partially_annotated(self):
        """
        True when some fields are annotated and some are not annotated.
        """
        if not self.fields:
            return False
        na_value = self.field_schema.na_value
        values = self.fields.values()
        has_na = any(v == na_value for v in values)
        has_annotated = any(v != na_value for v in values)
        return has_na and has_annotated

    @property
    def field_elems(self):
        """
        Return a list of lxml Elements for fields which are annotated.
        Fields are returned in the order they appear in form; only
        visible submittable fields are considered.
        """
        return get_fields_to_annotate(self.form)

    @property
    def field_types(self):
        """
        A list of field types, in order they appear in form.
        Only visible submittable fields are considered.
        """
        fields = self.fields
        return [fields[field.name] for field in self.field_elems]

    @property
    def field_types_full(self):
        """
        A list of long field type names, in order they appear in form.
        Only visible submittable fields are considered.
        """
        return [self.field_schema.types_inv[tp] for tp in self.field_types]

    @property
    def type_full(self):
        """ Full form type name """
        return self.form_schema.types_inv[self.type]

    def __repr__(self):
        # ``info`` and both schemas are left out of the repr; only
        # ``url`` and ``fields`` (both derived from ``info``) are shown.
        return (
            "FormAnnotation(form={!r}, type={!r}, index={!r}, "
            "url={!r}, key={!r}, fields={!r})"
        ).format(
            self.form, self.type, self.index, self.url, self.key, self.fields
        )