Source code for formasaurus.formtype_features

# -*- coding: utf-8 -*-
"""
This module provides scikit-learn transformers
for extracting features from HTML forms.

For all features X is a list of lxml <form> elements.
"""
from __future__ import absolute_import

import collections
from six.moves.urllib import parse as urlparse

import lxml.html

try:
    from sklearn.base import BaseEstimator, TransformerMixin
except ImportError:
    # for docs
    class BaseEstimator(object): pass
    class TransformerMixin(object): pass


from .utils import add_scheme_if_missing


class BaseFormFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return [self.get_form_features(form) for form in X]

    def get_form_features(self, form):
        raise NotImplementedError()


[docs]class FormElements(BaseFormFeatureExtractor):
    """
    Features based on form HTML elements: counts of elements
    of different types, GET/POST form method.
    """
    def get_form_features(self, form):
        typecounts = _get_type_counts(form)
        return {
            'has <textarea>': typecounts['textarea'] > 0,
            'has <input type=radio>': typecounts['radio'] > 0,
            'has <select>': typecounts['select'] > 0,
            'has <input type=checkbox>': typecounts['checkbox'] > 0,
            'has <input type=email>': typecounts['email'] > 0,

            '2 or 3 inputs': len(form.inputs.keys()) in {2, 3},

            'no <input type=password>': typecounts['password'] == 0,
            'exactly one <input type=password>': typecounts['password'] == 1,
            'exactly two <input type=password>': typecounts['password'] == 2,

            'no <input type=text>': typecounts['text'] == 0,
            'exactly one <input type=text>': typecounts['text'] == 1,
            'exactly two <input type=text>': typecounts['text'] == 2,
            '3 or more <input type=text>': typecounts['text'] >= 3,

            '<form method': form.method.lower().strip() or "MISSING",
        }


[docs]class Bias(BaseFormFeatureExtractor):
    """
    The same as ``clf.intercept_``, but with regularization applied.
    Used mostly for debugging.
    """
    def get_form_features(self, form):
        return {'bias': 1}


[docs]class FormText(BaseFormFeatureExtractor):
    """
    Text contents inside the form.
    """
    def get_form_features(self, form):
        return " ".join(form.xpath(".//text()"))


[docs]class FormInputNames(BaseFormFeatureExtractor):
    """
    Names of all non-hidden <input> elements, joined to a single string.
    """
    def get_form_features(self, form):
        names = " ".join(form.xpath('.//input[not(@type="hidden")]/@name'))
        return names.replace("_", "").replace("[", "").replace("]", "")


[docs]class FormInputHiddenNames(BaseFormFeatureExtractor):
    """
    Names of all <input type=hidden> elements, joined to a single string.
    """
    def get_form_features(self, form):
        names = " ".join(form.xpath('.//input[@type="hidden"]/@name'))
        return names.replace("_", "").replace("[", "").replace("]", "")


[docs]class FormLinksText(BaseFormFeatureExtractor):
    """
    Text of all links inside the form.
    It is helpful because e.g. registration links
    inside login forms are common.
    """
    def get_form_features(self, form):
        return " ".join(form.xpath(".//a//text()"))


[docs]class SubmitText(BaseFormFeatureExtractor):
    """
    Text of all <submit> buttons, joined to a single string.
    """
    def get_form_features(self, form):
        return " ".join(form.xpath('.//input[@type="submit"]/@value'))


[docs]class FormUrl(BaseFormFeatureExtractor):
    """ <form action> value """
    def get_form_features(self, form):
        url = form.get("action", "")
        if not url:
            return url
        url = add_scheme_if_missing(url)
        p = urlparse.urlparse(url)
        parts = [
            self._normalize(part)
            for part in [p.path, p.params, p.query, p.fragment]
        ]
        return "%s%s%s#%s" % tuple(parts)

    def _normalize(self, part):
        return part.replace("/", "").replace("_", "").replace("-", "")


[docs]class FormCss(BaseFormFeatureExtractor):
    """ Form CSS classes and ID """
    def get_form_features(self, form):
        return " ".join([
            form.get("class", ""),
            form.get("id", ""),
        ])


[docs]class FormInputTitle(BaseFormFeatureExtractor):
    """ <input title=...> values """
    def get_form_features(self, form):
        return " ".join(form.xpath('.//input[not(@type="hidden")]/@title'))


[docs]class FormLabelText(BaseFormFeatureExtractor):
    """ <label> values """
    def get_form_features(self, form):
        return " ".join(form.xpath('.//label//text()'))


[docs]class FormInputCss(BaseFormFeatureExtractor):
    """ CSS classes and IDs of <input> elemnts """
    def get_form_features(self, form):
        inputs = form.xpath('.//input[not(@type="hidden")]')
        return " ".join([
            "%s %s" % (inp.get("class", ""), inp.get("id", ""))
            for inp in inputs
        ])


[docs]class OldLoginformFeatures(BaseFormFeatureExtractor):
    """ Features that loginform library used. """
    def get_form_features(self, form):
        return loginform_features(form)


[docs]def loginform_features(form):
    """ A dict with features from loginform library """
    typecount = _get_type_counts(form)
    res = {
        '2_or_3_inputs': len(form.inputs.keys()) in {2, 3},
        'typecount_text_gt1': (typecount['text'] > 1),
        'typecount_text_0': (typecount['text'] == 0),
        'typecount_password_eq1': (typecount['password'] == 1),
        'typecount_password_0': (typecount['password'] == 0),
        'typecount_checkbox_gt1': (typecount['checkbox'] > 1),
        'typecount_radio_gt0': (typecount['radio'] > 0),
    }
    return res


def _get_type_counts(form):
    typecount = collections.defaultdict(int)
    for x in form.inputs:
        if isinstance(x, lxml.html.InputElement):
            type_ = x.type
        elif isinstance(x, lxml.html.TextareaElement):
            type_ = "textarea"
        elif isinstance(x, lxml.html.SelectElement):
            type_ = "select"
        else:
            type_ = "other"
        typecount[type_] += 1
    return typecount
Source code for formasaurus.formtype_features

Formasaurus

Navigation