Source code for formasaurus.formtype_features

# -*- coding: utf-8 -*-
"""
This module provides scikit-learn transformers
for extracting features from HTML forms.

For all features X is a list of lxml <form> elements.
"""
from __future__ import absolute_import

import collections
from six.moves.urllib import parse as urlparse

import lxml.html

try:
    from sklearn.base import BaseEstimator, TransformerMixin
except ImportError:
    # for docs: when scikit-learn cannot be imported, define empty
    # stand-in base classes under the same names so that the classes
    # below can still be defined and this module stays importable.
    class BaseEstimator(object): pass
    class TransformerMixin(object): pass


from .utils import add_scheme_if_missing


class BaseFormFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Common base for the form feature extractors of this module.

    Subclasses implement :meth:`get_form_features`; :meth:`transform`
    applies it to every form of ``X``.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X, y=None):
        """Return a list with the features of each form in ``X``."""
        return list(map(self.get_form_features, X))

    def get_form_features(self, form):
        """Extract features from a single form; subclasses must override."""
        raise NotImplementedError()


class FormElements(BaseFormFeatureExtractor):
    """
    Features based on form HTML elements: counts of elements
    of different types, GET/POST form method.
    """

    def get_form_features(self, form):
        """Return a dict of element-based features for ``form``."""
        counts = _get_type_counts(form)
        features = {}

        # Presence flags: True when at least one field of that type exists.
        presence_flags = (
            ('has <textarea>', 'textarea'),
            ('has <input type=radio>', 'radio'),
            ('has <select>', 'select'),
            ('has <input type=checkbox>', 'checkbox'),
            ('has <input type=email>', 'email'),
        )
        for feature_name, type_ in presence_flags:
            features[feature_name] = counts[type_] > 0

        features['2 or 3 inputs'] = len(form.inputs.keys()) in (2, 3)

        password_count = counts['password']
        features['no <input type=password>'] = password_count == 0
        features['exactly one <input type=password>'] = password_count == 1
        features['exactly two <input type=password>'] = password_count == 2

        text_count = counts['text']
        features['no <input type=text>'] = text_count == 0
        features['exactly one <input type=text>'] = text_count == 1
        features['exactly two <input type=text>'] = text_count == 2
        features['3 or more <input type=text>'] = text_count >= 3

        # A method that is empty after normalization is reported as "MISSING".
        method = form.method.lower().strip()
        features['<form method'] = method or "MISSING"
        return features
class Bias(BaseFormFeatureExtractor):
    """
    The same as ``clf.intercept_``, but with regularization applied.
    Used mostly for debugging.
    """

    def get_form_features(self, form):
        """Return the constant ``{'bias': 1}`` feature; ``form`` is unused."""
        return dict(bias=1)
class FormText(BaseFormFeatureExtractor):
    """Text contents inside the form."""

    def get_form_features(self, form):
        """Return all text nodes found under ``form``, joined by spaces."""
        text_nodes = form.xpath(".//text()")
        return " ".join(text_nodes)
class FormInputNames(BaseFormFeatureExtractor):
    """
    Names of all non-hidden <input> elements, joined to a single string.
    """

    def get_form_features(self, form):
        """Return the space-joined names with ``_``, ``[`` and ``]`` removed."""
        name_attrs = form.xpath('.//input[not(@type="hidden")]/@name')
        joined = " ".join(name_attrs)
        for char in ("_", "[", "]"):
            joined = joined.replace(char, "")
        return joined
class FormInputHiddenNames(BaseFormFeatureExtractor):
    """
    Names of all <input type=hidden> elements, joined to a single string.
    """

    def get_form_features(self, form):
        """Return the space-joined names with ``_``, ``[`` and ``]`` removed."""
        name_attrs = form.xpath('.//input[@type="hidden"]/@name')
        joined = " ".join(name_attrs)
        for char in ("_", "[", "]"):
            joined = joined.replace(char, "")
        return joined
class FormLinksText(BaseFormFeatureExtractor):
    """
    Text of all links inside the form. It is helpful because e.g.
    registration links inside login forms are common.
    """

    def get_form_features(self, form):
        """Return the text nodes found under <a> elements, joined by spaces."""
        link_texts = form.xpath(".//a//text()")
        return " ".join(link_texts)
class SubmitText(BaseFormFeatureExtractor):
    """
    Text of all <input type=submit> buttons, joined to a single string.
    """

    def get_form_features(self, form):
        """Return the ``value`` attributes of submit inputs, space-joined."""
        submit_values = form.xpath('.//input[@type="submit"]/@value')
        return " ".join(submit_values)
class FormUrl(BaseFormFeatureExtractor):
    """<form action> value"""

    def get_form_features(self, form):
        """
        Return a normalized string built from the form ``action`` URL.

        A missing or empty ``action`` is returned unchanged. Otherwise the
        path, params, query and fragment of the URL are normalized with
        :meth:`_normalize` and concatenated, with ``#`` before the fragment.
        """
        action = form.get("action", "")
        if action:
            parsed = urlparse.urlparse(add_scheme_if_missing(action))
            path = self._normalize(parsed.path)
            params = self._normalize(parsed.params)
            query = self._normalize(parsed.query)
            fragment = self._normalize(parsed.fragment)
            return "%s%s%s#%s" % (path, params, query, fragment)
        return action

    def _normalize(self, part):
        """Return ``part`` with ``/``, ``_`` and ``-`` characters removed."""
        for char in ("/", "_", "-"):
            part = part.replace(char, "")
        return part
class FormCss(BaseFormFeatureExtractor):
    """Form CSS classes and ID"""

    def get_form_features(self, form):
        """Return the form ``class`` and ``id`` attributes, space-joined."""
        css_class = form.get("class", "")
        element_id = form.get("id", "")
        return " ".join((css_class, element_id))
class FormInputTitle(BaseFormFeatureExtractor):
    """<input title=...> values"""

    def get_form_features(self, form):
        """Return ``title`` attributes of non-hidden inputs, space-joined."""
        titles = form.xpath('.//input[not(@type="hidden")]/@title')
        return " ".join(titles)
class FormLabelText(BaseFormFeatureExtractor):
    """<label> values"""

    def get_form_features(self, form):
        """Return the text nodes found under <label> elements, space-joined."""
        label_texts = form.xpath('.//label//text()')
        return " ".join(label_texts)
class FormInputCss(BaseFormFeatureExtractor):
    """CSS classes and IDs of <input> elements"""

    def get_form_features(self, form):
        """
        Return "<class> <id>" for each non-hidden input, space-joined.

        Missing attributes contribute an empty string.
        """
        pieces = []
        for field in form.xpath('.//input[not(@type="hidden")]'):
            css_class = field.get("class", "")
            element_id = field.get("id", "")
            pieces.append("%s %s" % (css_class, element_id))
        return " ".join(pieces)
class OldLoginformFeatures(BaseFormFeatureExtractor):
    """Features that loginform library used."""

    def get_form_features(self, form):
        """Return the dict produced by ``loginform_features`` for ``form``."""
        features = loginform_features(form)
        return features
def loginform_features(form):
    """
    A dict with features from loginform library.

    All values are booleans computed from the number of form inputs
    and from the per-type field counts.
    """
    counts = _get_type_counts(form)
    features = {}
    features['2_or_3_inputs'] = len(form.inputs.keys()) in (2, 3)

    text_count = counts['text']
    features['typecount_text_gt1'] = text_count > 1
    features['typecount_text_0'] = text_count == 0

    password_count = counts['password']
    features['typecount_password_eq1'] = password_count == 1
    features['typecount_password_0'] = password_count == 0

    features['typecount_checkbox_gt1'] = counts['checkbox'] > 1
    features['typecount_radio_gt0'] = counts['radio'] > 0
    return features
def _get_type_counts(form):
    """
    Count the fields of ``form.inputs`` grouped by kind.

    An ``InputElement`` is counted under its ``type`` attribute, a
    ``TextareaElement`` under "textarea", a ``SelectElement`` under
    "select" and anything else under "other".

    Returns a ``defaultdict(int)``, so looking up a kind that did not
    occur yields 0.
    """
    def field_kind(field):
        # The lxml classes are looked up per field, so iterating over an
        # empty ``form.inputs`` never touches ``lxml.html``.
        if isinstance(field, lxml.html.InputElement):
            return field.type
        if isinstance(field, lxml.html.TextareaElement):
            return "textarea"
        if isinstance(field, lxml.html.SelectElement):
            return "select"
        return "other"

    counts = collections.defaultdict(int)
    for field in form.inputs:
        counts[field_kind(field)] += 1
    return counts