Source code for formasaurus.utils

import os
import re
import sys
from warnings import warn

import requests
import tldextract
from requests.compat import chardet
from w3lib.encoding import html_to_unicode


def dependencies_string():
    """
    Build a version tag covering formasaurus, Python, numpy, scipy
    and scikit-learn.

    Saved scikit-learn models may be not compatible between different
    numpy/scipy/scikit-learn versions, so the returned string can be
    used as a part of a file name to keep such models apart.

    Deprecated: calling this function emits a ``DeprecationWarning``.
    """
    warn(
        "formasaurus.utils.dependencies_string() is deprecated.",
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the caller
    )
    # Imported lazily, after the warning, so that importing this module
    # does not itself require these packages.
    import numpy
    import scipy
    import sklearn

    import formasaurus

    major, minor = sys.version_info[:2]
    versions = (
        formasaurus.__version__,
        "%s.%s" % (major, minor),
        numpy.__version__,
        scipy.__version__,
        sklearn.__version__,
    )
    return "{}-py{}-numpy{}-scipy{}-sklearn{}".format(*versions)
# A URI scheme (letter, then letters/digits/"+"/"."/"-") followed by "://".
# Used with ``match`` so it only recognises a scheme at the very start.
_URL_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.-]*://")


def add_scheme_if_missing(url):
    """
    Return ``url`` with an ``http`` scheme prepended when it has none.

    A URL is considered to have a scheme only when it *starts* with
    ``<scheme>://``; a ``//`` that appears later (in the path or in a
    query string) does not count. Protocol-relative URLs (``//host/...``)
    get ``http:`` prepended.

    >>> add_scheme_if_missing("example.org")
    'http://example.org'
    >>> add_scheme_if_missing("https://example.org")
    'https://example.org'
    >>> add_scheme_if_missing("example.org/?next=http://other.org")
    'http://example.org/?next=http://other.org'
    >>> add_scheme_if_missing("//example.org")
    'http://example.org'
    """
    if _URL_SCHEME_RE.match(url):
        return url
    if url.startswith("//"):
        # Protocol-relative URL: only the scheme name itself is missing.
        return "http:%s" % url
    return "http://%s" % url
def get_domain(url):
    """
    Return the ``domain`` part of ``url`` as reported by
    ``tldextract.extract``, i.e. without subdomains and without
    the public suffix.

    >>> get_domain('example.org')
    'example'
    >>> get_domain('foo.example.co.uk')
    'example'
    """
    extracted = tldextract.extract(url)
    return extracted.domain
def inverse_mapping(dct):
    """
    Swap keys and values of ``dct`` and return the result as a new dict.

    When several keys share one value, the key seen last during
    iteration is the one that is kept.

    >>> inverse_mapping({'x': 5})
    {5: 'x'}
    """
    inverted = {}
    for key, value in dct.items():
        inverted[value] = key
    return inverted
def at_root(*args):
    """
    Join ``args`` onto the directory that contains this module
    and return the resulting path.
    """
    package_dir = os.path.dirname(__file__)
    return os.path.join(package_dir, *args)
def thresholded(dct, threshold):
    """
    Return a copy of ``dct`` that keeps only the items whose value
    is greater than or equal to ``threshold``.

    >>> thresholded({'foo': 0.5, 'bar': 0.1}, 0.5)
    {'foo': 0.5}
    >>> thresholded({'foo': 0.5, 'bar': 0.1, 'baz': 1.0}, 0.6)
    {'baz': 1.0}
    >>> dct = {'foo': 0.5, 'bar': 0.1, 'baz': 1.0, 'spam': 0.0}
    >>> thresholded(dct, 0.0) == dct
    True
    """
    kept = {}
    for key, value in dct.items():
        if value >= threshold:
            kept[key] = value
    return kept
def download(url, timeout=30):
    """
    Download a web page from url, return its content as unicode.

    ``url`` is first passed through ``add_scheme_if_missing``, so a bare
    host name such as ``"example.org"`` is accepted. The response body
    is decoded with ``response2unicode``.

    ``timeout`` is forwarded to ``requests.get`` (seconds, or a
    ``(connect, read)`` tuple). It defaults to a finite value because
    ``requests`` otherwise waits indefinitely on an unresponsive server;
    pass ``timeout=None`` to get that unlimited wait.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    url = add_scheme_if_missing(url)
    resp = requests.get(url, timeout=timeout)
    return response2unicode(resp)
def response2unicode(resp):
    """
    Decode the body of a requests.Response into unicode text.

    Unlike ``response.text`` it handles <meta> tags in response content:
    the ``Content-Type`` header and the raw body are both handed to
    ``w3lib.encoding.html_to_unicode``, with ``_autodetect_encoding``
    supplied as the auto-detection callback.
    """
    content_type = resp.headers.get("Content-Type")
    # html_to_unicode returns (encoding, text); only the text is needed.
    _encoding, text = html_to_unicode(
        content_type_header=content_type,
        html_body_str=resp.content,
        auto_detect_fun=_autodetect_encoding,
    )
    return text
def _autodetect_encoding(binary_data):
    """Return the ``"encoding"`` entry of ``chardet.detect(binary_data)``."""
    detection = chardet.detect(binary_data)
    return detection["encoding"]