Source code for formasaurus.utils
import os
import sys
from warnings import warn
import requests
import tldextract
from requests.compat import chardet
from w3lib.encoding import html_to_unicode
[docs]
def dependencies_string():
    """
    Build a version tag describing the current runtime environment.

    The tag contains the versions of formasaurus, Python (major.minor),
    numpy, scipy and scikit-learn, in that order. Pickled scikit-learn
    models are not guaranteed to load across different versions of these
    packages, so the tag is suitable as a component of a model file name.

    Deprecated: calling this function emits a ``DeprecationWarning``
    attributed to the caller.
    """
    warn(
        "formasaurus.utils.dependencies_string() is deprecated.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Imported lazily so that merely importing this module does not pull in
    # the heavy numeric stack.
    import numpy
    import scipy
    import sklearn
    import formasaurus

    template = "{}-py{}-numpy{}-scipy{}-sklearn{}"
    major_minor = "%s.%s" % sys.version_info[:2]
    versions = (
        formasaurus.__version__,
        major_minor,
        numpy.__version__,
        scipy.__version__,
        sklearn.__version__,
    )
    return template.format(*versions)
[docs]
def add_scheme_if_missing(url):
    """
    Prepend ``http://`` to ``url`` unless it already starts with a scheme.

    A URL is left untouched when the text before its first ``//`` is either
    empty (protocol-relative ``//host/path``) or a syntactically valid
    ``scheme:`` (a letter followed by letters, digits, ``+``, ``-`` or
    ``.``). A ``//`` that appears later, e.g. inside the path or the query
    string, does not count as a scheme separator.

    >>> add_scheme_if_missing("example.org")
    'http://example.org'
    >>> add_scheme_if_missing("https://example.org")
    'https://example.org'
    >>> add_scheme_if_missing("example.org/go?to=http://other.com")
    'http://example.org/go?to=http://other.com'
    >>> add_scheme_if_missing("localhost:8080/a//b")
    'http://localhost:8080/a//b'
    """
    head, slashes, _ = url.partition("//")
    if slashes:
        # Leading whitespace is ignored for the check only; the URL itself
        # is returned unmodified.
        prefix = head.lstrip()
        scheme, colon, extra = prefix.partition(":")
        is_scheme = (
            bool(colon)
            and not extra  # the ":" must sit directly before the "//"
            and scheme[:1].isalpha()
            and all(ch.isalnum() or ch in "+-." for ch in scheme)
        )
        if not prefix or is_scheme:
            return url
    return "http://%s" % url
[docs]
def get_domain(url):
    """
    Return the ``domain`` component that ``tldextract`` finds in ``url``.

    >>> get_domain('example.org')
    'example'
    >>> get_domain('foo.example.co.uk')
    'example'
    """
    parts = tldextract.extract(url)
    return parts.domain
[docs]
def inverse_mapping(dct):
    """
    Swap keys and values of ``dct`` and return the result as a new dict.

    If several keys share the same value, the key seen last during
    iteration is the one kept.

    >>> inverse_mapping({'x': 5})
    {5: 'x'}
    """
    flipped = {}
    for key, value in dct.items():
        flipped[value] = key
    return flipped
[docs]
def at_root(*args):
    """Join ``args`` onto the directory that contains this module's file."""
    package_dir = os.path.dirname(__file__)
    return os.path.join(package_dir, *args)
[docs]
def thresholded(dct, threshold):
    """
    Return a new dict holding only the items of ``dct`` whose value is
    greater than or equal to ``threshold``.

    >>> thresholded({'foo': 0.5, 'bar': 0.1}, 0.5)
    {'foo': 0.5}
    >>> thresholded({'foo': 0.5, 'bar': 0.1, 'baz': 1.0}, 0.6)
    {'baz': 1.0}
    >>> dct = {'foo': 0.5, 'bar': 0.1, 'baz': 1.0, 'spam': 0.0}
    >>> thresholded(dct, 0.0) == dct
    True
    """
    kept = {}
    for key, value in dct.items():
        if value >= threshold:
            kept[key] = value
    return kept
[docs]
def download(url, timeout=None):
    """
    Download a web page from url, return its content as unicode.

    ``url`` is passed through ``add_scheme_if_missing`` first, so a bare
    host name such as ``example.org`` is accepted.

    ``timeout`` is forwarded unchanged to ``requests.get``. The default
    ``None`` keeps the previous behaviour of this function (no timeout is
    requested); pass a number of seconds, or a ``(connect, read)`` tuple,
    to bound how long the call may block.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    url = add_scheme_if_missing(url)
    resp = requests.get(url, timeout=timeout)
    return response2unicode(resp)
[docs]
def response2unicode(resp):
    """
    Decode the body of a ``requests.Response`` into unicode text.

    The decoding is delegated to ``w3lib.encoding.html_to_unicode``, which
    receives the ``Content-Type`` header, the raw body bytes and
    ``_autodetect_encoding`` as the auto-detection callback. Unlike
    ``response.text`` it handles <meta> tags in response content.

    Only the decoded text is returned; the encoding reported by
    ``html_to_unicode`` is discarded.
    """
    content_type = resp.headers.get("Content-Type")
    _, text = html_to_unicode(
        content_type_header=content_type,
        html_body_str=resp.content,
        auto_detect_fun=_autodetect_encoding,
    )
    return text
def _autodetect_encoding(binary_data):
    """Return the ``"encoding"`` entry of ``chardet.detect(binary_data)``."""
    detection = chardet.detect(binary_data)
    return detection["encoding"]