Source code for formasaurus.text

import re

# Unicode-aware pattern: a token is a maximal run of word characters
# delimited by word boundaries.
_TOKEN_RE = re.compile(r"(?u)\b\w+\b")

tokenize = _TOKEN_RE.findall
""" Return the list of word tokens found in a text string """



[docs]
def ngrams(seq, min_n, max_n):
    """
    Collect every contiguous slice of ``seq`` whose length lies between
    ``min_n`` and ``max_n`` (both inclusive).

    Slices are grouped by length, shortest first; within one length they
    appear in left-to-right order. Lengths larger than ``len(seq)`` are
    skipped, so a too-short sequence simply yields fewer (or no) n-grams.
    Each n-gram has the same type that slicing ``seq`` produces.
    """
    size = len(seq)
    # No n-gram can be longer than the sequence itself.
    longest = min(max_n, size)
    return [
        seq[start : start + width]
        for width in range(min_n, longest + 1)
        for start in range(size - width + 1)
    ]




[docs]
def token_ngrams(tokens, min_n, max_n):
    """
    Build the ``min_n``..``max_n`` n-grams of a token list and return each
    one as a single string with its tokens separated by one space.
    """
    grams = ngrams(tokens, min_n, max_n)
    return list(map(" ".join, grams))



# Bound ``sub`` of a pattern matching any run of two or more whitespace
# characters; call as ``_replace_white_spaces(replacement, text)``.
_replace_white_spaces = re.compile(r"\s\s+").sub
# Bound ``sub`` of a pattern matching each individual "\n" or "\r"
# character; call as ``_replace_newlines(replacement, text)``.
_replace_newlines = re.compile(r"[\n\r]").sub



[docs]
def normalize_whitespaces(text):
    """
    Turn every newline / carriage return into a space, then collapse each
    run of two or more whitespace characters into a single space.

    A lone whitespace character that is not a newline (e.g. a single tab)
    is left as is.
    """
    return _replace_white_spaces(" ", _replace_newlines(" ", text))




[docs]
def normalize(text):
    """Default text normalization: lowercase, then normalize whitespace"""
    lowered = text.lower()
    return normalize_whitespaces(lowered)




[docs]
def number_pattern(text, ratio=0.3):
    """
    Return a "shape" of the text if it is sufficiently numeric.

    If the share of digit characters in ``text`` is at least ``ratio``
    (i.e. ``>= ratio``), return a copy of ``text`` in which every digit is
    replaced with ``X`` and every other word character (letters and
    underscore) is replaced with ``C``; punctuation and whitespace are kept
    unchanged. Otherwise, or when ``text`` is empty, return an empty string.

    :param text: string to convert.
    :param ratio: minimal fraction of digit characters (relative to the
        total length of ``text``) required to produce a pattern.
    :return: the masked pattern, or ``""``.
    """
    if not text:
        return ""

    digit_ratio = sum(ch.isdigit() for ch in text) / len(text)
    if digit_ratio < ratio:
        return ""

    # Mask letters *before* digits: doing it the other way round makes a
    # literal "X" in the input indistinguishable from a masked digit
    # (e.g. "X1" would come out as "XX" instead of "CX").
    masked_letters = re.sub(r"[^\W\d]", "C", text)
    return re.sub(r"\d", "X", masked_letters)
Source code for formasaurus.text

Formasaurus

Navigation