Module `hela.math.levenshtein`

Expand source code

import numpy as np
from typing import Sequence
from hela.math.preprocessing import clean_corpus, clean_document
from hela.math.score_info import SimilarityInfo


def distance(s1: str, s2: str) -> float:
    size_x, size_y = len(s1) + 1, len(s2) + 1

    matrix = np.zeros((size_x, size_y), dtype=int)

    # Populate the top row, and leftmost column in the matrix
    for x in range(1, size_x):
        matrix[x, 0] = x
    for y in range(1, size_y):
        matrix[0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            # If the characters equal eachother we have 0 cost
            cost = 0 if s1[x - 1] == s2[y - 1] else 1

            matrix[x, y] = min(
                matrix[x - 1, y] + 1,  # Deletions
                matrix[x, y - 1] + 1,  # Insertions
                matrix[x - 1, y - 1] + cost  # Substitutions
            )
    return round(1 - matrix[-1, -1] / max(len(s1), len(s2)), 3)


def sort(query_str: str, corpus: Sequence[str], min_similarity: float = .75) -> Sequence[SimilarityInfo]:
    """Sorts a corpus of documents based on levenshtein distance similarity.

    Args:
        query_str:      The string to compare against the corpus
        corpus:         A sequence of documents as strings
        min_similarity: The minimum required similarity [0 to 1]

    Returns:
        A list of SimilarityInfo Objects
    """
    documents, tokenized_documents = clean_corpus(corpus)
    query_str = clean_document(query_str)

    matches = [
        SimilarityInfo(
            score=distance(query_str, document),
            match_idx=i,
            match_string=documents[i],
            target_string=query_str
        )
        for i, document in enumerate(tokenized_documents)
    ]
    matches = [m for m in matches if m.score >= min_similarity]
    return sorted(matches, key=lambda x: -x.score)


def replace(query_str: str, corpus: Sequence[str], min_similarity: float = .75) -> str:
    # Create boundaries for length of replacement string
    boundary_length = round(len(query_str) * (1 - min_similarity))
    min_len, max_len = len(query_str) - boundary_length, len(query_str) + boundary_length
    # Slim down corpus based on string length
    slimmed_corpus = [doc for doc in corpus if len(doc) >= min_len and len(doc) <= max_len]
    results = sort(query_str, slimmed_corpus, min_similarity=min_similarity)
    if len(results) > 0:
        return results[0].match_string
    return query_str

Functions

def distance(s1: str, s2: str) ‑> float

Expand source code

def distance(s1: str, s2: str) -> float:
    size_x, size_y = len(s1) + 1, len(s2) + 1

    matrix = np.zeros((size_x, size_y), dtype=int)

    # Populate the top row, and leftmost column in the matrix
    for x in range(1, size_x):
        matrix[x, 0] = x
    for y in range(1, size_y):
        matrix[0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            # If the characters equal eachother we have 0 cost
            cost = 0 if s1[x - 1] == s2[y - 1] else 1

            matrix[x, y] = min(
                matrix[x - 1, y] + 1,  # Deletions
                matrix[x, y - 1] + 1,  # Insertions
                matrix[x - 1, y - 1] + cost  # Substitutions
            )
    return round(1 - matrix[-1, -1] / max(len(s1), len(s2)), 3)

def replace(query_str: str, corpus: Sequence[str], min_similarity: float = 0.75) ‑> str

Expand source code

def replace(query_str: str, corpus: Sequence[str], min_similarity: float = .75) -> str:
    # Create boundaries for length of replacement string
    boundary_length = round(len(query_str) * (1 - min_similarity))
    min_len, max_len = len(query_str) - boundary_length, len(query_str) + boundary_length
    # Slim down corpus based on string length
    slimmed_corpus = [doc for doc in corpus if len(doc) >= min_len and len(doc) <= max_len]
    results = sort(query_str, slimmed_corpus, min_similarity=min_similarity)
    if len(results) > 0:
        return results[0].match_string
    return query_str

def sort(query_str: str, corpus: Sequence[str], min_similarity: float = 0.75) ‑> Sequence[SimilarityInfo]

Sorts a corpus of documents based on levenshtein distance similarity.

Args

query_str

The string to compare against the corpus

corpus

A sequence of documents as strings

min_similarity

The minimum required similarity [0 to 1]

Returns

A list of SimilarityInfo Objects

Expand source code

def sort(query_str: str, corpus: Sequence[str], min_similarity: float = .75) -> Sequence[SimilarityInfo]:
    """Sorts a corpus of documents based on levenshtein distance similarity.

    Args:
        query_str:      The string to compare against the corpus
        corpus:         A sequence of documents as strings
        min_similarity: The minimum required similarity [0 to 1]

    Returns:
        A list of SimilarityInfo Objects
    """
    documents, tokenized_documents = clean_corpus(corpus)
    query_str = clean_document(query_str)

    matches = [
        SimilarityInfo(
            score=distance(query_str, document),
            match_idx=i,
            match_string=documents[i],
            target_string=query_str
        )
        for i, document in enumerate(tokenized_documents)
    ]
    matches = [m for m in matches if m.score >= min_similarity]
    return sorted(matches, key=lambda x: -x.score)