Module hela.test_suite.description_tests
Expand source code
from hela import Catalog
from hela.math.tf_idf import find_similar_occurrences
from hela.math.preprocessing import clean_document
from hela.errors import ValidationError
from collections import defaultdict
def validate_description_similarity(root_catalog: Catalog, min_similarity: float = .75) -> bool:
    """Check that no two descriptions in the catalog are too similar to each other.

    Identical description texts are collapsed into a single entry before the
    comparison, so exact duplicates are not reported here; for that use
    validate_no_description_duplication().

    Args:
        root_catalog: The root catalog of your project
        min_similarity: Similarity threshold (0 to 1) handed to
            find_similar_occurrences(); every match it reports is treated as
            a validation failure.

    Returns:
        True if no description was too similar to another.

    Raises:
        ValidationError: Raised when descriptions were found that too closely resemble each other.
    """
    # Index the described objects by their description text. None entries are
    # skipped, matching validate_no_description_duplication(). When several
    # objects share the exact same text, the last one wins.
    desc_dict = {
        sd.description: sd
        for sd in root_catalog._all_descriptions()
        if sd is not None
    }

    # With fewer than two distinct descriptions there is no pair to compare.
    if len(desc_dict) < 2:
        return True

    similar_descriptions = find_similar_occurrences(list(desc_dict), min_similarity=min_similarity)

    msgs = []
    for sd in similar_descriptions:
        # Map both sides of the match back to the objects owning the texts.
        obj1 = desc_dict[sd.target_string]
        obj2 = desc_dict[sd.match_string]
        msgs.append(
            f'{obj1.type} <{obj1.name}> too closely resembles {obj2.type} <{obj2.name}>'
            f' ({round(sd.score, 2)}), descriptions:\n'
            f'<{obj1.name}> {obj1.description}\n<{obj2.name}> {obj2.description}'
        )

    if not msgs:
        return True

    # Report every offending pair in a single error, below a common header.
    raise ValidationError('\n'.join(['Found descriptions too close in similarity:'] + msgs))
def validate_no_description_duplication(root_catalog: Catalog) -> bool:
    """Check that no two objects in the catalog share the same description.

    Descriptions are compared after being passed through clean_document(), so
    two texts that clean to the same value count as duplicates.

    Args:
        root_catalog: The root catalog of your project

    Returns:
        True if no descriptions were duplicated

    Raises:
        ValidationError: If any duplicated descriptions were found.
    """
    # Group the described objects by their cleaned description text.
    objects_by_description = defaultdict(list)
    for sd in root_catalog._all_descriptions():
        if sd is None:
            continue
        objects_by_description[clean_document(sd.description)].append(sd)

    msgs = []
    for description, objects in objects_by_description.items():
        # A cleaned description owned by a single object is not a duplicate.
        if len(objects) < 2:
            continue
        obj_list = [f'{obj.name} ({obj.type})' for obj in objects]
        msgs.append(
            f'The following objects share the (tokenized) description <{description}>: {", ".join(obj_list)}'
        )

    if not msgs:
        return True

    # Report every duplicated group in a single error, below a common header.
    raise ValidationError('\n'.join(['Duplicated descriptions found:'] + msgs))
Functions
def validate_description_similarity(root_catalog: hela._catalog_class.Catalog, min_similarity: float = 0.75) ‑> True-
Runs a validation check to make sure there are no descriptions that are too similar within the catalog.
This check does not consider duplicated descriptions, for that use validate_no_description_duplication().
Args
root_catalog- The root catalog of your project
min_similarity- The minimum required similarity score before raising a validation error (0 to 1)
Returns
True if no description was too similar to another.
Raises
ValidationError- Raised when descriptions were found that too closely resemble each other.
Expand source code
def validate_description_similarity(root_catalog: Catalog, min_similarity: float = .75) -> True: """Runs a validation check to make sure there descriptions too similar within the catalog. This check does not consider duplicated descriptions, for that use validate_no_duplicated_descriptions(). Args: root_catalog: The root catalog of your project max_similarity: The minimum required similarity score before raising a validation error (0 to 1) Returns: True if no description was too similar to another. Raises: ValidationError: Raised when descriptions were found that too closely resembles each other. """ desc_dict = {sd.description: sd for sd in root_catalog._all_descriptions()} similar_descriptions = find_similar_occurrences(list(desc_dict.keys()), min_similarity=min_similarity) msgs = [] for sd in similar_descriptions: obj1 = desc_dict[sd.target_string] obj2 = desc_dict[sd.match_string] msgs.append( f'{obj1.type} <{obj1.name}> too closely resembles {obj2.type} <{obj2.name}>' f' ({round(sd.score, 2)}), descriptions:\n' f'<{obj1.name}> {obj1.description}\n<{obj2.name}> {obj2.description}' ) if msgs: msgs = ['Found descriptions too close in similarity:'] + msgs raise ValidationError('\n'.join(msgs)) return True def validate_no_description_duplication(root_catalog: hela._catalog_class.Catalog) ‑> True-
Runs a validation check to make sure no descriptions are duplicated within the catalog.
Args
root_catalog- The root catalog of your project
Returns
True if no descriptions were duplicated
Raises
ValidationError- If any duplicated descriptions were found.
Expand source code
def validate_no_description_duplication(root_catalog: Catalog) -> bool:
    """Check that no two objects in the catalog share the same description.

    Descriptions are compared after being passed through clean_document(), so
    two texts that clean to the same value count as duplicates.

    Args:
        root_catalog: The root catalog of your project

    Returns:
        True if no descriptions were duplicated

    Raises:
        ValidationError: If any duplicated descriptions were found.
    """
    # Group the described objects by their cleaned description text.
    objects_by_description = defaultdict(list)
    for sd in root_catalog._all_descriptions():
        if sd is None:
            continue
        objects_by_description[clean_document(sd.description)].append(sd)

    msgs = []
    for description, objects in objects_by_description.items():
        # A cleaned description owned by a single object is not a duplicate.
        if len(objects) < 2:
            continue
        obj_list = [f'{obj.name} ({obj.type})' for obj in objects]
        msgs.append(
            f'The following objects share the (tokenized) description <{description}>: {", ".join(obj_list)}'
        )

    if not msgs:
        return True

    # Report every duplicated group in a single error, below a common header.
    raise ValidationError('\n'.join(['Duplicated descriptions found:'] + msgs))