src/sec_certs/utils/strings.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

from __future__ import annotations

import re
from functools import lru_cache

import spacy


@lru_cache
def load_spacy_model(spacy_model_to_load: str = "en_core_web_sm"):
    """
    Load a spaCy model by name with the "parser" and "ner" components disabled.

    The function is wrapped in ``functools.lru_cache``, so the result of ``spacy.load`` is memoised
    per distinct call arguments instead of being reloaded on every call.

    :param spacy_model_to_load: name passed to ``spacy.load``, defaults to "en_core_web_sm"
    :return: whatever ``spacy.load`` returns for the requested model
    """
    disabled_components = ["parser", "ner"]
    return spacy.load(spacy_model_to_load, disable=disabled_components)


def fully_sanitize_string(string: str) -> str:
    """
    Lowercase the string, drop trademark symbols, replace special characters with spaces
    and strip leading/trailing whitespace.

    :param string: text to sanitize
    :return: the sanitized text
    """
    lowered = string.lower()
    without_trademarks = discard_trademark_symbols(lowered)
    spaced = replace_special_chars_with_space(without_trademarks)
    return spaced.strip()


def replace_special_chars_with_space(string: str) -> str:
    """
    Replace every character that is not an ASCII letter, a digit, a space, a newline or a dot
    with a single space.

    The replacement is one-for-one, so the returned string has the same length as the input.

    :param string: text to process
    :return: text with the disallowed characters replaced by spaces
    """
    disallowed_char = re.compile(r"[^a-zA-Z0-9 \n\.]")
    return disallowed_char.sub(" ", string)


def discard_trademark_symbols(string: str) -> str:
    """
    Remove every occurrence of the registered (®) and trademark (™) symbols.

    :param string: text to process
    :return: text without the two symbols; all other characters are left untouched
    """
    cleaned = string
    for symbol in ("®", "™"):
        cleaned = cleaned.replace(symbol, "")
    return cleaned


def strip_manufacturer_and_version(string: str, manufacturers: set[str] | None, versions: set[str]) -> str:
    """
    Remove all manufacturer and version substrings from the string.

    Each token is lowercased and passed through ``replace_special_chars_with_space`` before it is
    searched for in the lowercased input; every occurrence is replaced by a single space and the
    intermediate result is stripped of surrounding whitespace.

    :param string: text to strip the tokens from
    :param manufacturers: manufacturer names to remove; ``None`` or an empty set means "none"
    :param versions: version strings to remove
    :return: lowercased text with the tokens removed, or the input unchanged when there is
             no non-empty token to remove
    """
    to_strip = versions | manufacturers if manufacturers else versions

    # Empty tokens are dropped up front: ``str.replace("", " ")`` would otherwise insert a space
    # between every pair of characters and mangle the whole string.
    needles = {replace_special_chars_with_space(token.lower()) for token in to_strip if token}
    if not needles:
        return string

    result = string.lower()
    # Sets have no stable iteration order. Process the longest token first (ties broken
    # alphabetically) so that a token contained in another one (e.g. "1.0" inside "1.0.1")
    # cannot break up the longer match, and so that the output is deterministic.
    for needle in sorted(needles, key=lambda item: (-len(item), item)):
        result = result.replace(needle, " ").strip()
    return result


def standardize_version_in_cert_name(string: str, detected_versions: set[str]) -> str:
    """
    Replace "version <ver>" / "v<ver>" occurrences in the string with " <ver> ".

    Matching is case-insensitive and each version is matched literally.

    :param string: certificate name to process
    :param detected_versions: version strings to standardize
    :return: the string with the matched version mentions replaced
    """
    # Longest version first (ties broken alphabetically): set iteration order is arbitrary, and
    # a shorter version such as "1.0" could otherwise rewrite part of "v1.0.1" before the longer
    # version is tried.
    for ver in sorted(detected_versions, key=lambda item: (-len(item), item)):
        if not ver:
            # An empty version would leave the quantifier below with nothing to repeat.
            continue

        # Escape the version so that "." and other regex metacharacters match themselves instead
        # of acting as wildcards/groups (or making the pattern invalid).
        literal_ver = re.escape(ver)

        # NOTE(review): kept from the original pattern for backward compatibility --
        #   * the spaces around "|" are literal (re.VERBOSE is not used), so the "version" form
        #     needs a trailing space and the "v" form needs a leading space to match;
        #   * the trailing "+" lets the last character of the version repeat.
        version_regex = rf"(\bversion\s*{literal_ver}+) | (\bv\s*{literal_ver}+)"

        # A callable replacement is inserted verbatim; a plain string would have its backslashes
        # interpreted as escape sequences by ``re.sub``.
        string = re.sub(version_regex, lambda _match, _repl=f" {ver} ": _repl, string, flags=re.IGNORECASE)
    return string


def lemmatize_product_name(nlp, product_name: str) -> str:
    """
    Sanitize the product name and join the lemmas of its tokens with single spaces.

    :param nlp: callable applied to the sanitized name; it must return an iterable of tokens
                exposing a ``lemma_`` attribute (presumably a spaCy pipeline -- confirm with callers)
    :param product_name: name to lemmatize
    :return: space-joined lemmas, or ``product_name`` itself when it is falsy
    """
    if product_name:
        sanitized_name = fully_sanitize_string(product_name)
        lemmas = [token.lemma_ for token in nlp(sanitized_name)]
        return " ".join(lemmas)
    return product_name