1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
from __future__ import annotations
import html
import logging
import re
from datetime import date
from pathlib import Path
from urllib.parse import urlparse
import numpy as np
import pandas as pd
from bs4 import NavigableString
logger = logging.getLogger(__name__)
def sanitize_navigable_string(string: NavigableString | str | None) -> str | None:
    """Normalize the text of a parsed HTML string node.

    Soft hyphens (``\\xad``) and non-breaking spaces (``\\xa0``) are removed,
    every run of whitespace is collapsed into a single space, and leading and
    trailing whitespace is dropped.

    :param string: value to clean; anything falsy (``None``, ``""``) yields ``None``.
    :return: the cleaned text, or ``None`` when there was no input.
    """
    if not string:
        return None
    cleaned = str(string).replace("\xad", "").replace("\xa0", "")
    # Strip only after the invisible characters are gone: a soft hyphen is not
    # whitespace, so stripping first would leave whitespace adjacent to a
    # leading/trailing soft hyphen in the result.
    return re.sub(r"\s+", " ", cleaned).strip()
def sanitize_link(record: str | None) -> str | None:
    """Normalize a URL string.

    Every ``:443`` is removed, spaces are percent-encoded as ``%20`` and
    ``http://`` is rewritten to ``https://``, in that order.

    :param record: link to clean; a falsy value yields ``None``.
    :return: the normalized link, or ``None`` when there was no input.
    """
    if record:
        cleaned = record
        # Order matters only in that it mirrors a chained sequence of replaces.
        for old, new in ((":443", ""), (" ", "%20"), ("http://", "https://")):
            cleaned = cleaned.replace(old, new)
        return cleaned
    return None
def sanitize_link_fname(record: str | None) -> str | None:
    """Extract the file name (last path component) from a URL.

    The query string and fragment are ignored because only the parsed URL
    path is inspected.

    :param record: URL to inspect; a falsy value yields ``None``.
    :return: final component of the URL path, or ``None`` when there was no input.
    """
    if record:
        url_path = urlparse(record).path
        return Path(url_path).name
    return None
def sanitize_cc_link(record: str | None) -> str | None:
    """Normalize a link with ``sanitize_link`` and apply two extra clean-ups.

    The ``nfs/ccpfiles/`` segment is removed from the link, and a link that
    ends up being exactly the bare ``.../files/epfiles/`` directory URL is
    treated as missing.

    :param record: link to clean; may be ``None`` or empty.
    :return: the cleaned link, or ``None`` when it is empty or is the bare directory URL.
    """
    link = sanitize_link(record)
    if link:
        link = link.replace("nfs/ccpfiles/", "")
        if link != "https://www.commoncriteriaportal.org/files/epfiles/":
            return link
    return None
def sanitize_date(record: pd.Timestamp | date | np.datetime64) -> date | None:
    """Convert a date-like value to a ``datetime.date``.

    :param record: a pandas ``Timestamp``, a ``datetime.date`` (or subclass),
        a NumPy ``datetime64``, or any value ``pd.isnull`` reports as missing
        (``None``, ``NaN``, ``NaT``).
    :return: ``None`` for missing values; the calendar date for ``Timestamp``
        and ``datetime64`` inputs; ``date`` instances are returned unchanged.
    :raises ValueError: if ``record`` is none of the supported types.
    """
    if pd.isnull(record):
        return None
    if isinstance(record, pd.Timestamp):
        return record.date()
    if isinstance(record, np.datetime64):
        # The signature advertises datetime64 support; route it through
        # Timestamp so it yields a plain date instead of raising.
        return pd.Timestamp(record).date()
    # None never reaches this point (handled by pd.isnull above), so only
    # date needs to be checked here.
    if isinstance(record, date):
        return record
    raise ValueError("Unsupported type given as input")
def sanitize_string(record: str) -> str:
    """Unescape HTML entities and normalize whitespace in a string.

    Newlines are removed outright (not replaced by a space); all remaining
    runs of whitespace are collapsed to single spaces and the ends trimmed.

    :param record: raw string, possibly HTML-escaped.
    :return: the unescaped, whitespace-normalized string.
    """
    text = record
    # Two passes: at least one sample ('ATMEL Secure Microcontroller
    # AT90SC12872RCFT / AT90SC12836RCFT rev. I & J') is escaped twice.
    for _ in range(2):
        text = html.unescape(text)
    words = text.replace("\n", "").split()
    return " ".join(words)
def sanitize_security_levels(record: str | set[str]) -> set[str]:
    """Return the security levels in ``record`` minus a fixed set of ignored values.

    :param record: either a comma-separated string of levels or a set of levels.
    :return: the levels with ``"Basic"``, ``"ND-PP"``, ``"PP\\xa0Compliant"``,
        ``"None"``, ``"Medium"`` and the empty string removed.
    """
    ignored = {"Basic", "ND-PP", "PP\xa0Compliant", "None", "Medium", ""}
    levels = set(record.split(",")) if isinstance(record, str) else record
    return levels - ignored
def sanitize_protection_profiles(record: str) -> list:
    """Split a comma-separated string of protection profiles into a list.

    :param record: comma-separated string; a falsy value yields an empty list.
    :return: the parts of ``record`` split on ``","``, without further trimming.
    """
    return record.split(",") if record else []
|