diff options
| author | J08nY | 2023-04-14 20:46:12 +0200 |
|---|---|---|
| committer | J08nY | 2023-04-14 20:47:32 +0200 |
| commit | ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153 (patch) | |
| tree | 844e2f32d0896938b07ca020edfdb15c580ba411 /src | |
| parent | 89b3d880088b5c30fa10036f280e73b1c1aee05e (diff) | |
| download | sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.tar.gz sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.tar.zst sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.zip | |
Share code between FIPS and CC matching.
Diffstat (limited to 'src')
| -rw-r--r-- | src/sec_certs/dataset/cc_scheme.py | 4 | ||||
| -rw-r--r-- | src/sec_certs/model/cc_matching.py | 48 | ||||
| -rw-r--r-- | src/sec_certs/model/fips_matching.py | 41 | ||||
| -rw-r--r-- | src/sec_certs/model/matching.py | 43 |
4 files changed, 70 insertions, 66 deletions
diff --git a/src/sec_certs/dataset/cc_scheme.py b/src/sec_certs/dataset/cc_scheme.py index 49c059d8..64f0eb9b 100644 --- a/src/sec_certs/dataset/cc_scheme.py +++ b/src/sec_certs/dataset/cc_scheme.py @@ -59,7 +59,7 @@ class CCSchemeDataset: :return: The entries. """ soup = CCSchemeDataset._get_page(constants.CC_AUSTRALIA_INEVAL_URL) - header = soup.find("h2", text="Products in evaluation") + header = soup.find("h2", string="Products in evaluation") table = header.find_next_sibling("table") results = [] for tr in tqdm(table.find_all("tr"), desc="Get AU scheme in evaluation."): @@ -983,7 +983,7 @@ class CCSchemeDataset: v["product"] = value elif "Common Criteria" in title: v["cc_version"] = value - elif "Date of Certification" in title or "Date issued": + elif "Date of Certification" in title or "Date issued" in title: v["certification_date"] = value elif "EvaluationAssurance Level" in title: v["assurance_level"] = value diff --git a/src/sec_certs/model/cc_matching.py b/src/sec_certs/model/cc_matching.py index 9c081ed2..a4c9c2d5 100644 --- a/src/sec_certs/model/cc_matching.py +++ b/src/sec_certs/model/cc_matching.py @@ -1,22 +1,18 @@ from __future__ import annotations -import typing -from heapq import heappop, heappush from typing import Any, Mapping from rapidfuzz import fuzz from sec_certs.configuration import config - -if typing.TYPE_CHECKING: - from sec_certs.dataset.cc import CCDataset - from sec_certs.sample.cc import CCCertificate - +from sec_certs.dataset.cc import CCDataset +from sec_certs.model.matching import AbstractMatcher +from sec_certs.sample.cc import CCCertificate from sec_certs.sample.cc_certificate_id import CertificateId from sec_certs.utils.strings import fully_sanitize_string -class CCSchemeMatcher: +class CCSchemeMatcher(AbstractMatcher[CCCertificate, CCDataset]): """ A heuristic matcher between entries on CC scheme websites (see CCSchemeDataset) and CC certificates from the Common Criteria portal (as in CCDataset). @@ -73,6 +69,7 @@ class CCSchemeMatcher: cert_manufacturer = fully_sanitize_string(cert.manufacturer) if self._product == cert.name and self._vendor == cert.manufacturer: return 99 + # TODO: Add matching based on document hashes: cert_hash, report_hash, target_hash. product_ratings = [ fuzz.token_set_ratio(self._product, cert_name), @@ -85,30 +82,15 @@ class CCSchemeMatcher: return max((0, max(product_ratings) * 0.5 + max(vendor_ratings) * 0.5 - 2)) @classmethod - def match_all(cls, entries: list[dict[str, Any]], scheme: str, dset: CCDataset): + def match_all(cls, entries: list[dict[str, Any]], scheme: str, dset: CCDataset) -> dict[str, Mapping]: + """ + Match all entries of a given CC scheme to certificates from the dataset. + + :param entries: The entries from the scheme, obtained from CCSchemeDataset. + :param scheme: The scheme, e.g. "DE". + :param dset: The dataset to match against. + :return: A mapping of certificate digests to entries, without duplicates, not all entries may be present. + """ certs: list[CCCertificate] = list(filter(lambda cert: cert.scheme == scheme, dset)) matchers = [CCSchemeMatcher(entry, scheme) for entry in entries] - scores: list[tuple[float, int, int]] = [] - matched_is: set[int] = set() - matched_js: set[int] = set() - for i, cert in enumerate(certs): - for j, matcher in enumerate(matchers): - score = matcher.match(cert) - triple = (100 - score, i, j) - heappush(scores, triple) - results = {} - for triple in (heappop(scores) for _ in range(len(scores))): - inv_score, i, j = triple - # Do not match already matched entries/certs. - if i in matched_is or j in matched_js: - continue - # Compute the actual score from the inverse. - score = 100 - inv_score - # Do not match if we are below threshold, all the following will be as well. - if score < config.cc_matching_threshold: - break - # Match cert dgst to entry - cert = certs[i] - entry = matchers[j].entry - results[cert.dgst] = entry - return results + return cls._match_all(matchers, certs, config.cc_matching_threshold) diff --git a/src/sec_certs/model/fips_matching.py b/src/sec_certs/model/fips_matching.py index f73067b3..e4f9fbc5 100644 --- a/src/sec_certs/model/fips_matching.py +++ b/src/sec_certs/model/fips_matching.py @@ -1,22 +1,22 @@ from __future__ import annotations import typing -from operator import itemgetter -from typing import Mapping, MutableMapping +from typing import Mapping from rapidfuzz import fuzz from sec_certs.configuration import config +from sec_certs.dataset.fips import FIPSDataset +from sec_certs.model.matching import AbstractMatcher +from sec_certs.sample.fips import FIPSCertificate from sec_certs.utils.strings import fully_sanitize_string if typing.TYPE_CHECKING: - from sec_certs.dataset.fips import FIPSDataset - from sec_certs.sample.fips import FIPSCertificate from sec_certs.sample.fips_iut import IUTEntry, IUTSnapshot from sec_certs.sample.fips_mip import MIPEntry, MIPSnapshot -class FIPSProcessMatcher: +class FIPSProcessMatcher(AbstractMatcher[FIPSCertificate, FIPSDataset]): """ A heuristic matcher between entries on the FIPS IUT/MIP lists and the FIPS certificates. @@ -64,31 +64,10 @@ class FIPSProcessMatcher: """ Match a whole snapshot of IUT/MIP entries to a FIPS certificate dataset. - Duplicates may occur. - :param snapshot: The snapshot to match the entries of. - :param dset: The dataset tot match to. - :return: The matching. + :param dset: The dataset to match to. + :return: A mapping of certificate digests to entries, without duplicates, not all entries may be present. """ - matches: MutableMapping[IUTEntry | MIPEntry, FIPSCertificate | None] = {} - for entry in snapshot: - matcher = FIPSProcessMatcher(entry) - scores = sorted(((matcher.match(cert), cert) for cert in dset), key=itemgetter(0), reverse=True) - found = False - for score, cert in scores: - if score < config.fips_matching_threshold: - break - validations = cert.web_data.validation_history - if not validations: - continue - for validation in validations: - if validation.date >= snapshot.timestamp.date(): - # It could be this cert, so take it - found = True - matches[entry] = cert - break - if found: - break - else: - matches[entry] = None - return matches + certs: list[FIPSCertificate] = list(dset) + matchers = [FIPSProcessMatcher(entry) for entry in snapshot] + return cls._match_all(matchers, certs, config.fips_matching_threshold) diff --git a/src/sec_certs/model/matching.py b/src/sec_certs/model/matching.py new file mode 100644 index 00000000..687a5888 --- /dev/null +++ b/src/sec_certs/model/matching.py @@ -0,0 +1,43 @@ +import typing +from abc import ABC, abstractmethod +from heapq import heappop, heappush +from typing import Generic + +from sec_certs.dataset.dataset import Dataset +from sec_certs.sample.certificate import Certificate + +CertSubType = typing.TypeVar("CertSubType", bound=Certificate) +DatasetSubType = typing.TypeVar("DatasetSubType", bound=Dataset) + + +class AbstractMatcher(Generic[CertSubType, DatasetSubType], ABC): + @abstractmethod + def match(self, cert: CertSubType) -> float: + raise NotImplementedError + + @staticmethod + def _match_all(matchers, certs, threshold): + scores: list[tuple[float, int, int]] = [] + matched_is: set[int] = set() + matched_js: set[int] = set() + for i, cert in enumerate(certs): + for j, matcher in enumerate(matchers): + score = matcher.match(cert) + triple = (100 - score, i, j) + heappush(scores, triple) + results = {} + for triple in (heappop(scores) for _ in range(len(scores))): + inv_score, i, j = triple + # Do not match already matched entries/certs. + if i in matched_is or j in matched_js: + continue + # Compute the actual score from the inverse. + score = 100 - inv_score + # Do not match if we are below threshold, all the following will be as well. + if score < threshold: + break + # Match cert dgst to entry + cert = certs[i] + entry = matchers[j].entry + results[cert.dgst] = entry + return results |
