Share code between FIPS and CC matching.

author: J08nY 2023-04-14 20:46:12 +0200
committer: J08nY 2023-04-14 20:47:32 +0200
commit: ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153 (patch)
tree: 844e2f32d0896938b07ca020edfdb15c580ba411 /src
parent: 89b3d880088b5c30fa10036f280e73b1c1aee05e (diff)
download: sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.tar.gz
sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.tar.zst
sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.zip
4 files changed, 70 insertions, 66 deletions
diff --git a/src/sec_certs/dataset/cc_scheme.py b/src/sec_certs/dataset/cc_scheme.py
index 49c059d8..64f0eb9b 100644
--- a/src/sec_certs/dataset/cc_scheme.py
+++ b/src/sec_certs/dataset/cc_scheme.py
@@ -59,7 +59,7 @@ class CCSchemeDataset:
         :return: The entries.
         """
         soup = CCSchemeDataset._get_page(constants.CC_AUSTRALIA_INEVAL_URL)
-        header = soup.find("h2", text="Products in evaluation")
+        header = soup.find("h2", string="Products in evaluation")
         table = header.find_next_sibling("table")
         results = []
         for tr in tqdm(table.find_all("tr"), desc="Get AU scheme in evaluation."):
@@ -983,7 +983,7 @@ class CCSchemeDataset:
                             v["product"] = value
                         elif "Common Criteria" in title:
                             v["cc_version"] = value
-                        elif "Date of Certification" in title or "Date issued":
+                        elif "Date of Certification" in title or "Date issued" in title:
                             v["certification_date"] = value
                         elif "EvaluationAssurance Level" in title:
                             v["assurance_level"] = value
diff --git a/src/sec_certs/model/cc_matching.py b/src/sec_certs/model/cc_matching.py
index 9c081ed2..a4c9c2d5 100644
--- a/src/sec_certs/model/cc_matching.py
+++ b/src/sec_certs/model/cc_matching.py
@@ -1,22 +1,18 @@
 from __future__ import annotations
 
-import typing
-from heapq import heappop, heappush
 from typing import Any, Mapping
 
 from rapidfuzz import fuzz
 
 from sec_certs.configuration import config
-
-if typing.TYPE_CHECKING:
-    from sec_certs.dataset.cc import CCDataset
-    from sec_certs.sample.cc import CCCertificate
-
+from sec_certs.dataset.cc import CCDataset
+from sec_certs.model.matching import AbstractMatcher
+from sec_certs.sample.cc import CCCertificate
 from sec_certs.sample.cc_certificate_id import CertificateId
 from sec_certs.utils.strings import fully_sanitize_string
 
 
-class CCSchemeMatcher:
+class CCSchemeMatcher(AbstractMatcher[CCCertificate, CCDataset]):
     """
     A heuristic matcher between entries on CC scheme websites (see CCSchemeDataset) and
     CC certificates from the Common Criteria portal (as in CCDataset).
@@ -73,6 +69,7 @@ class CCSchemeMatcher:
         cert_manufacturer = fully_sanitize_string(cert.manufacturer)
         if self._product == cert.name and self._vendor == cert.manufacturer:
             return 99
+        # TODO: Add matching based on document hashes: cert_hash, report_hash, target_hash.
 
         product_ratings = [
             fuzz.token_set_ratio(self._product, cert_name),
@@ -85,30 +82,15 @@ class CCSchemeMatcher:
         return max((0, max(product_ratings) * 0.5 + max(vendor_ratings) * 0.5 - 2))
 
     @classmethod
-    def match_all(cls, entries: list[dict[str, Any]], scheme: str, dset: CCDataset):
+    def match_all(cls, entries: list[dict[str, Any]], scheme: str, dset: CCDataset) -> dict[str, Mapping]:
+        """
+        Match all entries of a given CC scheme to certificates from the dataset.
+
+        :param entries: The entries from the scheme, obtained from CCSchemeDataset.
+        :param scheme: The scheme, e.g. "DE".
+        :param dset: The dataset to match against.
+        :return: A mapping of certificate digests to entries, without duplicates, not all entries may be present.
+        """
         certs: list[CCCertificate] = list(filter(lambda cert: cert.scheme == scheme, dset))
         matchers = [CCSchemeMatcher(entry, scheme) for entry in entries]
-        scores: list[tuple[float, int, int]] = []
-        matched_is: set[int] = set()
-        matched_js: set[int] = set()
-        for i, cert in enumerate(certs):
-            for j, matcher in enumerate(matchers):
-                score = matcher.match(cert)
-                triple = (100 - score, i, j)
-                heappush(scores, triple)
-        results = {}
-        for triple in (heappop(scores) for _ in range(len(scores))):
-            inv_score, i, j = triple
-            # Do not match already matched entries/certs.
-            if i in matched_is or j in matched_js:
-                continue
-            # Compute the actual score from the inverse.
-            score = 100 - inv_score
-            # Do not match if we are below threshold, all the following will be as well.
-            if score < config.cc_matching_threshold:
-                break
-            # Match cert dgst to entry
-            cert = certs[i]
-            entry = matchers[j].entry
-            results[cert.dgst] = entry
-        return results
+        return cls._match_all(matchers, certs, config.cc_matching_threshold)
diff --git a/src/sec_certs/model/fips_matching.py b/src/sec_certs/model/fips_matching.py
index f73067b3..e4f9fbc5 100644
--- a/src/sec_certs/model/fips_matching.py
+++ b/src/sec_certs/model/fips_matching.py
@@ -1,22 +1,22 @@
 from __future__ import annotations
 
 import typing
-from operator import itemgetter
-from typing import Mapping, MutableMapping
+from typing import Mapping
 
 from rapidfuzz import fuzz
 
 from sec_certs.configuration import config
+from sec_certs.dataset.fips import FIPSDataset
+from sec_certs.model.matching import AbstractMatcher
+from sec_certs.sample.fips import FIPSCertificate
 from sec_certs.utils.strings import fully_sanitize_string
 
 if typing.TYPE_CHECKING:
-    from sec_certs.dataset.fips import FIPSDataset
-    from sec_certs.sample.fips import FIPSCertificate
     from sec_certs.sample.fips_iut import IUTEntry, IUTSnapshot
     from sec_certs.sample.fips_mip import MIPEntry, MIPSnapshot
 
 
-class FIPSProcessMatcher:
+class FIPSProcessMatcher(AbstractMatcher[FIPSCertificate, FIPSDataset]):
     """
     A heuristic matcher between entries on the FIPS IUT/MIP lists and
     the FIPS certificates.
@@ -64,31 +64,10 @@ class FIPSProcessMatcher:
         """
         Match a whole snapshot of IUT/MIP entries to a FIPS certificate dataset.
 
-        Duplicates may occur.
-
         :param snapshot: The snapshot to match the entries of.
-        :param dset: The dataset tot match to.
-        :return: The matching.
+        :param dset: The dataset to match to.
+        :return: A mapping of certificate digests to entries, without duplicates, not all entries may be present.
         """
-        matches: MutableMapping[IUTEntry | MIPEntry, FIPSCertificate | None] = {}
-        for entry in snapshot:
-            matcher = FIPSProcessMatcher(entry)
-            scores = sorted(((matcher.match(cert), cert) for cert in dset), key=itemgetter(0), reverse=True)
-            found = False
-            for score, cert in scores:
-                if score < config.fips_matching_threshold:
-                    break
-                validations = cert.web_data.validation_history
-                if not validations:
-                    continue
-                for validation in validations:
-                    if validation.date >= snapshot.timestamp.date():
-                        # It could be this cert, so take it
-                        found = True
-                        matches[entry] = cert
-                        break
-                if found:
-                    break
-                else:
-                    matches[entry] = None
-        return matches
+        certs: list[FIPSCertificate] = list(dset)
+        matchers = [FIPSProcessMatcher(entry) for entry in snapshot]
+        return cls._match_all(matchers, certs, config.fips_matching_threshold)
diff --git a/src/sec_certs/model/matching.py b/src/sec_certs/model/matching.py
new file mode 100644
index 00000000..687a5888
--- /dev/null
+++ b/src/sec_certs/model/matching.py
@@ -0,0 +1,71 @@
+import typing
+from abc import ABC, abstractmethod
+from heapq import heappop, heappush
+from typing import Generic
+
+from sec_certs.dataset.dataset import Dataset
+from sec_certs.sample.certificate import Certificate
+
+CertSubType = typing.TypeVar("CertSubType", bound=Certificate)
+DatasetSubType = typing.TypeVar("DatasetSubType", bound=Dataset)
+
+
+class AbstractMatcher(Generic[CertSubType, DatasetSubType], ABC):
+    """
+    Base class for heuristic matchers that score how well an entry corresponds to a certificate.
+
+    Subclasses implement `match` and must expose the wrapped entry as the `entry` attribute,
+    which `_match_all` reads when building its result.
+    """
+
+    @abstractmethod
+    def match(self, cert: CertSubType) -> float:
+        """
+        Compute how well this matcher's entry matches a certificate.
+
+        :param cert: The certificate to score.
+        :return: The match score, higher is better; `_match_all` inverts it as `100 - score`.
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def _match_all(matchers, certs, threshold):
+        """
+        Greedily pair certificates with matchers, taking the best scoring pairs first.
+
+        Each certificate and each matcher ends up in at most one pair.
+
+        :param matchers: The matchers, indexable and each exposing `match` and `entry`.
+        :param certs: The certificates to match against, indexable and each exposing `dgst`.
+        :param threshold: The minimum score a pair needs to be accepted.
+        :return: A mapping of certificate digests to entries, without duplicates, not all entries may be present.
+        """
+        scores: list[tuple[float, int, int]] = []
+        matched_is: set[int] = set()
+        matched_js: set[int] = set()
+        for i, cert in enumerate(certs):
+            for j, matcher in enumerate(matchers):
+                score = matcher.match(cert)
+                # heapq is a min-heap, so push the inverted score to pop the best pair first.
+                triple = (100 - score, i, j)
+                heappush(scores, triple)
+        results = {}
+        while scores:
+            inv_score, i, j = heappop(scores)
+            # Do not match already matched entries/certs.
+            if i in matched_is or j in matched_js:
+                continue
+            # Compute the actual score from the inverse.
+            score = 100 - inv_score
+            # Do not match if we are below threshold, all the following will be as well.
+            if score < threshold:
+                break
+            # Mark both sides as used; otherwise a weaker pair popped later would
+            # overwrite this match or reuse the same entry for another cert.
+            matched_is.add(i)
+            matched_js.add(j)
+            # Match cert dgst to entry
+            cert = certs[i]
+            entry = matchers[j].entry
+            results[cert.dgst] = entry
+        return results
author	J08nY	2023-04-14 20:46:12 +0200
committer	J08nY	2023-04-14 20:47:32 +0200
commit	ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153 (patch)
tree	844e2f32d0896938b07ca020edfdb15c580ba411 /src
parent	89b3d880088b5c30fa10036f280e73b1c1aee05e (diff)
download	sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.tar.gz sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.tar.zst sec-certs-ae2bb3247521aaf0930fac70e1f3f1cb2ffbc153.zip