{ "cells": [ { "cell_type": "markdown", "id": "10337a8316f35aeb", "metadata": {}, "source": [ "# Scheme data matching evaluation\n", "This notebook evaluates the performance of matching the data extracted from scheme websites to data from the commoncriteriaportal." ] }, { "cell_type": "code", "execution_count": null, "id": "initial_id", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pprint import pprint\n", "from tqdm.auto import trange, tqdm\n", "\n", "from sec_certs.dataset import CCDataset, CCSchemeDataset\n", "from sec_certs.model import CCSchemeMatcher\n", "from sec_certs.sample.cc_certificate_id import canonicalize\n", "from sec_certs.sample.cc_scheme import CCScheme, EntryType\n", "from sec_certs.configuration import config\n", "from sec_certs.dataset.auxiliary_dataset_handling import CCSchemeDatasetHandler" ] }, { "cell_type": "code", "execution_count": null, "id": "1fccbff4e5cee78a", "metadata": {}, "outputs": [], "source": [ "dset = CCDataset.from_json(\"../../dset.json\")" ] }, { "cell_type": "code", "execution_count": null, "id": "2d711cd0bebf1daa", "metadata": {}, "outputs": [], "source": [ "schemes = CCSchemeDataset.from_json(\"../../schemes_new.json\")\n", "#schemes = CCSchemeDataset.from_web(enhanced=True)\n", "#schemes.to_json(\"../../schemes_new.json\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b5470ec4719da0d9", "metadata": {}, "outputs": [], "source": [ "dset.aux_handlers[CCSchemeDatasetHandler].dset = schemes\n", "\n", "count_was = 0\n", "count_is = 0\n", "for cert in dset:\n", " if cert.heuristics.scheme_data is not None:\n", " count_was += 1\n", " cert.heuristics.old_scheme_data = cert.heuristics.scheme_data\n", " cert.heuristics.scheme_data = None\n", "dset._compute_scheme_data()\n", "for cert in dset:\n", " if cert.heuristics.scheme_data is not None:\n", " count_is += 1\n", "print(count_was, count_is)" ] }, { "cell_type": "code", "execution_count": null, "id": "4d6b6db4956b4774", "metadata": {}, "outputs": [], "source": [ "def build_df(dset):\n", " df = pd.DataFrame([(cert.scheme, cert.name, cert.manufacturer, cert.status, cert.heuristics.cert_id, cert.not_valid_before, cert.heuristics.scheme_data) for cert in dset],\n", " columns=[\"scheme\", \"name\", \"vendor\", \"status\", \"cert_id\", \"cert_date\", \"scheme_data\"])\n", " df[\"scheme_cert_id\"] = df[\"scheme_data\"].map(lambda data: (data.get(\"cert_id\") or data.get(\"enhanced\", {}).get(\"cert_id\")) if data else None)\n", " def try_canonicalize(cert_id, scheme):\n", " try:\n", " return canonicalize(cert_id, scheme)\n", " except:\n", " return None\n", " df[\"scheme_cert_id_canonical\"] = df.apply(lambda x: try_canonicalize(x[\"scheme_cert_id\"], x[\"scheme\"]), axis=1)\n", " def get_from_entry(entry, *keys: str):\n", " if e := entry.get(\"enhanced\"):\n", " for key in keys:\n", " if val := e.get(key):\n", " return val\n", " for key in keys:\n", " if val := entry.get(key):\n", " return val\n", " return None\n", " df[\"scheme_cert_date\"] = df[\"scheme_data\"].map(lambda data: get_from_entry(data, \"certification_date\") if data else None)\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "id": "89cbf34713ce6c6a", "metadata": {}, "outputs": [], "source": [ "df = build_df(dset)" ] }, { "cell_type": "markdown", "id": "8a7976ae31969150", "metadata": {}, "source": [ "## Evaluate all schemes" ] }, { "cell_type": "markdown", "id": "f2e5c8d5-e08a-4fb9-919a-4de0718f5de5", "metadata": {}, "source": [ "Let's look at how the threshold setting changes the match rate." ] }, { "cell_type": "code", "execution_count": null, "id": "e8a7140f3738166f", "metadata": { "scrolled": true }, "outputs": [], "source": [ "original_threshold = config.cc_matching_threshold\n", "thresholds = list(range(100, -10, -10))\n", "rates = {}\n", "dfs = {}\n", "for threshold in tqdm(thresholds):\n", " config.cc_matching_threshold = threshold\n", " for cert in dset:\n", " cert.heuristics.scheme_data = None\n", " dset._compute_scheme_data()\n", " count = 0\n", " for cert in dset:\n", " if cert.heuristics.scheme_data is not None:\n", " count += 1\n", " print(f\"Threshold: {threshold}\")\n", " print(f\"Assigned count: {count}\")\n", " df = build_df(dset)\n", " dfs[threshold] = df\n", " for scheme in schemes:\n", " country = scheme.country\n", " total = df[df[\"scheme\"] == country]\n", " assigned = df[(df[\"scheme\"] == country) & df[\"scheme_data\"].notnull()]\n", " rate = len(assigned)/len(total) * 100 if len(total) != 0 else 0\n", " rate_list = rates.setdefault(country, [])\n", " rate_list.append(rate)\n", "\n", " print(f\"{country}: {len(assigned)} assigned out of {len(total)} -> {rate:.1f}%\")\n", " total_active = total[total[\"status\"] == \"active\"]\n", " assigned_active = assigned[assigned[\"status\"] == \"active\"]\n", " print(f\"\\t- active: {len(assigned_active)} out of {len(total_active)}, entries: {len(scheme.lists.get(EntryType.Certified, []))}\")\n", " total_archived = total[total[\"status\"] == \"archived\"]\n", " assigned_archived = assigned[assigned[\"status\"] == \"archived\"]\n", " print(f\"\\t- archived: {len(assigned_archived)} out of {len(total_archived)}, entries: {len(scheme.lists.get(EntryType.Archived, []))}\")\n", " print()\n", "\n", "config.cc_matching_threshold = original_threshold" ] }, { "cell_type": "code", "execution_count": null, "id": "05ef5991-1b3f-4fd3-9f4b-808ddb51a89f", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "from matplotlib import pyplot as plt\n", "from itertools import cycle\n", "\n", "lines = [\"-\",\"--\",\"-.\",\":\"]\n", "linecycler = cycle(lines)\n", "\n", "fig, ax = plt.subplots(figsize=(12,4))\n", "for scheme in schemes:\n", " ax.plot(thresholds, rates[scheme.country], next(linecycler), label=scheme.country)\n", "ax.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\");" ] }, { "cell_type": "markdown", "id": "62df8d488e204ac2", "metadata": {}, "source": [ "## Evaluate a scheme" ] }, { "cell_type": "code", "execution_count": null, "id": "ab0d21164906fe3f", "metadata": {}, "outputs": [], "source": [ "scheme = \"DE\"\n", "threshold = 70\n", "df = dfs[threshold]\n", "df[df[\"scheme\"] == scheme].sample(10)" ] }, { "cell_type": "code", "execution_count": null, "id": "709598fe26cc4371", "metadata": { "scrolled": true }, "outputs": [], "source": [ "un_df = pd.DataFrame(schemes[scheme].lists[EntryType.Certified])\n", "un_df" ] }, { "cell_type": "code", "execution_count": null, "id": "11c70f74ce9c4d56", "metadata": {}, "outputs": [], "source": [ "sd = list(df[\"scheme_data\"])\n", "unmatched_certs = [cert for cert in dset if cert.scheme == scheme and cert.heuristics.scheme_data is None and cert.status == \"active\"]\n", "unmatched_entries = [entry for entry in schemes[scheme].lists[EntryType.Certified] if entry not in sd]\n", "matches = CCSchemeMatcher.match_all(unmatched_entries, scheme, unmatched_certs)\n", "matches" ] }, { "cell_type": "code", "execution_count": null, "id": "f5fa79176664dac8", "metadata": {}, "outputs": [], "source": [ "pd.DataFrame([cert.pandas_tuple[:5] for cert in unmatched_certs])" ] }, { "cell_type": "code", "execution_count": null, "id": "95a6e048e53601c7", "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(unmatched_entries)" ] }, { "cell_type": "code", "execution_count": null, "id": "ff67f13e-bf99-4ba5-bef0-37c85dd3e2c8", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }