{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vulnerability analysis\n",
    "\n",
    "This notebook contains analysis of vulnerabilities in Common Criteria certificates. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "from sec_certs.cert_rules import cc_rules\n",
    "\n",
    "sns.set_style(\"whitegrid\")\n",
    "sns.set_palette(\"deep\")\n",
    "sns.set_context(\"notebook\")  # Set to \"paper\" for use in paper :)\n",
    "\n",
    "import itertools\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "import tempfile\n",
    "from sec_certs.dataset.auxiliary_dataset_handling import CVEDatasetHandler, CPEDatasetHandler, CCMaintenanceUpdateDatasetHandler\n",
    "from sec_certs.dataset import CCDataset, CCDatasetMaintenanceUpdates, CVEDataset, CPEDataset\n",
    "from sec_certs.utils.pandas import (\n",
    "    compute_cve_correlations,\n",
    "    compute_maintenances_that_come_after_vulns,\n",
    "    discover_sar_families,\n",
    "    expand_df_with_cve_cols,\n",
    "    filter_to_cves_within_validity_period,\n",
    "    get_sar_level_from_set,\n",
    "    get_top_n_cwes,\n",
    "    move_fixing_mu_to_directory,\n",
    "    prepare_cwe_df,\n",
    ")\n",
    "\n",
    "warnings.simplefilter(action=\"ignore\", category=pd.errors.PerformanceWarning)\n",
    "\n",
    "RESULTS_DIR = Path(\"./results\")\n",
    "RESULTS_DIR.mkdir(exist_ok=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retrieve necessary objects from sec-certs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local instantiation\n",
    "\n",
    "print(\"Loading CCDataset\")\n",
    "dset: CCDataset = CCDataset.from_json(\"/path/to/cc_dset.json\")\n",
    "\n",
    "print(\"Loading CCDatasetMaintenanceUpdates\")\n",
    "main_dset: CCDatasetMaintenanceUpdates = CCDatasetMaintenanceUpdates.from_json(\"/path/to/maintenance_updates.json\")\n",
    "\n",
    "print(\"Loading CVEDataset\")\n",
    "cve_dset: CVEDataset = CVEDataset.from_json(\"/path/to/cve_dataset.json\")\n",
    "\n",
    "print(\"Loading CPEDataset\")\n",
    "cpe_dset: CPEDataset = CPEDataset.from_json(\"/path/to/cpe_dataset.json\")\n",
    "\n",
    "# # Remote instantiation (takes approx. 10 minutes to complete)\n",
    "# dset: CCDataset = CCDataset.from_web(path=\"dset\", auxiliary_datasets=True)\n",
    "# dset.load_auxiliary_datasets()\n",
    "\n",
    "# main_dset: CCDatasetMaintenanceUpdates = dset.aux_handlers[CCMaintenanceUpdateDatasetHandler].dset\n",
    "# cpe_dset: CPEDataset = dset.aux_handlers[CPEDatasetHandler].dset\n",
    "# cve_dset: CVEDataset = dset.aux_handlers[CVEDatasetHandler].dset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset preprocessing\n",
    "\n",
    "- Loads `CCDataset` into pandas DataFrame\n",
    "- Expands it with various maintenance-related columns\n",
    "- Expands it with various CVE-related columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "df = dset.to_pandas()\n",
    "\n",
    "if \"n_maintenances\" not in df.columns:\n",
    "    n_maintenances = main_dset.get_n_maintenances_df()\n",
    "    df = pd.concat([df, n_maintenances], axis=\"columns\")\n",
    "\n",
    "if \"maintenance_dates\" not in df.columns:\n",
    "    main_dates = main_dset.get_maintenance_dates_df()\n",
    "    df = pd.concat([df, main_dates], axis=\"columns\")\n",
    "\n",
    "# Expand DataFrame with CVEs that affect some certificate\n",
    "cves = list(itertools.chain.from_iterable(x.heuristics.related_cves for x in dset if x.heuristics.related_cves))\n",
    "cve_dict = {x: cve_dset[x] for x in cves}\n",
    "cve_dset.cves = cve_dict  # Limit cve_dset to CVEs relevant to some certificate\n",
    "\n",
    "df = expand_df_with_cve_cols(df, cve_dset)\n",
    "\n",
    "df = df.loc[(df.not_valid_before.notnull()) & (df.not_valid_after.notnull())]\n",
    "\n",
    "df_cves_within_validity_period = filter_to_cves_within_validity_period(\n",
    "    df.copy(), cve_dset\n",
    ")\n",
    "df_cves_within_validity_period = expand_df_with_cve_cols(df_cves_within_validity_period, cve_dset)\n",
    "\n",
    "df_cpe_rich = df.loc[~df.cpe_matches.isnull()].copy()\n",
    "df_cve_rich = df.loc[df.n_cves > 0].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take a look at columns related to CVEs\n",
    "df.loc[\n",
    "    ~df.related_cves.isna(), [\"related_cves\", \"cve_published_dates\", \"earliest_cve\", \"worst_cve_score\", \"avg_cve_score\"]\n",
    "]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plots\n",
    "\n",
    "- distribution of categories in CPE-rich vs. all certificates\n",
    "- distribution of years in CPE-rich, CVE-rich, and all certificates\n",
    "- distribution of security levels in CPE-rich, CVE-rich and all certificates\n",
    "- distribution of most common (top 20) vendors in CPE-rich, CVE-rich and all certificates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# distribution of categories in CPE-rich vs. all certificates\n",
    "categories_cpe = df_cpe_rich.category.value_counts().sort_index().rename(\"Category distribution CPE-rich\")\n",
    "categories_cve = df_cve_rich.category.value_counts().sort_index().rename(\"Category distribution CVE-rich\")\n",
    "categories_all = df.category.value_counts().sort_index().rename(\"Category distribution all\")\n",
    "\n",
    "categories_merged = pd.concat([categories_all, categories_cpe, categories_cve], axis=1)\n",
    "categories_merged = categories_merged.div(categories_merged.sum(axis=0), axis=1)\n",
    "\n",
    "categories_merged.plot.bar(\n",
    "    title=\"Categories (without smartcards) comparison between CPE-rich, CVE-rich and all certificates\"\n",
    ")\n",
    "plt.show()\n",
    "plt.savefig(RESULTS_DIR / \"categories.pdf\", bbox_inches=\"tight\")\n",
    "\n",
    "# distribution of years in CPE-rich, CVE-rich, and all certificates\n",
    "years_cpe = df_cpe_rich.year_from.value_counts().sort_index().rename(\"Year distribution CPE-rich\")\n",
    "years_cve = df_cve_rich.year_from.value_counts().sort_index().rename(\"Year distribution CVE-rich\")\n",
    "years_all = df.year_from.value_counts().sort_index().rename(\"Year distribution all certificates\")\n",
    "\n",
    "years_merged = pd.concat([years_all, years_cpe, years_cve], axis=1)\n",
    "years_merged.index.name = \"year_from\"\n",
    "years_merged = years_merged.loc[years_merged.index < 2022]\n",
    "years_merged = years_merged.div(years_merged.sum(axis=0), axis=1)\n",
    "years_merged.plot.line(title=\"Years comparision between CPE-rich, CVE-rich and all certificates\")\n",
    "\n",
    "plt.show()\n",
    "plt.savefig(RESULTS_DIR / \"cve_cpe_certs_time_evolution.pdf\", bbox_inches=\"tight\")\n",
    "\n",
    "# distribution of security levels in CPE-rich, CVE-rich and all certificates\n",
    "levels_cpe = df_cpe_rich.eal.value_counts().sort_index().rename(\"EAL distribution CPE-rich\")\n",
    "levels_cve = df_cve_rich.eal.value_counts().sort_index().rename(\"EAL distribution CVE-rich\")\n",
    "levels_all = df.eal.value_counts().sort_index().rename(\"EAL distribution all certificates\")\n",
    "\n",
    "levels_merged = pd.concat([levels_all, levels_cpe, levels_cve], axis=1)\n",
    "levels_merged = levels_merged.div(levels_merged.sum(axis=0), axis=1)\n",
    "levels_merged.plot.bar(title=\"EAL comparision between CPE-rich, CVE-rich and all certificates\")\n",
    "plt.show()\n",
    "plt.savefig(RESULTS_DIR / \"eal_distr_all_cpes_cves.pdf\", bbox_inches=\"tight\")\n",
    "\n",
    "# distribution of most common (top 20) vendors in CPE-rich, CVE-rich and all certificates\n",
    "top_manufacturers = list(df.manufacturer.value_counts().head(20).index)\n",
    "vendors_cpe = (\n",
    "    df_cpe_rich.loc[df_cpe_rich.manufacturer.isin(top_manufacturers), \"manufacturer\"]\n",
    "    .value_counts()\n",
    "    .sort_index()\n",
    "    .rename(\"Common vendors distribution CPE-rich\")\n",
    ")\n",
    "vendors_cve = (\n",
    "    df_cve_rich.loc[df_cve_rich.manufacturer.isin(top_manufacturers), \"manufacturer\"]\n",
    "    .value_counts()\n",
    "    .sort_index()\n",
    "    .rename(\"Common vendors distribution CVE-rich\")\n",
    ")\n",
    "vendors_all = (\n",
    "    df.loc[df.manufacturer.isin(top_manufacturers), \"manufacturer\"]\n",
    "    .value_counts()\n",
    "    .sort_index()\n",
    "    .rename(\"Common vendors distribution all certificates\")\n",
    ")\n",
    "\n",
    "vendors_merged = pd.concat([vendors_all, vendors_cpe, vendors_cve], axis=1)\n",
    "vendors_merged = vendors_merged.div(vendors_merged.sum(axis=0), axis=1)\n",
    "vendors_merged.plot.bar(title=\"Common vendors comparison between CPE-rich, CVE-rich and all certificates\")\n",
    "plt.show()\n",
    "plt.savefig(RESULTS_DIR / \"top_vendors_vulns.pdf\", bbox_inches=\"tight\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlations between (EAL, SAR) and CVEs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This limits analysis to SAR families that are popular-enough (>100 CVE-rich certs, second-most-popular value with >= 40 instances)\n",
    "cve_rich = df_cves_within_validity_period.loc[df_cves_within_validity_period.related_cves.notnull()].copy()\n",
    "families = discover_sar_families(cve_rich.extracted_sars)\n",
    "\n",
    "for family in tqdm(families):\n",
    "    cve_rich[family] = cve_rich.extracted_sars.map(lambda x: get_sar_level_from_set(x, family))\n",
    "\n",
    "sars = cve_rich[families].notnull().sum().to_frame(\"support\")\n",
    "sars.index.name = \"family\"\n",
    "sars[\"second_most_popular\"] = sars.index.map(\n",
    "    lambda x: 0 if not len(cve_rich[x].value_counts()) > 1 else cve_rich[x].value_counts().iloc[1]\n",
    ")\n",
    "sars = sars.loc[(sars.support >= 100) & (sars.second_most_popular >= 40)].sort_values(by=\"support\", ascending=False)\n",
    "sars[\"range\"] = sars.index.map(lambda x: len(cve_rich[x].value_counts()))\n",
    "\n",
    "df_corr = compute_cve_correlations(\n",
    "    df_cves_within_validity_period.loc[\n",
    "        df_cves_within_validity_period.category != \"ICs, Smart Cards and Smart Card-Related Devices and Systems\"\n",
    "    ],\n",
    "    sar_families=sars.index.tolist(),\n",
    "    filter_nans=False,\n",
    ")\n",
    "df_corr[\"range\"] = sars[\"range\"]\n",
    "df_corr.loc[\"eal\", \"range\"] = len(cve_rich.eal.cat.categories)\n",
    "df_corr = (\n",
    "    df_corr.sort_values(by=\"avg_cve_score_corr\")\n",
    "    .rename(index={\"eal\": \"EAL\"})\n",
    "    .drop(columns=[\"worst_cve_score_corr\", \"worst_cve_pvalue\"])\n",
    ")\n",
    "df_corr = pd.concat([df_corr.loc[\"EAL\", :].to_frame().transpose(), df_corr.drop(\"EAL\")])\n",
    "df_corr.range = df_corr.range.astype(\"int\")\n",
    "df_corr.support = df_corr.support.astype(\"int\")\n",
    "df_corr.index.name = \"EAL/SAR family\"\n",
    "df_corr.index = df_corr.index.map(lambda x: x.replace(\"_\", \"\\_\"))\n",
    "df_corr.to_csv(RESULTS_DIR / \"certs_without_smartcards_sar_cve_corr.csv\", float_format=\"%.2e\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cves_within_validity_period"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Time from CVE to maintenance update\n",
    "\n",
    "The following block computes time from CVE to maintenance update of a certificate. Though, it has some limitations:\n",
    "- So far we only watch for first CVE related to a certified product (though most of certificates have only one maintenance anyway, if they even have one)\n",
    "- We assume that a maintance update released *after* (by date comparison) CVE may be related to that vulnerability\n",
    "- The sample size is small. Only ~40 certificates have at least one CVE and at least one Maintenance update. Out of those, only ~30 certificates have maintenance update that comes after a CVE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_main_after_vuln = compute_maintenances_that_come_after_vulns(df_cves_within_validity_period)\n",
    "\n",
    "n_vuln_certs_wo_maintenances = df_cves_within_validity_period.loc[\n",
    "    (df_cves_within_validity_period.n_cves > 0) & (df_cves_within_validity_period.n_maintenances == 0)\n",
    "].shape[0]\n",
    "n_vuln_certs_w_maintenances = df_cves_within_validity_period.loc[\n",
    "    (df_cves_within_validity_period.n_cves > 0) & (df_cves_within_validity_period.n_maintenances > 0)\n",
    "].shape[0]\n",
    "n_certs_with_main_after_vuln = df_main_after_vuln.loc[df_main_after_vuln.earliest_maintenance_after_vuln.notna()].shape[\n",
    "    0\n",
    "]\n",
    "\n",
    "print(\n",
    "    f\"Number of certificates with >0 CVEs in validity period but 0 maintenance reports: {n_vuln_certs_wo_maintenances}\"\n",
    ")\n",
    "print(\n",
    "    f\"Number of certificates with >0 CVEs in validity period and >0 maintenance reports: {n_vuln_certs_w_maintenances}\"\n",
    ")\n",
    "print(f\"Number of vulnerable certificates with a maintenance report *after* CVE: {n_certs_with_main_after_vuln}\")\n",
    "\n",
    "updates_that_should_fix_vulns_path = RESULTS_DIR / \"updates_that_should_fix_vulns\"\n",
    "updates_that_should_fix_vulns_path.mkdir(exist_ok=True)\n",
    "mu_filenames = move_fixing_mu_to_directory(\n",
    "    df_main_after_vuln,\n",
    "    main_dset.to_pandas(),\n",
    "    updates_that_should_fix_vulns_path,\n",
    "    \"/Users/adam/phd/projects/certificates/sec-certs/datasets/cc_final_run_may_23/auxiliary_datasets/maintenances/reports/pdf\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_main_after_vuln.loc[\n",
    "    df_main_after_vuln.earliest_maintenance_after_vuln.notna(),\n",
    "    [\n",
    "        \"name\",\n",
    "        \"cpe_matches\",\n",
    "        \"related_cves\",\n",
    "        \"not_valid_before\",\n",
    "        \"not_valid_after\",\n",
    "        \"cve_published_dates\",\n",
    "        \"maintenance_dates\",\n",
    "        \"earliest_maintenance_after_vuln\",\n",
    "    ],\n",
    "].to_csv(RESULTS_DIR / \"vulns_before_mu.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_main_after_vuln.loc[\n",
    "    df_main_after_vuln.earliest_maintenance_after_vuln.notna(),\n",
    "    [\"maintenance_dates\", \"earliest_cve\", \"earliest_maintenance_after_vuln\"],\n",
    "]\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histogram timeline of new vulnerabilities\n",
    "\n",
    "Shows when vulnerabilities are announced in relation to the date of certification & date of certificate expiration.\n",
    "\n",
    "*Note*: Some certificates (especially the new ones) don't have their expiration date set yet. These are discarded from the analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "exploded_cves = (\n",
    "    df_cve_rich.explode(\"related_cves\")\n",
    "    .loc[:, [\"related_cves\", \"not_valid_before\", \"not_valid_after\"]]\n",
    "    .rename(columns={\"related_cves\": \"cve\"})\n",
    ")\n",
    "exploded_cves = exploded_cves.dropna()\n",
    "exploded_cves[\"cve_published_date\"] = exploded_cves.cve.map(lambda x: cve_dset[x].published_date)\n",
    "\n",
    "exploded_cves.not_valid_before = exploded_cves.not_valid_before.dt.normalize()\n",
    "exploded_cves.not_valid_after = exploded_cves.not_valid_after.dt.normalize()\n",
    "exploded_cves.cve_published_date = exploded_cves.cve_published_date.dt.tz_localize(None).dt.normalize()\n",
    "\n",
    "exploded_cves[\"n_days_after_certification\"] = (\n",
    "    exploded_cves.cve_published_date - exploded_cves.not_valid_before\n",
    ").dt.days\n",
    "exploded_cves[\"n_days_after_expiry\"] = (exploded_cves.cve_published_date - exploded_cves.not_valid_after).dt.days\n",
    "\n",
    "plt.rcParams[\"figure.figsize\"] = [12, 4]\n",
    "plt.rcParams[\"figure.autolayout\"] = True\n",
    "figure, axes = plt.subplots(1, 2)\n",
    "\n",
    "exploded_cves.to_csv(RESULTS_DIR / \"exploded_cves.csv\")\n",
    "\n",
    "hist = sns.histplot(exploded_cves.n_days_after_certification, kde=True, ax=axes[0])\n",
    "hist.set(\n",
    "    xlim=(-4000, 4600),\n",
    "    title=\"CVEs appearing n days after certification\",\n",
    "    xlabel=\"Number of days after date of certification\",\n",
    "    ylabel=\"Frequency of CVEs\",\n",
    ")\n",
    "hist.axvline(0, color=\"red\", linewidth=\"1\", label=\"Day of certification\")\n",
    "hist.legend(loc=\"upper right\")\n",
    "# plt.savefig(RESULTS_DIR / \"cves_n_days_after_certification.pdf\", bbox_inches='tight')\n",
    "# plt.show()\n",
    "\n",
    "hist = sns.histplot(exploded_cves.n_days_after_expiry, kde=True, ax=axes[1])\n",
    "hist.set(\n",
    "    xlim=(-6200, 4000),\n",
    "    title=\"CVEs appearing n days after certificate expiry date\",\n",
    "    xlabel=\"Number of days after certificate expiration\",\n",
    "    ylabel=\"Frequency of CVEs\",\n",
    ")\n",
    "hist.axvline(0, color=\"red\", linewidth=\"1\", label=\"Day of expiration\")\n",
    "hist.legend(loc=\"upper left\")\n",
    "plt.savefig(RESULTS_DIR / \"cves_vs_certificate_lifetime.pdf\", bbox_inches=\"tight\")\n",
    "plt.show()\n",
    "\n",
    "n_cves = exploded_cves.shape[0]\n",
    "ratio_before_cert = exploded_cves.loc[exploded_cves.n_days_after_certification <= 0].shape[0] / n_cves\n",
    "ratio_after_cert = exploded_cves.loc[exploded_cves.n_days_after_certification > 0].shape[0] / n_cves\n",
    "print(f\"Ratio of CVEs appearing before (or exactly on) certification date: {100 * ratio_before_cert:.2f}%\")\n",
    "print(f\"Ratio of CVEs appearing after certification date: {100 * ratio_after_cert:.2f}%\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How many CVE-rich certificates were revoked within <365 days after certification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "maybe_revoked = df_cve_rich.loc[\n",
    "    ~df_cve_rich.not_valid_after.isna(),\n",
    "    [\"not_valid_before\", \"not_valid_after\", \"n_cves\", \"worst_cve_score\", \"avg_cve_score\", \"related_cves\"],\n",
    "]\n",
    "maybe_revoked[\"validity_n_days\"] = (maybe_revoked.not_valid_after - maybe_revoked.not_valid_before).dt.days\n",
    "maybe_revoked = maybe_revoked.loc[maybe_revoked.validity_n_days < 365]\n",
    "print(f\"How many CVE-rich certificates were revoked in <365 days after certification: {maybe_revoked.shape[0]}\")\n",
    "\n",
    "df_w_validity_dates = df.loc[~df.not_valid_after.isna()].copy()\n",
    "df_w_validity_dates.loc[:, \"validity_n_days\"] = (\n",
    "    df_w_validity_dates.not_valid_after - df_w_validity_dates.not_valid_before\n",
    ").dt.days\n",
    "df_w_validity_dates = df_w_validity_dates.loc[df_w_validity_dates.validity_n_days < 365]\n",
    "print(f\"How many certificates were revoked in <365 days after certification: {df_w_validity_dates.shape[0]}\")\n",
    "print(\n",
    "    f\"How many certificates have expiry date same as certification date: {df_w_validity_dates.loc[df_w_validity_dates.not_valid_before == df_w_validity_dates.not_valid_after].shape[0]}\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w_validity_dates.loc[\n",
    "    :, [\"name\", \"not_valid_before\", \"not_valid_after\", \"validity_n_days\", \"related_cves\"]\n",
    "].sort_values(by=\"validity_n_days\", ascending=False)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scatterplot of CVE severity w.r.t. categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_cves_top_20_certs = df_cve_rich.n_cves.sort_values(ascending=False).head(20)[-1]\n",
    "cves = (\n",
    "    df_cve_rich.loc[\n",
    "        (df_cve_rich.year_from < 2022) & (df_cve_rich.n_cves < n_cves_top_20_certs) & (~df_cve_rich.eal.isna()),\n",
    "        [\"related_cves\", \"eal\", \"category\"],\n",
    "    ]\n",
    "    .explode(\"related_cves\")\n",
    "    .rename(columns={\"related_cves\": \"cve_id\"})\n",
    ")\n",
    "cves[\"published_date\"] = cves.cve_id.map(lambda x: cve_dset[x].published_date).dt.tz_localize(None).dt.normalize()\n",
    "cves[\"base_score\"] = cves.cve_id.map(lambda x: cve_dset[x].metrics.base_score)\n",
    "cves = cves.drop_duplicates()\n",
    "\n",
    "g = sns.relplot(\n",
    "    data=cves,\n",
    "    x=\"published_date\",\n",
    "    y=\"base_score\",\n",
    "    hue=\"eal\",\n",
    "    col=\"category\",\n",
    "    col_wrap=3,\n",
    "    height=4,\n",
    "    kind=\"scatter\",\n",
    "    facet_kws={\"sharey\": False, \"sharex\": False},\n",
    "    palette=\"viridis\",\n",
    ")\n",
    "g.set_titles(\"{col_name}\")\n",
    "g.set_xticklabels(rotation=60)\n",
    "g.set_axis_labels(\"Vulnerability publish date\", \"Vulnerability base score\")\n",
    "g.tight_layout()\n",
    "plt.savefig(RESULTS_DIR / \"scatter_category_vs_cves.pdf\", bbox_inches=\"tight\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histogram of vulnerability scores per category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cves = cves.reset_index()  # See https://github.com/mwaskom/seaborn/issues/2733\n",
    "g = sns.displot(\n",
    "    data=cves,\n",
    "    x=\"base_score\",\n",
    "    col=\"category\",\n",
    "    col_wrap=3,\n",
    "    height=3,\n",
    "    kind=\"hist\",\n",
    "    facet_kws={\"sharey\": False, \"sharex\": True},\n",
    ")\n",
    "g.set_titles(\"{col_name}\")\n",
    "# g.set_xticklabels(rotation=60)\n",
    "g.set_axis_labels(\"Vulnerability base score\", \"Frequency\")\n",
    "g.tight_layout()\n",
    "plt.savefig(RESULTS_DIR / \"hist_category_vs_cve.pdf\", bbox_inches=\"tight\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Protection-profiles vs CVEs\n",
    "\n",
    "Computes Pearson's correlation coefficient between number of CVEs and binary variable: Has a protection profile (1) or has not a protection profile (0)\n",
    "Beware, number of supporting certificates may be quite low"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cve_rich[\"has_pp\"] = df_cve_rich.protection_profiles.isna()\n",
    "corrs = df_cve_rich.groupby(\"category\")[[\"n_cves\", \"has_pp\"]].corr().iloc[0::2, -1].droplevel(level=1)\n",
    "corrs.name = \"correlation\"\n",
    "support_counts = df_cve_rich.groupby(\"category\").size()\n",
    "support_counts.name = \"n certs in category\"\n",
    "pp_rich_counts = df_cve_rich.loc[df_cve_rich.has_pp == True].groupby(\"category\").size()\n",
    "pp_rich_counts.name = \"n certs with PP\"\n",
    "pp_n_cves_corr = pd.concat([corrs, support_counts, pp_rich_counts], axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pp_n_cves_corr\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CWEs in vulnerable certificates\n",
    "\n",
    "- Study distribution of [CWEs](https://cwe.mitre.org/) in vulnerable certificates. \n",
    "- Experiment mainly with distribution across categories and security level. \n",
    "- Compare our results with findings from [open-source projects](https://dl.acm.org/doi/pdf/10.1145/3133956.3134072).\n",
    "- Limit only to vulns. that were disclosed in the validity period of a certificate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cc_df, cwe_df = prepare_cwe_df(\n",
    "    df_cves_within_validity_period.loc[df_cves_within_validity_period.n_cves > 0], cve_dset, fine_grained=True\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cc_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cwe_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print top-25 CWEs across dataset\n",
    "top_25_cwes = get_top_n_cwes(cc_df, cwe_df, n_cwes=25)\n",
    "top_25_cwes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store top-10 CWEs into CSVs, do this also per category\n",
    "CWE_DIR = RESULTS_DIR / \"cwes\"\n",
    "CWE_DIR.mkdir(exist_ok=True)\n",
    "top_25_cwes.to_csv(CWE_DIR / \"top_25_cwe_overall.csv\")\n",
    "\n",
    "for cat in cc_df.category.unique():\n",
    "    get_top_n_cwes(cc_df, cwe_df, category=cat).to_csv(CWE_DIR / (cat + \".csv\"))\n",
    "\n",
    "# Same for EALs\n",
    "for eal in cc_df.eal.unique():\n",
    "    get_top_n_cwes(cc_df, cwe_df, eal=eal).to_csv(CWE_DIR / (eal + \".csv\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For more coarse-grained description of CVEs, one could use so-called Security Fault Patterns (SFP) clusters.\n",
    "from sec_certs.utils.pandas import SFPModel\n",
    "\n",
    "model = SFPModel.from_web()\n",
    "model.search_cwe(626)  # <-- This returns Primary Cluster and Secondary cluster\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CWE evolution in time\n",
    "\n",
    "the following plot shows how top-10 overall CWEs evolve in time. As can be seen from the plot, the distribution is quite stable except for `CWE-119`. We assume that this can be caused by favoring different CWEs for similar problems, so it does not necessarilly correspond to certain weakness being eliminated recently in the coe, but even this may be the case."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cc_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cc_df[\"vuln_year\"] = cc_df.cve_id.map(lambda x: int(x.split(\"-\")[1]))\n",
    "cc_df = cc_df.loc[cc_df.vuln_year < 2022]\n",
    "top_10_cwes = set(get_top_n_cwes(cc_df, cwe_df, n_cwes=10).index.tolist())\n",
    "top_10_cwes = cc_df.loc[cc_df.cwe_id.isin(top_10_cwes)]\n",
    "pivot = (\n",
    "    top_10_cwes.groupby([\"vuln_year\", \"cwe_id\"], as_index=False)\n",
    "    .size()\n",
    "    .rename(columns={\"size\": \"frequency\"})\n",
    "    .pivot(\"vuln_year\", \"cwe_id\", \"frequency\")\n",
    "    .fillna(0)\n",
    ")\n",
    "sns.lineplot(data=pivot)\n",
    "plt.xlabel(\"Year of vulnerability\")\n",
    "plt.ylabel(\"Number of vulnerabilities\")\n",
    "plt.legend(title=\"Weakness ID\")\n",
    "plt.title(\"CWE prevalence in time\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Print commands for LaTeX file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_n_rules(dct):\n",
    "    if isinstance(dct, list):\n",
    "        return len(dct)\n",
    "    if isinstance(list(dct.values())[0], dict):\n",
    "        return sum([get_n_rules(x) for x in dct.values()])\n",
    "    else:\n",
    "        return sum([len(x) for x in dct.values()])\n",
    "\n",
    "\n",
    "n_regular_expressions = get_n_rules(cc_rules)\n",
    "\n",
    "n_all_cpes = len(cpe_dset)\n",
    "n_short_cpes = len([x for x in cpe_dset if x.item_name is not None and not len(x.item_name) > 3])\n",
    "\n",
    "n_smartcards_vulnerable = len(\n",
    "    df_cve_rich.loc[df_cve_rich.category == \"ICs, Smart Cards and Smart Card-Related Devices and Systems\"]\n",
    ")\n",
    "unique_smartcard_cves = set(\n",
    "    itertools.chain.from_iterable(\n",
    "        df_cve_rich.loc[\n",
    "            df_cve_rich.category == \"ICs, Smart Cards and Smart Card-Related Devices and Systems\"\n",
    "        ].related_cves.tolist()\n",
    "    )\n",
    ")\n",
    "\n",
    "df_cve_rich[\"roca\"] = df_cve_rich.related_cves.map(lambda x: \"CVE-2017-15361\" in x)\n",
    "df_cve_rich[\"titan\"] = df_cve_rich.related_cves.map(lambda x: \"CVE-2021-3011\" in x)\n",
    "df_cve_rich[\"minerva\"] = df_cve_rich.related_cves.map(lambda x: \"CVE-2019-15809\" in x)\n",
    "df_cve_rich[\"tmpfail\"] = df_cve_rich.related_cves.map(lambda x: \"CVE-2019-16863\" in x)\n",
    "\n",
    "n_roca_certs = df_cve_rich.roca.value_counts()[True]\n",
    "n_titan_certs = df_cve_rich.titan.value_counts()[True]\n",
    "\n",
    "# Compute direct RoCA references\n",
    "roca_certs = set(df_cve_rich.loc[df_cve_rich.roca].index.tolist())\n",
    "st = set()\n",
    "report = set()\n",
    "\n",
    "for cert in dset:\n",
    "    if cert.dgst in roca_certs:\n",
    "        if cert.heuristics.report_references and cert.heuristics.report_references.directly_referenced_by:\n",
    "            report = report.union(cert.heuristics.report_references.directly_referenced_by)\n",
    "        if cert.heuristics.st_references and cert.heuristics.st_references.directly_referenced_by:\n",
    "            st = st.union(cert.heuristics.st_references.directly_referenced_by)\n",
    "all_refs = st.union(report)\n",
    "\n",
    "try:\n",
    "    n_minerva_certs = df_cve_rich.minerva.value_counts()[True]\n",
    "except KeyError:\n",
    "    n_minerva_certs = 0\n",
    "\n",
    "try:\n",
    "    n_tmpfail_certs = df_cve_rich.tmpfail.value_counts()[True]\n",
    "except KeyError:\n",
    "    n_tmpfail_certs = 0\n",
    "\n",
    "n_cves_in_validity_period = exploded_cves.loc[\n",
    "    (exploded_cves.cve_published_date > exploded_cves.not_valid_before)\n",
    "    & (exploded_cves.cve_published_date < exploded_cves.not_valid_after)\n",
    "].shape[0]\n",
    "n_cves_all = exploded_cves.shape[0]\n",
    "\n",
    "print(f\"\\\\newcommand{{\\\\numCCActiveVulnerable}}{{${df_cve_rich.loc[df_cve_rich.status == 'active'].shape[0]}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCCArchivedVulnerable}}{{${df_cve_rich.loc[df_cve_rich.status == 'archived'].shape[0]}$}}\")\n",
    "print(\n",
    "    f\"\\\\newcommand{{\\\\fractionCPEShort}}{{${(100 * n_short_cpes / n_all_cpes):.2f}\\%$}}  % number CPE strings with <4 characters.\"\n",
    ")\n",
    "print(f\"\\\\newcommand{{\\\\numRegularExpressions}}{{${n_regular_expressions}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numSmartcardsVulnerable}}{{${n_smartcards_vulnerable}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numSmartcardsDistinctVulnerabilities}}{{${len(unique_smartcard_cves)}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numSmartcardsRocaVuln}}{{${n_roca_certs}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCertsReferencingRocaVuln}}{{${len(all_refs)}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numSmartcardsTitanVuln}}{{${n_titan_certs}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numSmartcardsMinervaVuln}}{{${n_minerva_certs}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numSmartcardsTmpFailVuln}}{{${n_tmpfail_certs}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCertsWithMaintenance}}{{${df.loc[df.n_maintenances > 0].shape[0]}$}}\")\n",
    "print(\n",
    "    f\"\\\\newcommand{{\\\\numVulnerableCertsWithMaintenance}}{{${df_cve_rich.loc[df_cve_rich.n_maintenances > 0].shape[0]}$}}\"\n",
    ")\n",
    "print(f\"\\\\newcommand{{\\\\numCertsRevokedWithinYear}}{{${df_w_validity_dates.shape[0]}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numVulnerableCertsRevokedWithinYear}}{{${maybe_revoked.shape[0]}$}}\")\n",
    "print(\n",
    "    f\"\\\\newcommand{{\\\\numVulnerableCertsBeforeCertification}}{{${(100 * ratio_before_cert):.0f}\\%$}}\"\n",
    ")\n",
    "print(\n",
    "    f\"\\\\newcommand{{\\\\numVulnerableCertsAfterCertification}}{{${(100 * ratio_after_cert):.0f}\\%$}}\"\n",
    ")\n",
    "print(\n",
    "    f\"\\\\newcommand{{\\\\numVulnerableCertsInValidityPeriod}}{{${(100 * n_cves_in_validity_period / n_cves_all):.0f}\\%$}}\"\n",
    ")\n",
    "\n",
    "\n",
    "\n",
    "name_mapping = {\n",
    "    \"Improper Restriction of Operations within the Bounds of a Memory Buffer\": \"Buffer overflow\",\n",
    "    \"Exposure of Sensitive Information to an Unauthorized Actor\": \"Sensitive information exposure\",\n",
    "    \"Permissions, Privileges, and Access Controls\": \"Access control error\",\n",
    "    \"Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting')\": \"Cross-site Scripting\",\n",
    "    \"Concurrent Execution using Shared Resource with Improper Synchronization ('Race Condition')\": \"Race Condition\",\n",
    "}\n",
    "\n",
    "top_10_cwes = get_top_n_cwes(cc_df, cwe_df, n_cwes=10)\n",
    "top_10_cwes.cwe_name = top_10_cwes.cwe_name.map(lambda x: name_mapping.get(x, x))\n",
    "top_10_cwes.to_csv(RESULTS_DIR / \"top_10_cwes.csv\")\n",
    "\n",
    "print(f\"\\nTable of CWEs follows:\\n\")\n",
    "\n",
    "for cwe_id, row in top_10_cwes.iterrows():\n",
    "    print(f\"{cwe_id}\\t & {row['cwe_name']} \\t & {row['frequency']} \\\\\\\\\")\n",
    "\n",
    "print(f\"\\nTable of correlations follows:\\n\")\n",
    "\n",
    "for sar, row in df_corr.iterrows():\n",
    "    color_n_cves = \"\\cellcolor{green!15}\" if row[\"n_cves_pvalue\"] < 0.01 else \"\"\n",
    "    color_avg_score = \"\\cellcolor{green!15}\" if row[\"avg_cve_pvalue\"] < 0.01 else \"\"\n",
    "    print(\n",
    "        f\"{sar} & {color_n_cves} {row['n_cves_corr']:.2f} & {color_n_cves} {row['n_cves_pvalue']:.2e} & {color_avg_score} {row['avg_cve_score_corr']:.2f} & {color_avg_score} {row['avg_cve_pvalue']:.2e} & {row['support']:.0f} & {row['range']:.0f} \\\\\\\\\"\n",
    "    )\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "vscode": {
   "interpreter": {
    "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}