{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Temporal trends in the CC ecosystem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mtick\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import tqdm\n",
    "import yaml\n",
    "\n",
    "from sec_certs.cert_rules import PANDAS_KEYWORDS_CATEGORIES, cc_rules\n",
    "from sec_certs.dataset import CCDataset\n",
    "from sec_certs.utils.extract import extract_key_paths, rules_get_subset\n",
    "\n",
    "# Global plotting style; pandas PerformanceWarnings are silenced for the whole notebook.\n",
    "sns.set_theme(style=\"white\", palette=\"deep\", context=\"notebook\")\n",
    "warnings.simplefilter(action=\"ignore\", category=pd.errors.PerformanceWarning)\n",
    "\n",
    "# Every CSV export in this notebook is written below RESULTS_DIR. Create the directory up\n",
    "# front so that the first `to_csv` call does not fail when the folder does not exist yet.\n",
    "RESULTS_DIR = Path(\"./results\")\n",
    "RESULTS_DIR.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the dataset object with `CCDataset.from_web()` (presumably fetches a processed snapshot\n",
    "# over the network -- confirm against the sec_certs documentation) and convert it to the\n",
    "# DataFrame `df` that the analysis cells below read.\n",
    "dset = CCDataset.from_web()\n",
    "df = dset.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shortened display names for the categories with the longest official names; any category\n",
    "# missing from this mapping keeps its original name.\n",
    "long_categories = {\n",
    "    \"Access Control Devices and Systems\": \"Access control\",\n",
    "    \"Biometric Systems and Devices\": \"Biometrics\",\n",
    "    \"Boundary Protection Devices and Systems\": \"Boundary protection\",\n",
    "    \"ICs, Smart Cards and Smart Card-Related Devices and Systems\": \"ICs, Smartcards\",\n",
    "    \"Network and Network-Related Devices and Systems\": \"Network(-related) devices\",\n",
    "}\n",
    "\n",
    "# Certificate counts per (year, category) for certificates issued before 2024.\n",
    "categories = (\n",
    "    df.loc[df.year_from < 2024]\n",
    "    .groupby([\"year_from\", \"category\"], as_index=False)\n",
    "    .size()\n",
    "    .assign(category=lambda counts: counts.category.map(lambda name: long_categories.get(name, name)))\n",
    ")\n",
    "\n",
    "# One small line chart per category, all sharing the 0-150 y-range.\n",
    "with sns.plotting_context(\"notebook\", font_scale=0.75):\n",
    "    category_grid = sns.FacetGrid(\n",
    "        categories, col=\"category\", hue=\"category\", col_wrap=5, height=2, ylim=(0, 150)\n",
    "    )\n",
    "    category_grid.map(sns.lineplot, \"year_from\", \"size\")\n",
    "    category_grid.set(xlabel=\"Year of cert.\", ylabel=\"N. certs.\")\n",
    "    category_grid.set_titles(\"{col_name}\")\n",
    "    # Leave room above the facets for the figure-level title.\n",
    "    category_grid.figure.subplots_adjust(top=0.90)\n",
    "    category_grid.figure.suptitle(\"Category prevalence in time\")\n",
    "    plt.show()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prevalence of most popular categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The four categories that are reported individually; everything else is pooled.\n",
    "popular_categories = {\n",
    "    \"ICs, Smart Cards and Smart Card-Related Devices and Systems\",\n",
    "    \"Multi-Function Devices\",\n",
    "    \"Network and Network-Related Devices and Systems\",\n",
    "    \"Other Devices and Systems\",\n",
    "}\n",
    "is_popular = df.category.isin(popular_categories)\n",
    "df[\"popular_categories\"] = df.category.map(\n",
    "    lambda name: name if name in popular_categories else \"One of 11 other categories\"\n",
    ")\n",
    "n_certs_popular = int(is_popular.sum())\n",
    "n_certs_all = len(df)\n",
    "share_popular = 100 * n_certs_popular / n_certs_all\n",
    "print(f\"N certs in popular categories: {n_certs_popular} ({share_popular:.2f}%)\")\n",
    "\n",
    "# Counts per (pooled category, year); `year_from` is cast to a categorical before grouping.\n",
    "df_year_as_category = df.astype({\"year_from\": \"category\"})\n",
    "n_certs = (\n",
    "    df_year_as_category.loc[df.year_from < 2024]\n",
    "    .groupby([\"popular_categories\", \"year_from\"], as_index=False)\n",
    "    .size()\n",
    ")\n",
    "n_certs.to_csv(RESULTS_DIR / \"popular_categories.csv\")\n",
    "\n",
    "# Inputs for the (currently disabled) stack plot below; the last year is cut off from both\n",
    "# the x values and every series.\n",
    "cats = n_certs.popular_categories.unique()\n",
    "years = n_certs.year_from.cat.categories[:-1]\n",
    "data = [n_certs.loc[n_certs.popular_categories == label, \"size\"].tolist()[:-1] for label in cats]\n",
    "\n",
    "palette = sns.color_palette(\"Spectral\", 5).as_hex()\n",
    "\n",
    "# plt.style.use(\"seaborn-v0_8-white\")\n",
    "# plt.figure(figsize=(7, 4))\n",
    "# plt.stackplot(years, data, labels=cats, colors=palette)\n",
    "# plt.legend(loc=\"upper left\")\n",
    "# plt.ylabel(\"Number of issued certificates\")\n",
    "# plt.xlabel(\"Year of certification\")\n",
    "# plt.xlim(1997, 2021)\n",
    "# plt.show()\n",
    "# plt.style.use(\"seaborn-whitegrid\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Security level distribution over time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly certificate counts per EAL.\n",
    "# NOTE(review): the cutoff here is 2022 while other cells use 2024/2025 -- confirm it is intended.\n",
    "sec_levels = (\n",
    "    df.loc[df.year_from < 2022]\n",
    "    .groupby([\"year_from\", \"eal\"], as_index=False)\n",
    "    .size()\n",
    ")\n",
    "eal_axes = sns.lineplot(data=sec_levels, x=\"year_from\", y=\"size\", hue=\"eal\")\n",
    "eal_axes.set_xlabel(\"Year of certification\")\n",
    "eal_axes.set_ylabel(\"Number of issued certificates\")\n",
    "eal_axes.set_title(\"Security level prevalence in time\")\n",
    "# Put the legend outside the axes, to the right of the plot.\n",
    "eal_axes.legend(title=\"Security level\", bbox_to_anchor=(1.05, 1), loc=\"upper left\", borderaxespad=0.0)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average security level over time\n",
    "\n",
    "- Quantify security level as number from 0 (EAL1) to 13 (EAL7+)\n",
    "- Show three plots:\n",
    "    1. lineplot of average security level over time\n",
    "    2. scatter plot of average security level over time, where size of dot is weighted by the number of certificates issued in the given year\n",
    "    3. Show fitted line plot of average security level over time\n",
    "- Year 1999 is a clear outlier, print its certificates "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode each EAL by its position among the categories of `df.eal` (0 = first category).\n",
    "eal_names = df[\"eal\"].cat.categories\n",
    "eal_to_num_mapping = dict(zip(eal_names, range(len(eal_names))))\n",
    "df[\"eal_number\"] = df.eal.map(eal_to_num_mapping).astype(\"Int64\")\n",
    "\n",
    "# Per-year certificate count and mean numeric EAL; only certificates that have an EAL and\n",
    "# were issued before 2024 are considered.\n",
    "has_eal_before_2024 = (df.year_from < 2024) & df.eal_number.notnull()\n",
    "avg_levels = (\n",
    "    df.loc[has_eal_before_2024]\n",
    "    .groupby([\"year_from\"])\n",
    "    .agg({\"year_from\": \"size\", \"eal_number\": \"mean\"})\n",
    "    .rename(columns={\"year_from\": \"n_certs\"})\n",
    "    .reset_index()\n",
    "    .astype({\"year_from\": \"float\", \"eal_number\": \"float\"})\n",
    ")\n",
    "\n",
    "# Y ticks cover the integer range spanned by the yearly averages and are labelled with EAL names.\n",
    "ymin = math.floor(avg_levels.eal_number.min())\n",
    "ymax = math.ceil(avg_levels.eal_number.max())\n",
    "eal_names_in_range = list(eal_to_num_mapping)[ymin : ymax + 1]\n",
    "# Names without a \"+\" get two phantom spaces so that all tick labels line up.\n",
    "ylabels = [name if \"+\" in name else f\"{name}  \" for name in eal_names_in_range]\n",
    "ytick_positions = range(ymin, ymax + 1)\n",
    "\n",
    "figure, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
    "figure.set_tight_layout(True)\n",
    "\n",
    "# Left panel: plain yearly average.\n",
    "trend_axes = sns.lineplot(data=avg_levels, x=\"year_from\", y=\"eal_number\", marker=\"o\", ax=axes[0])\n",
    "trend_axes.set(\n",
    "    xlabel=\"Year of certification\", ylabel=\"Average security level EAL1-EAL7+\", title=\"Average EAL over time\"\n",
    ")\n",
    "trend_axes.set_yticks(ytick_positions, ylabels)\n",
    "\n",
    "# Right panel: linear fit; each dot is sized by the number of certificates issued in that year.\n",
    "dot_sizes = [1.2 * n_certs_in_year for n_certs_in_year in avg_levels.n_certs]\n",
    "fit_axes = sns.regplot(\n",
    "    data=avg_levels,\n",
    "    x=\"year_from\",\n",
    "    y=\"eal_number\",\n",
    "    scatter_kws={\"sizes\": dot_sizes},\n",
    "    line_kws={\"color\": \"red\"},\n",
    "    ci=None,\n",
    "    ax=axes[1],\n",
    ")\n",
    "fit_axes.set(\n",
    "    xlabel=\"Year of certification\", ylabel=\"Average security level EAL1-EAL7+\", title=\"Fitted EAL over time\"\n",
    ")\n",
    "fit_axes.set_yticks(ytick_positions, ylabels)\n",
    "\n",
    "plt.show()\n",
    "\n",
    "# Same aggregation as above, but per (year, category) and without the year cutoff.\n",
    "avg_levels = (\n",
    "    df.loc[df.eal_number.notnull()]\n",
    "    .copy()\n",
    "    .groupby([\"year_from\", \"category\"])\n",
    "    .agg({\"year_from\": \"size\", \"eal_number\": \"mean\"})\n",
    "    .rename(columns={\"year_from\": \"n_certs\"})\n",
    "    .reset_index()\n",
    ")\n",
    "avg_levels.year_from = avg_levels.year_from.astype(\"float\")\n",
    "avg_levels.eal_number = avg_levels.eal_number.astype(\"float\")\n",
    "# Reuse the shortened category names so that the facet titles fit.\n",
    "avg_levels.category = avg_levels.category.map(lambda x: long_categories.get(x, x))\n",
    "\n",
    "with sns.plotting_context(\"notebook\", font_scale=0.75):\n",
    "    g = sns.FacetGrid(avg_levels, col=\"category\", hue=\"category\", col_wrap=5, height=2)\n",
    "    g.map(sns.lineplot, \"year_from\", \"eal_number\")\n",
    "    g.set(xlabel=\"Year of cert.\", ylabel=\"Avg. EAL\")\n",
    "    g.set_titles(\"{col_name}\")\n",
    "    # `FacetGrid.figure` is the preferred accessor (`fig` is a deprecated alias) and is what the\n",
    "    # category-prevalence plot in this notebook already uses.\n",
    "    g.figure.subplots_adjust(top=0.90)\n",
    "    g.figure.suptitle(\"Average EAL between categories\")\n",
    "    plt.show()\n",
    "\n",
    "# Contrast smartcards with all remaining categories pooled into a single group.\n",
    "avg_levels[\"smartcard_category\"] = avg_levels.category.map(\n",
    "    lambda x: x if x == \"ICs, Smartcards\" else \"Other 14 categories\"\n",
    ")\n",
    "line = sns.lineplot(\n",
    "    data=avg_levels,\n",
    "    x=\"year_from\",\n",
    "    y=\"eal_number\",\n",
    "    hue=\"smartcard_category\",\n",
    "    # `errorbar=None` is the current spelling of the deprecated `ci=None`: draw no error band.\n",
    "    errorbar=None,\n",
    "    style=\"smartcard_category\",\n",
    "    markers=True,\n",
    ")\n",
    "line.legend(title=\"Category\", bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)\n",
    "line.set(xlabel=\"Year of certification\", ylabel=\"Average security level EAL1-EAL7+\", title=\"Average EAL over time\")\n",
    "# Fixed tick range for this plot (indices into the EAL categories).\n",
    "ymin = 1\n",
    "ymax = 9\n",
    "ylabels = [\n",
    "    x if \"+\" in x else x + \"  \" for x in list(eal_to_num_mapping.keys())[ymin : ymax + 1]\n",
    "]  # this also aligns the labels by adding phantom spaces\n",
    "line.set_yticks(range(ymin, ymax + 1), ylabels)\n",
    "\n",
    "avg_levels.to_csv(RESULTS_DIR / \"avg_eal.csv\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average SAR levels\n",
    "\n",
    "The code below is somewhat fragile and gives sensible results only for the most popular SARs. It computes their average values over time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sec_certs.utils.pandas import discover_sar_families, get_sar_level_from_set\n",
    "\n",
    "# Work on an explicit copy so that the column assignments below neither touch `df` nor raise\n",
    "# SettingWithCopyWarning.\n",
    "df_sar = df.loc[:, [\"eal\", \"extracted_sars\"]].copy()\n",
    "families = discover_sar_families(df_sar.extracted_sars)\n",
    "\n",
    "# Count the certificates with a known EAL *before* the column is replaced by its integer codes:\n",
    "# `.cat.codes` encodes a missing value as -1, so a null check afterwards would never match.\n",
    "supports = [df_sar.loc[df_sar[\"eal\"].notnull()].shape[0]]\n",
    "df_sar[\"eal\"] = df_sar[\"eal\"].cat.codes\n",
    "\n",
    "# One column per SAR family, filled by `get_sar_level_from_set` for every certificate.\n",
    "for family in tqdm.tqdm(families):\n",
    "    df_sar[family] = df_sar.extracted_sars.map(lambda x: get_sar_level_from_set(x, family))\n",
    "\n",
    "# Take the 12 columns with the fewest missing values and drop the first two of them.\n",
    "# NOTE(review): this presumably relies on `eal` and `extracted_sars` sorting first -- confirm.\n",
    "most_popular_sars = list(df_sar.isna().sum().sort_values().head(12).index)[2:]\n",
    "df_sar = df_sar.loc[:, most_popular_sars].join(df.year_from.to_frame())\n",
    "melted_sars = (\n",
    "    df_sar.groupby(\"year_from\").mean().reset_index().melt(id_vars=\"year_from\", var_name=\"SAR\", value_name=\"avg_val\")\n",
    ")\n",
    "\n",
    "g = sns.FacetGrid(melted_sars, col=\"SAR\", hue=\"SAR\", col_wrap=5, height=2)\n",
    "g.map(sns.lineplot, \"year_from\", \"avg_val\")\n",
    "g.set(xlabel=\"Year of cert.\", ylabel=\"Avg. value\")\n",
    "g.set_titles(\"{col_name}\")\n",
    "g.figure.subplots_adjust(top=0.87)\n",
    "# The previous title (\"Category prevalence in time\") was copied from another plot.\n",
    "g.figure.suptitle(\"Average SAR levels in time\")\n",
    "plt.show()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average certificate lifetime\n",
    "\n",
    "Examine average certificate lifetime. Note that approx. 1k certificates expired on a single day, 2019-09-01.\n",
    "It may be beneficial to display these plots without these certificates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Certificate lifetime in years (365-day years), derived from the two validity dates.\n",
    "df[\"validity_period\"] = (df.not_valid_after - df.not_valid_before).dt.days / 365\n",
    "\n",
    "# Interesting filtering options: (df.not_valid_after != \"2019-09-01\"),  (df.cert_link.notnull())\n",
    "has_known_lifetime = df.validity_period.notnull() & (df.year_from < 2025)\n",
    "df_validity = df.loc[has_known_lifetime]\n",
    "df_validity[[\"year_from\", \"validity_period\"]].to_csv(RESULTS_DIR / \"df_validity.csv\")\n",
    "\n",
    "# Per-year certificate count and mean lifetime.\n",
    "validity_period = (\n",
    "    df_validity.groupby(\"year_from\")\n",
    "    .agg({\"year_from\": \"size\", \"validity_period\": \"mean\"})\n",
    "    .rename(columns={\"year_from\": \"n_certs\"})\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "figure, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
    "figure.set_tight_layout(True)\n",
    "\n",
    "# Left panel: yearly mean lifetime.\n",
    "mean_axes = sns.lineplot(data=validity_period, x=\"year_from\", y=\"validity_period\", marker=\"o\", ax=axes[0])\n",
    "mean_axes.set(\n",
    "    xlabel=\"Year of certification\",\n",
    "    ylabel=\"Average lifetime of certificates (in years)\",\n",
    "    title=\"Average lifetime of certificates in years\",\n",
    ")\n",
    "\n",
    "# Right panel: distribution of lifetimes within every year.\n",
    "box_axes = sns.boxplot(data=df_validity, x=\"year_from\", y=\"validity_period\", ax=axes[1])\n",
    "box_axes.set(\n",
    "    xlabel=\"Year of certification\",\n",
    "    ylabel=\"Lifetime of certificates (in years)\",\n",
    "    title=\"Boxplot of certificate validity periods in individual years\",\n",
    ")\n",
    "box_axes.tick_params(axis=\"x\", rotation=60)\n",
    "\n",
    "# Both scatter plots share everything except the quantity on the y axis.\n",
    "relplot_kwargs = {\n",
    "    \"kind\": \"scatter\",\n",
    "    \"data\": df_validity,\n",
    "    \"x\": \"not_valid_before\",\n",
    "    \"height\": 10,\n",
    "    \"aspect\": 2 / 1,\n",
    "    \"hue\": \"category\",\n",
    "}\n",
    "\n",
    "period_grid = sns.relplot(y=\"validity_period\", **relplot_kwargs)\n",
    "period_grid.set(\n",
    "    title=\"Scatter plot of validity period development over time\",\n",
    "    xlabel=\"Date of certification\",\n",
    "    ylabel=\"Validity period of certificate (in years)\",\n",
    ")\n",
    "\n",
    "expiry_grid = sns.relplot(y=\"not_valid_after\", **relplot_kwargs)\n",
    "expiry_grid.set(\n",
    "    title=\"Scatter plot of validity dates\",\n",
    "    xlabel=\"Date of certification (not valid before)\",\n",
    "    ylabel=\"Date of expiry (not valid after)\",\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Certification scheme & certification laboratory popularity over time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly certificate counts per national scheme (certificates issued before 2024).\n",
    "schemes = df.loc[(df.scheme.notnull()) & (df.year_from < 2024)].groupby([\"year_from\", \"scheme\"], as_index=False).size()\n",
    "g = sns.FacetGrid(schemes, col=\"scheme\", hue=\"scheme\", col_wrap=6, height=2, ylim=(0, 100))\n",
    "g.map(sns.lineplot, \"year_from\", \"size\")\n",
    "g.set(xlabel=\"Year of cert.\", ylabel=\"N. certs.\")\n",
    "# `FacetGrid.figure` is the preferred accessor (`fig` is a deprecated alias) and matches the\n",
    "# category-prevalence plot earlier in this notebook.\n",
    "g.figure.subplots_adjust(top=0.90)\n",
    "g.figure.suptitle(\"National scheme prevalence in time\")\n",
    "plt.show()\n",
    "\n",
    "# Same plot per certification laboratory; the y axis is left unconstrained here.\n",
    "labs = df.loc[(df.cert_lab.notnull()) & (df.year_from < 2024)].groupby([\"year_from\", \"cert_lab\"], as_index=False).size()\n",
    "g = sns.FacetGrid(labs, col=\"cert_lab\", hue=\"cert_lab\", col_wrap=6, height=2)\n",
    "g.map(sns.lineplot, \"year_from\", \"size\")\n",
    "g.set(xlabel=\"Year of cert.\", ylabel=\"N. certs.\")\n",
    "g.figure.subplots_adjust(top=0.85)\n",
    "g.figure.suptitle(\"Certification laboratories prevalence in time\")\n",
    "plt.show()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Print stats and commands for LaTeX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `schemes` comes from the scheme-popularity cell above; `n_certs_popular` and `n_certs_all` come\n",
    "# from the popular-categories cell.\n",
    "total_certs = schemes[\"size\"].sum()\n",
    "\n",
    "n_certs_smartcards = df.loc[df.category == \"ICs, Smart Cards and Smart Card-Related Devices and Systems\"].shape[0]\n",
    "\n",
    "interesting_schemes = [\"US\", \"DE\", \"FR\", \"JP\", \"CA\"]\n",
    "\n",
    "# Human-readable summary first ...\n",
    "for scheme in interesting_schemes:\n",
    "    frac = schemes.loc[schemes.scheme == scheme, \"size\"].sum()\n",
    "    print(f\"Number of {scheme} certs: {frac} ({(frac * 100 / total_certs):.2f}%)\")\n",
    "\n",
    "frac = schemes.loc[schemes.scheme.isin(interesting_schemes), \"size\"].sum()\n",
    "print(f\"Popular schemes constitute: {frac} ({(frac * 100 / total_certs):.2f}%)\")\n",
    "\n",
    "# ... then the same numbers as LaTeX macros. The percent sign is written as an escaped backslash\n",
    "# followed by `%`: a bare backslash-percent is an invalid escape sequence in a non-raw string\n",
    "# (SyntaxWarning on Python 3.12+). The printed text is unchanged.\n",
    "print(\"\\nLatex commands:\\n\")\n",
    "for scheme in interesting_schemes:\n",
    "    frac = 100 * schemes.loc[schemes.scheme == scheme, \"size\"].sum() / total_certs\n",
    "    print(f\"\\\\newcommand{{\\\\fractionCerts{scheme}}}{{${frac:.0f}\\\\%$}}\")\n",
    "\n",
    "frac = 100 * schemes.loc[schemes.scheme.isin(interesting_schemes), \"size\"].sum() / total_certs\n",
    "print(f\"\\\\newcommand{{\\\\fractionCertsPopularSchemes}}{{${frac:.0f}\\\\%$}}\")\n",
    "\n",
    "# Print some commands related to general CC stats\n",
    "print(f\"\\\\newcommand{{\\\\numCCActiveCerts}}{{${df.loc[df.status == 'active'].shape[0]}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCCArchivedCerts}}{{${df.loc[df.status == 'archived'].shape[0]}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\fractioncertspopularcategories}}{{${(100 * n_certs_popular / n_certs_all):.0f}\\\\%$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\fractionCertsSmartcards}}{{${(100 * n_certs_smartcards / n_certs_all):.0f}\\\\%$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCCBulkArchivedTwentyNineteen}}{{${(df.not_valid_after.value_counts()['2019-09-01']):.0f}$}}\")\n",
    "\n",
    "# Walk the dataset once and keep every certificate's processing state.\n",
    "cert_states = [cert.state for cert in dset]\n",
    "\n",
    "# pdf->txt conversion attempts: one per security target / report that has a PDF path set.\n",
    "num_cc_convert_attempts = sum(bool(state.st_pdf_path) + bool(state.report_pdf_path) for state in cert_states)\n",
    "# OCR attempts: one per document whose conversion was flagged as garbage.\n",
    "num_cc_ocr_attempted = sum(\n",
    "    bool(state.st_convert_garbage) + bool(state.report_convert_garbage) for state in cert_states\n",
    ")\n",
    "# OCR successes: flagged as garbage and nevertheless marked as converted OK.\n",
    "num_cc_ocr_success = sum(\n",
    "    bool(state.st_convert_garbage and state.st_convert_ok)\n",
    "    + bool(state.report_convert_garbage and state.report_convert_ok)\n",
    "    for state in cert_states\n",
    ")\n",
    "\n",
    "print(f\"\\\\newcommand{{\\\\numCcConvertAttempts}}{{${num_cc_convert_attempts}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCcOcrAttempted}}{{${num_cc_ocr_attempted}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCcOcrSuccess}}{{${num_cc_ocr_success}$}}\")\n",
    "\n",
    "# Count the schemes that have certificate-id rules, and the total number of those rules.\n",
    "# The path is relative to the working directory of the kernel.\n",
    "with Path(\"./src/sec_certs/rules.yaml\").open() as handle:\n",
    "    rules_yaml = yaml.load(handle, Loader=yaml.FullLoader)\n",
    "\n",
    "cert_id_rules = rules_yaml[\"cc_cert_id\"]\n",
    "num_cc_schemes = len(cert_id_rules)\n",
    "num_cc_scheme_id_rules = sum(len(scheme_rules) for scheme_rules in cert_id_rules.values())\n",
    "\n",
    "print(f\"\\\\newcommand{{\\\\numccschemes}}{{${num_cc_schemes}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numccschemeidrules}}{{${num_cc_scheme_id_rules}$}}\")\n",
    "\n",
    "# Certificates whose two validity dates coincide, and certificates without an expiration date.\n",
    "num_certs_zero_day_valid = int((df.not_valid_after == df.not_valid_before).sum())\n",
    "num_certs_empty_expiration_date = int(df.not_valid_after.isna().sum())\n",
    "print(f\"\\\\newcommand{{\\\\numCertsZeroDaysValid}}{{${num_certs_zero_day_valid}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCertsEmptyExpirationDate}}{{${num_certs_empty_expiration_date}$}}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly counts for four selected schemes. The frame gets its own name so that it does not\n",
    "# overwrite the `interesting_schemes` list of the previous cell (which would break re-running\n",
    "# that cell), and it is an explicit copy so that the column assignment below is not a write\n",
    "# into a slice of `schemes`.\n",
    "interesting_schemes_df = schemes.loc[schemes.scheme.isin({\"NL\", \"SE\", \"NO\", \"UK\"})].copy()\n",
    "# Drop the categories of all other schemes so that they do not show up in the legend.\n",
    "interesting_schemes_df[\"scheme\"] = interesting_schemes_df[\"scheme\"].cat.remove_unused_categories()\n",
    "interesting_schemes_df.to_csv(RESULTS_DIR / \"interesting_schemes.csv\")\n",
    "\n",
    "line = sns.lineplot(\n",
    "    data=interesting_schemes_df, x=\"year_from\", y=\"size\", hue=\"scheme\", style=\"scheme\", markers=True, dashes=True\n",
    ")\n",
    "line.legend(title=\"Scheme\", bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Popularity of keywords extracted from security targets over time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: Resolve duplicity in crypto_scheme mac\n",
    "st_keywords_df = dset.get_keywords_df(\"st_keywords\").drop(columns=[\"crypto_scheme.MAC\"])\n",
    "# Keep only the last path component as the column name and turn every cell into a\n",
    "# \"value > 0\" boolean (missing values count as 0). The vectorised `.gt(0)` replaces the\n",
    "# element-wise `applymap`, which is deprecated in recent pandas.\n",
    "st_keywords_df = (\n",
    "    st_keywords_df.rename(columns={x: x.split(\".\")[-1] for x in st_keywords_df.columns})\n",
    "    .fillna(0)\n",
    "    .gt(0)\n",
    ")\n",
    "\n",
    "df_keywords = df.loc[:, [\"category\", \"eal\", \"not_valid_before\", \"not_valid_after\", \"year_from\"]].copy()\n",
    "df_keywords = df_keywords.join(st_keywords_df).loc[df_keywords.year_from < 2025].copy()\n",
    "\n",
    "# The number of certificates per year is the same for every keyword category: compute it once.\n",
    "n_certs_per_year = df_keywords.groupby(\"year_from\").size()\n",
    "\n",
    "figure, axes = plt.subplots(4, 3)\n",
    "figure.set_size_inches(20, 20)\n",
    "figure.set_tight_layout(True)\n",
    "\n",
    "for position, examined_category in enumerate(PANDAS_KEYWORDS_CATEGORIES):\n",
    "    row, col = divmod(position, 3)  # fill the 4x3 grid row by row\n",
    "    cc_rules_subset = rules_get_subset(examined_category)\n",
    "    keywords = [x.split(\".\")[-1] for x in extract_key_paths(cc_rules_subset, examined_category)]\n",
    "    top_n_keywords = df_keywords.loc[:, keywords].sum().sort_values(ascending=False).head(10).index\n",
    "\n",
    "    # Share (in %) of each year's certificates that contain the keyword, in tidy form. Dividing\n",
    "    # inside the chain avoids writing floats back into the integer count columns.\n",
    "    crypto = (\n",
    "        df_keywords.groupby(\"year_from\")[top_n_keywords]\n",
    "        .sum()\n",
    "        .div(n_certs_per_year, axis=0)\n",
    "        .mul(100)\n",
    "        .reset_index()\n",
    "        .melt(id_vars=\"year_from\", var_name=\"keyword\", value_name=\"percentage\")\n",
    "    )\n",
    "\n",
    "    line = sns.lineplot(data=crypto, x=\"year_from\", y=\"percentage\", hue=\"keyword\", ax=axes[row][col])\n",
    "    line.set(\n",
    "        title=f\"Density of {examined_category} keywords over time\",\n",
    "        xlabel=\"Year of certification\",\n",
    "        ylabel=\"% of certs. containing keyword\",\n",
    "    )\n",
    "    line.yaxis.set_major_formatter(mtick.PercentFormatter())\n",
    "    line.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def keyword_fraction(keyword: str, year: int) -> float:\n",
    "    \"\"\"Return the fraction of `df_keywords` rows with `year_from == year` whose `keyword` column is set.\"\"\"\n",
    "    issued_in_year = df_keywords.year_from == year\n",
    "    n_with_keyword = df_keywords.loc[issued_in_year & df_keywords[keyword]].shape[0]\n",
    "    return n_with_keyword / df_keywords.loc[issued_in_year].shape[0]\n",
    "\n",
    "\n",
    "frac_sha2_2014 = keyword_fraction(\"SHA2\", 2014)\n",
    "frac_sha2_2021 = keyword_fraction(\"SHA2\", 2021)\n",
    "frac_sha1_2021 = keyword_fraction(\"SHA1\", 2021)\n",
    "\n",
    "# Number of certificates (any year) that matched at least one post-quantum keyword.\n",
    "pq_keywords = [x.split(\".\")[-1] for x in extract_key_paths(rules_get_subset(\"pq_crypto\"), \"pq_crypto\")]\n",
    "n_positive_pq_certs = df_keywords[pq_keywords].any(axis=1).sum()\n",
    "\n",
    "ec_keywords = [x.split(\".\")[-1] for x in extract_key_paths(rules_get_subset(\"ecc_curve\"), \"ecc_curve\")]\n",
    "frac_nist_2021 = keyword_fraction(\"NIST\", 2021)\n",
    "frac_brainpool_2021 = keyword_fraction(\"Brainpool\", 2021)\n",
    "\n",
    "\n",
    "# The percent sign is written as an escaped backslash followed by `%`: a bare backslash-percent\n",
    "# is an invalid escape sequence in a non-raw string (SyntaxWarning on Python 3.12+). The printed\n",
    "# text is unchanged.\n",
    "print(f\"\\\\newcommand{{\\\\numShaTwoFourteen}}{{${(100 * frac_sha2_2014):.0f}\\\\%$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numShaTwoTwentyOne}}{{${(100 * frac_sha2_2021):.0f}\\\\%$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numShaOneTwentyOne}}{{${(100 * frac_sha1_2021):.0f}\\\\%$}}\")\n",
    "\n",
    "print(f\"\\\\newcommand{{\\\\numSearchedPQKeywords}}{{${len(pq_keywords)}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numCertsWithPQKeywords}}{{${n_positive_pq_certs}$}}\")\n",
    "\n",
    "print(f\"\\\\newcommand{{\\\\numSearchedECKeywords}}{{${len(ec_keywords)}$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numECNISTTwentyOne}}{{${(100 * frac_nist_2021):.0f}\\\\%$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\numECBrainpoolTwentyOne}}{{${(100 * frac_brainpool_2021):.0f}\\\\%$}}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  },
  "vscode": {
   "interpreter": {
    "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}