more work on PoC reference annotator

author: adamjanovsky 2023-02-17 14:11:57 +0100
committer: adamjanovsky 2023-02-17 14:11:57 +0100
commit: 76d032e53e4e84c1c0036daaadeab2821be0730a (patch)
tree: 74e4c08f6022de4404563216c461b8d705df1037 /notebooks/cc/reference_annotations/prediction.ipynb
parent: 7508e2b03444c70e44c0597ed333a074917024f7 (diff)
download: sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.gz
sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.zst
sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.zip
1 files changed, 195 insertions, 524 deletions
diff --git a/notebooks/cc/reference_annotations/prediction.ipynb b/notebooks/cc/reference_annotations/prediction.ipynb
index 435fe684..29b63523 100644
--- a/notebooks/cc/reference_annotations/prediction.ipynb
+++ b/notebooks/cc/reference_annotations/prediction.ipynb
@@ -1,8 +1,20 @@
 {
  "cells": [
   {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prediction of reference annotations in CC Certificates (Raw)\n",
+    "\n",
+    "This notebook:\n",
+    "- Loads a dataframe of the dataset with columns `(dgst, cert_id, sentences, label)`\n",
+    "- Trains a model that classifies the sentences surrounding a certificate reference according to their common sentiment (i.e., the meaning of the reference)"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -14,7 +26,7 @@
     }
    ],
    "source": [
-    "# It is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook\n",
+    "# When on Aura, it is important to first set the CUDA_VISIBLE_DEVICES environment variable directly from the notebook\n",
     "# For available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs\n",
     "\n",
     "import os\n",
@@ -22,140 +34,38 @@
     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"MIG-a5459e6a-b26d-5985-874c-528458a7728b\"\n",
     "print(os.getenv(\"CUDA_VISIBLE_DEVICES\"))\n",
     "\n",
-    "# import spacy\n",
-    "# from spacy_cld import LanguageDetector\n",
-    "\n",
-    "# nlp = spacy.load(\"en_core_web_sm\")\n",
-    "# language_detector = LanguageDetector()\n",
-    "# nlp.add_pipe(language_detector)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# installed packages: setfit\n",
-    "\n",
     "import pandas as pd\n",
-    "from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence\n",
-    "from sentence_transformers.losses import CosineSimilarityLoss\n",
+    "from sec_certs.utils.nlp import prec_recall_metric\n",
     "from ast import literal_eval\n",
-    "from setfit import SetFitModel, SetFitTrainer, sample_dataset\n",
-    "\n",
-    "from sklearn.metrics import precision_score, recall_score\n",
     "from pathlib import Path\n",
+    "from sec_certs.model.reference_classification import ReferenceClassifierTrainer\n",
+    "import numpy as np\n",
     "\n",
     "REPO_ROOT = Path(\"../../../\").resolve()\n",
     "\n",
-    "def prec_recall_metric(y_pred, y_test):\n",
-    "    return {\"precision\": precision_score(y_test, y_pred, zero_division=\"warn\", average=\"micro\"), \"recall\": recall_score(y_test, y_pred, zero_division=\"warn\", average=\"micro\")}\n",
     "\n",
-    "def predict_and_fill_df(model, df, train_dataset) -> pd.DataFrame:\n",
-    "    label_mapping = {index: x for index, x in enumerate(train_dataset.features[\"label\"].names)}\n",
+    "def predict_and_fill_df(clf, df, label_mapping):\n",
+    "    \"\"\"\n",
+    "    Given the classifier, dataframe and label mapping, will populate dataframe with predictions for simple inspection.\n",
+    "    \"\"\"\n",
     "    df_new = df.copy()\n",
-    "\n",
-    "    y_train_proba = model.predict_proba(df_new.sentences.tolist())\n",
-    "    df_new[\"y_proba\"] = y_train_proba.tolist()\n",
-    "    df_new[\"y_pred\"] = df_new.y_proba.map(lambda x: label_mapping[x.index(max(x))])\n",
-    "    df_new[\"correct\"] = df_new.y_pred == df_new.label\n",
-    "    \n",
-    "    return df_new\n"
+    "    y_proba = clf.predict_proba(df_new.sentences)\n",
+    "    df_new[\"y_proba\"] = y_proba\n",
+    "    df_new[\"y_pred\"] = df_new.y_proba.map(lambda x: label_mapping[np.argmax(x)])\n",
+    "    df_new[\"correct\"] = df_new.label == df_new.y_pred\n",
+    "    return df_new"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# EXPLODED variant\n",
-    "# # Prepare dataset\n",
-    "\n",
-    "# df = pd.read_csv(REPO_ROOT / \"datasets/reference_classification_dataset_exploded.csv\", sep=\";\")\n",
-    "# df = df.loc[(df.label.notnull()) & ((df.location == \"report\"))]\n",
-    "\n",
-    "# # # Get language of the sentence, quite unreliable for the moment\n",
-    "# # df[\"lang\"] = df.sentence.map(lambda x: nlp(x)._.languages)\n",
-    "# # df[\"is_en\"] = df.lang.map(lambda x: x == [\"en\"])\n",
-    "\n",
-    "# # # Take suitable subset of the dataframe\n",
-    "# # df = df.loc[df.is_en] # only english\n",
-    "\n",
-    "# df = df.loc[df.label.isin({\"COMPONENT_USED\", \"BASIS_OF_RECERTIFICATION\", \"BASIS_FOR\"})]  # only the most popular labels\n",
-    "\n",
-    "# # Split into train/valid\n",
-    "# df_train = df.loc[df.split == \"train\"].drop(columns=\"split\")\n",
-    "# df_valid = df.loc[df.split == \"valid\"].drop(columns=\"split\")\n",
-    "\n",
-    "# dataset_features = Features(\n",
-    "#     {\n",
-    "#         \"dgst\": Value(\"string\"),\n",
-    "#         \"cert_id\": Value(\"string\"),\n",
-    "#         \"location\": Value(\"string\"),\n",
-    "#         \"sentences\": Value(\"string\"),\n",
-    "#         \"label\": ClassLabel(names=list(df.label.unique())),\n",
-    "#     }\n",
-    "# )\n",
-    "# train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split=\"train\", preserve_index=False)\n",
-    "# valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split=\"validation\", preserve_index=False)\n",
-    "\n",
-    "# dataset = DatasetDict()\n",
-    "# dataset['train'] = train_dataset\n",
-    "# dataset['validation'] = valid_dataset\n",
-    "\n",
-    "# train_dataset = sample_dataset(dataset[\"train\"], label_column=\"label\", num_samples=10)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "099029c554974aa1abdd9660eb8e2ece",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?ba/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4b605cedf332432e8c0e055b1eb4aa54",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?ba/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# GROUPED variant\n",
     "# Prepare dataset\n",
     "\n",
     "df = pd.read_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merged.csv\", sep=\";\")\n",
     "df = df.loc[(df.label.notnull())]\n",
-    "\n",
-    "# # Get language of the sentence, quite unreliable for the moment\n",
-    "# df[\"lang\"] = df.sentence.map(lambda x: nlp(x)._.languages)\n",
-    "# df[\"is_en\"] = df.lang.map(lambda x: x == [\"en\"])\n",
-    "\n",
-    "# # Take suitable subset of the dataframe\n",
-    "# df = df.loc[df.is_en] # only english\n",
-    "\n",
     "df = df.loc[df.label.isin({\"COMPONENT_USED\", \"BASIS_OF_RECERTIFICATION\", \"BASIS_FOR\"})]  # only the most popular labels\n",
     "df.sentences = df.sentences.map(lambda x: list(literal_eval(x)))\n",
     "\n",
@@ -163,27 +73,13 @@
     "df_train = df.loc[df.split == \"train\"].drop(columns=\"split\")\n",
     "df_valid = df.loc[df.split == \"valid\"].drop(columns=\"split\")\n",
     "\n",
-    "dataset_features = Features(\n",
-    "    {\n",
-    "        \"dgst\": Value(\"string\"),\n",
-    "        \"cert_id\": Value(\"string\"),\n",
-    "        \"sentences\": Sequence(feature=Value(\"string\")),\n",
-    "        \"label\": ClassLabel(names=list(df.label.unique())),\n",
-    "    }\n",
-    ")\n",
-    "train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split=\"train\", preserve_index=False)\n",
-    "valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split=\"validation\", preserve_index=False)\n",
-    "\n",
-    "dataset = DatasetDict()\n",
-    "dataset['train'] = train_dataset\n",
-    "dataset['validation'] = valid_dataset\n",
-    "\n",
-    "train_dataset = sample_dataset(dataset[\"train\"], label_column=\"label\", num_samples=10)\n"
+    "# Use just a few examples for learning\n",
+    "df_train = df_train.sample(n=10)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -210,83 +106,77 @@
        "      <th>dgst</th>\n",
        "      <th>cert_id</th>\n",
        "      <th>label</th>\n",
-       "      <th>split</th>\n",
        "      <th>sentences</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>ANSSI-CC-2017/61</td>\n",
+       "      <th>12</th>\n",
+       "      <td>99223aca5d9eb3b3</td>\n",
+       "      <td>DCSSI-2009/11</td>\n",
        "      <td>COMPONENT_USED</td>\n",
-       "      <td>valid</td>\n",
-       "      <td>{'Elixir-2 Project, Certification ID ANSSI-CC-...</td>\n",
+       "      <td>[Toolbox Certificate DCSSI-2009/11\\nTable 1:]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>BSI-DSZ-CC-1074-2019</td>\n",
-       "      <td>BASIS_FOR</td>\n",
-       "      <td>valid</td>\n",
-       "      <td>{'The BAC+PACE configuration is subject of the...</td>\n",
+       "      <th>4</th>\n",
+       "      <td>0f3900cdcd0c7f3e</td>\n",
+       "      <td>BSI-DSZ-CC-1072-V4-2021-MA-01</td>\n",
+       "      <td>COMPONENT_USED</td>\n",
+       "      <td>[Certification Report NXP Secure Smart Card Co...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0e22fe4e4e58faf4</td>\n",
-       "      <td>BSI-DSZ-CC-1052-V4-2021</td>\n",
+       "      <th>9</th>\n",
+       "      <td>6d6ade44dcc497dd</td>\n",
+       "      <td>BSI-DSZ-CC-0227-2004</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>valid</td>\n",
-       "      <td>{'basierend auf BSI-DSZ-CC-1052-V4-2021.'}</td>\n",
+       "      <td>[This is a\\nre-certification based on BSI-DSZ-...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
+       "      <th>5</th>\n",
        "      <td>0f3900cdcd0c7f3e</td>\n",
-       "      <td>BSI-DSZ-CC-1072-V4-2021</td>\n",
+       "      <td>NSCIB-CC-66030-CR5</td>\n",
        "      <td>COMPONENT_USED</td>\n",
-       "      <td>train</td>\n",
-       "      <td>{'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra...</td>\n",
+       "      <td>[certificate identification NSCIB-CC-66030-CR5...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0f3900cdcd0c7f3e</td>\n",
-       "      <td>BSI-DSZ-CC-1072-V4-2021-MA-01</td>\n",
+       "      <th>6</th>\n",
+       "      <td>1fb1564dfb0f0b04</td>\n",
+       "      <td>ANSSI-CC-2020/34</td>\n",
        "      <td>COMPONENT_USED</td>\n",
-       "      <td>train</td>\n",
-       "      <td>{'Certification Report NXP Secure Smart Card C...</td>\n",
+       "      <td>[[CER_IC] Rapport de certification ANSSI-CC-20...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "               dgst                        cert_id                     label  \\\n",
-       "0  0c7ef6c32cbdee47               ANSSI-CC-2017/61            COMPONENT_USED   \n",
-       "1  0c7ef6c32cbdee47           BSI-DSZ-CC-1074-2019                 BASIS_FOR   \n",
-       "2  0e22fe4e4e58faf4        BSI-DSZ-CC-1052-V4-2021  BASIS_OF_RECERTIFICATION   \n",
-       "3  0f3900cdcd0c7f3e        BSI-DSZ-CC-1072-V4-2021            COMPONENT_USED   \n",
-       "4  0f3900cdcd0c7f3e  BSI-DSZ-CC-1072-V4-2021-MA-01            COMPONENT_USED   \n",
+       "                dgst                        cert_id                     label  \\\n",
+       "12  99223aca5d9eb3b3                  DCSSI-2009/11            COMPONENT_USED   \n",
+       "4   0f3900cdcd0c7f3e  BSI-DSZ-CC-1072-V4-2021-MA-01            COMPONENT_USED   \n",
+       "9   6d6ade44dcc497dd           BSI-DSZ-CC-0227-2004  BASIS_OF_RECERTIFICATION   \n",
+       "5   0f3900cdcd0c7f3e             NSCIB-CC-66030-CR5            COMPONENT_USED   \n",
+       "6   1fb1564dfb0f0b04               ANSSI-CC-2020/34            COMPONENT_USED   \n",
        "\n",
-       "   split                                          sentences  \n",
-       "0  valid  {'Elixir-2 Project, Certification ID ANSSI-CC-...  \n",
-       "1  valid  {'The BAC+PACE configuration is subject of the...  \n",
-       "2  valid         {'basierend auf BSI-DSZ-CC-1052-V4-2021.'}  \n",
-       "3  train  {'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra...  \n",
-       "4  train  {'Certification Report NXP Secure Smart Card C...  "
+       "                                            sentences  \n",
+       "12      [Toolbox Certificate DCSSI-2009/11\\nTable 1:]  \n",
+       "4   [Certification Report NXP Secure Smart Card Co...  \n",
+       "9   [This is a\\nre-certification based on BSI-DSZ-...  \n",
+       "5   [certificate identification NSCIB-CC-66030-CR5...  \n",
+       "6   [[CER_IC] Rapport de certification ANSSI-CC-20...  "
       ]
      },
-     "execution_count": 12,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.head()"
+    "df_train.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -297,16 +187,16 @@
       "model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.\n",
       "Applying column mapping to training dataset\n",
       "***** Running training *****\n",
-      "  Num examples = 640\n",
+      "  Num examples = 1760\n",
       "  Num epochs = 1\n",
-      "  Total optimization steps = 40\n",
+      "  Total optimization steps = 110\n",
       "  Total train batch size = 16\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a7118a8525ee47f78f29b74fb5e62117",
+       "model_id": "96e469ad1f984bf6ba2c819884a1c231",
        "version_major": 2,
        "version_minor": 0
       },
@@ -320,12 +210,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "96f5685713554e05af4ff7b7ec4e2f31",
+       "model_id": "a9ef28c8c0314e7f831e6e35c2af75db",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "Iteration:   0%|          | 0/40 [00:00<?, ?it/s]"
+       "Iteration:   0%|          | 0/110 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -343,49 +233,33 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'precision': 0.3684210526315789, 'recall': 0.3684210526315789}\n"
+      "Internal evaluation (of model working on individual sentences)\n",
+      "{'precision': 0.45454545454545453, 'recall': 0.45454545454545453}\n",
+      "Actual evaluation after ensemble soft voting\n",
+      "{'precision': 0.2857142857142857, 'recall': 0.2857142857142857}\n"
      ]
     }
    ],
    "source": [
-    "# Load a SetFit model from Hub\n",
-    "model = SetFitModel.from_pretrained(\"all-mpnet-base-v2\")\n",
-    "\n",
-    "# Create trainer\n",
-    "trainer = SetFitTrainer(\n",
-    "    model=model,\n",
-    "    train_dataset=train_dataset,\n",
-    "    eval_dataset=valid_dataset,\n",
-    "    loss_class=CosineSimilarityLoss,\n",
-    "    metric=prec_recall_metric,\n",
-    "    batch_size=16,\n",
-    "    num_iterations=40, # The number of text pairs to generate for contrastive learning\n",
-    "    num_epochs=1, # The number of epochs to use for contrastive learning\n",
-    "    column_mapping={\"sentences\": \"text\", \"label\": \"label\"} # Map dataset columns to text/label expected by trainer\n",
-    ")\n",
-    "# trainer.unfreeze(keep_body_frozen=False)\n",
-    "\n",
-    "trainer.train(show_progress_bar=True)\n",
-    "metrics = trainer.evaluate()\n",
-    "print(metrics)"
+    "trainer = ReferenceClassifierTrainer(df_train, df_valid, prec_recall_metric, \"transformer\")\n",
+    "trainer.train()\n",
+    "trainer.evaluate()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Can be validated with train_dataset.features[\"label\"].int2str(index) function\n",
-    "label_mapping = {index: x for index, x in enumerate(train_dataset.features[\"label\"].names)}\n",
-    "\n",
-    "df_train = predict_and_fill_df(model, df_train, train_dataset)\n",
-    "df_valid = predict_and_fill_df(model, df_valid, train_dataset)"
+    "# Take a look at misclassified instances\n",
+    "df_train = predict_and_fill_df(trainer.clf, df_train, trainer.label_mapping)\n",
+    "df_valid = predict_and_fill_df(trainer.clf, df_valid, trainer.label_mapping)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -411,7 +285,6 @@
        "      <th></th>\n",
        "      <th>dgst</th>\n",
        "      <th>cert_id</th>\n",
-       "      <th>location</th>\n",
        "      <th>label</th>\n",
        "      <th>sentences</th>\n",
        "      <th>y_proba</th>\n",
@@ -422,374 +295,171 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>0e22fe4e4e58faf4</td>\n",
-       "      <td>BSI-DSZ-CC-1052-V4-2021</td>\n",
-       "      <td>report</td>\n",
+       "      <td>6d6ade44dcc497dd</td>\n",
+       "      <td>BSI-DSZ-CC-0227-2004</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>basierend auf BSI-DSZ-CC-1052-V4-2021.</td>\n",
-       "      <td>[0.9006353496264655, 0.0993646503735345]</td>\n",
+       "      <td>[This is a\\nre-certification based on BSI-DSZ-...</td>\n",
+       "      <td>[0.5461188093773812, 0.45388119062261884]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>29964f32c68b0ce8</td>\n",
-       "      <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
-       "      <td>report</td>\n",
+       "      <th>19</th>\n",
+       "      <td>ca5da2fe138af656</td>\n",
+       "      <td>BSI-DSZ-CC-0413-2007</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>Specific results from\\nthe evaluation process ...</td>\n",
-       "      <td>[0.9018767488720597, 0.09812325112794029]</td>\n",
+       "      <td>[This is a re-certification based on\\nBSI-DSZ-...</td>\n",
+       "      <td>[0.5465589745598575, 0.4534410254401425]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>29964f32c68b0ce8</td>\n",
-       "      <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
-       "      <td>report</td>\n",
-       "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>This is a re-certification based on BSI-DSZ-CC...</td>\n",
-       "      <td>[0.694838178998959, 0.30516182100104094]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>False</td>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                dgst               cert_id                     label  \\\n",
+       "9   6d6ade44dcc497dd  BSI-DSZ-CC-0227-2004  BASIS_OF_RECERTIFICATION   \n",
+       "19  ca5da2fe138af656  BSI-DSZ-CC-0413-2007  BASIS_OF_RECERTIFICATION   \n",
+       "\n",
+       "                                            sentences  \\\n",
+       "9   [This is a\\nre-certification based on BSI-DSZ-...   \n",
+       "19  [This is a re-certification based on\\nBSI-DSZ-...   \n",
+       "\n",
+       "                                      y_proba          y_pred  correct  \n",
+       "9   [0.5461188093773812, 0.45388119062261884]  COMPONENT_USED    False  \n",
+       "19   [0.5465589745598575, 0.4534410254401425]  COMPONENT_USED    False  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train.loc[~df_train.correct]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>dgst</th>\n",
+       "      <th>cert_id</th>\n",
+       "      <th>label</th>\n",
+       "      <th>sentences</th>\n",
+       "      <th>y_proba</th>\n",
+       "      <th>y_pred</th>\n",
+       "      <th>correct</th>\n",
        "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
        "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>29964f32c68b0ce8</td>\n",
-       "      <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
-       "      <td>report</td>\n",
-       "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>As the evaluation work performed for this cert...</td>\n",
-       "      <td>[0.8982407935537878, 0.1017592064462122]</td>\n",
+       "      <th>1</th>\n",
+       "      <td>0c7ef6c32cbdee47</td>\n",
+       "      <td>BSI-DSZ-CC-1074-2019</td>\n",
+       "      <td>BASIS_FOR</td>\n",
+       "      <td>[The BAC+PACE configuration is subject of the ...</td>\n",
+       "      <td>[0.9330686268852108, 0.06693137311478929]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>22</th>\n",
-       "      <td>c1d88ce9dadd7d2d</td>\n",
-       "      <td>BSI-DSZ-CC-0312-2005</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>[13] Certification Report BSI-DSZ-CC-0312-2005...</td>\n",
-       "      <td>[0.8947294027870802, 0.1052705972129198]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>c1d88ce9dadd7d2d</td>\n",
-       "      <td>BSI-DSZ-CC-0312-2005</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>P5CT072V0N refer to the certification report B...</td>\n",
-       "      <td>[0.900841506996804, 0.09915849300319597]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>28</th>\n",
-       "      <td>238f8edc5eda1358</td>\n",
-       "      <td>BSI-DSZ-CC-0222-2003</td>\n",
-       "      <td>report</td>\n",
+       "      <th>2</th>\n",
+       "      <td>0e22fe4e4e58faf4</td>\n",
+       "      <td>BSI-DSZ-CC-1052-V4-2021</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>This is a re-\\ncertification based on BSI-DSZ-...</td>\n",
-       "      <td>[0.6922405130591214, 0.3077594869408786]</td>\n",
+       "      <td>[basierend auf BSI-DSZ-CC-1052-V4-2021.]</td>\n",
+       "      <td>[0.7070543956916182, 0.2929456043083818]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>29</th>\n",
+       "      <th>7</th>\n",
        "      <td>238f8edc5eda1358</td>\n",
        "      <td>BSI-DSZ-CC-0222-2003</td>\n",
-       "      <td>report</td>\n",
-       "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>This certification is a re-certification of BS...</td>\n",
-       "      <td>[0.7146859285928153, 0.2853140714071847]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31</th>\n",
-       "      <td>a6fac58198296194</td>\n",
-       "      <td>BSI-DSZ-CC-0555-2009</td>\n",
-       "      <td>report</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>Specific results from the evaluation process\\n...</td>\n",
-       "      <td>[0.9003949217128049, 0.09960507828719516]</td>\n",
+       "      <td>[This certification is a re-certification of B...</td>\n",
+       "      <td>[0.5998535578550636, 0.4001464421449364]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>32</th>\n",
-       "      <td>a6fac58198296194</td>\n",
-       "      <td>BSI-DSZ-CC-0555-2009</td>\n",
-       "      <td>report</td>\n",
+       "      <th>8</th>\n",
+       "      <td>29964f32c68b0ce8</td>\n",
+       "      <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>As the evaluation work performed for this cert...</td>\n",
-       "      <td>[0.8962471709210816, 0.10375282907891832]</td>\n",
+       "      <td>[This is a re-certification based on BSI-DSZ-C...</td>\n",
+       "      <td>[0.8727371952470533, 0.12726280475294682]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>33</th>\n",
+       "      <th>13</th>\n",
        "      <td>a6fac58198296194</td>\n",
        "      <td>BSI-DSZ-CC-0555-2009</td>\n",
-       "      <td>report</td>\n",
        "      <td>BASIS_OF_RECERTIFICATION</td>\n",
-       "      <td>This is a re-certification\\nbased on BSI-DSZ-C...</td>\n",
-       "      <td>[0.6928894855518116, 0.3071105144481884]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>40</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>BSI-DSZ-CC-1074-2019</td>\n",
-       "      <td>report</td>\n",
-       "      <td>BASIS_FOR</td>\n",
-       "      <td>The BAC+PACE configuration is subject of the s...</td>\n",
-       "      <td>[0.8997694315622796, 0.1002305684377204]</td>\n",
+       "      <td>[Specific results from the evaluation process\\...</td>\n",
+       "      <td>[0.8670438280210987, 0.13295617197890133]</td>\n",
        "      <td>COMPONENT_USED</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>BSI-DSZ-CC-1074-2019</td>\n",
-       "      <td>report</td>\n",
-       "      <td>BASIS_FOR</td>\n",
-       "      <td>The further security mechanism Basic Access Co...</td>\n",
-       "      <td>[0.8923570789488144, 0.10764292105118559]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>42</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>BSI-DSZ-CC-1074-2019</td>\n",
-       "      <td>report</td>\n",
-       "      <td>BASIS_FOR</td>\n",
-       "      <td>[14] Certification Report BSI-DSZ-CC-1074-2019...</td>\n",
-       "      <td>[0.9007422537659316, 0.09925774623406838]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>43</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>ANSSI-CC-2017/61</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>For details\\nconcerning the CC evaluation of t...</td>\n",
-       "      <td>[0.8933002244736725, 0.10669977552632742]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>44</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>ANSSI-CC-2017/61</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>For development and production sites regarding...</td>\n",
-       "      <td>[0.9000976545847758, 0.0999023454152242]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>45</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>ANSSI-CC-2017/61</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>Elixir-2 Project, Certification ID ANSSI-CC-20...</td>\n",
-       "      <td>[0.8997425549440855, 0.10025744505591448]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>46</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>ANSSI-CC-2017/61</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>® X – Security Target for\\ncomposition; STMicr...</td>\n",
-       "      <td>[0.8976230487254611, 0.10237695127453893]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>47</th>\n",
-       "      <td>0c7ef6c32cbdee47</td>\n",
-       "      <td>ANSSI-CC-2017/61</td>\n",
-       "      <td>report</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>[12] Rapport de certification ANSSI-CC-2017/61...</td>\n",
-       "      <td>[0.8939784357781945, 0.10602156422180552]</td>\n",
-       "      <td>COMPONENT_USED</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                dgst                  cert_id location  \\\n",
-       "9   0e22fe4e4e58faf4  BSI-DSZ-CC-1052-V4-2021   report   \n",
-       "10  29964f32c68b0ce8  BSI-DSZ-CC-0519-V3-2021   report   \n",
-       "11  29964f32c68b0ce8  BSI-DSZ-CC-0519-V3-2021   report   \n",
-       "12  29964f32c68b0ce8  BSI-DSZ-CC-0519-V3-2021   report   \n",
-       "22  c1d88ce9dadd7d2d     BSI-DSZ-CC-0312-2005   report   \n",
-       "23  c1d88ce9dadd7d2d     BSI-DSZ-CC-0312-2005   report   \n",
-       "28  238f8edc5eda1358     BSI-DSZ-CC-0222-2003   report   \n",
-       "29  238f8edc5eda1358     BSI-DSZ-CC-0222-2003   report   \n",
-       "31  a6fac58198296194     BSI-DSZ-CC-0555-2009   report   \n",
-       "32  a6fac58198296194     BSI-DSZ-CC-0555-2009   report   \n",
-       "33  a6fac58198296194     BSI-DSZ-CC-0555-2009   report   \n",
-       "40  0c7ef6c32cbdee47     BSI-DSZ-CC-1074-2019   report   \n",
-       "41  0c7ef6c32cbdee47     BSI-DSZ-CC-1074-2019   report   \n",
-       "42  0c7ef6c32cbdee47     BSI-DSZ-CC-1074-2019   report   \n",
-       "43  0c7ef6c32cbdee47         ANSSI-CC-2017/61   report   \n",
-       "44  0c7ef6c32cbdee47         ANSSI-CC-2017/61   report   \n",
-       "45  0c7ef6c32cbdee47         ANSSI-CC-2017/61   report   \n",
-       "46  0c7ef6c32cbdee47         ANSSI-CC-2017/61   report   \n",
-       "47  0c7ef6c32cbdee47         ANSSI-CC-2017/61   report   \n",
-       "\n",
-       "                       label  \\\n",
-       "9   BASIS_OF_RECERTIFICATION   \n",
-       "10  BASIS_OF_RECERTIFICATION   \n",
-       "11  BASIS_OF_RECERTIFICATION   \n",
-       "12  BASIS_OF_RECERTIFICATION   \n",
-       "22            COMPONENT_USED   \n",
-       "23            COMPONENT_USED   \n",
-       "28  BASIS_OF_RECERTIFICATION   \n",
-       "29  BASIS_OF_RECERTIFICATION   \n",
-       "31  BASIS_OF_RECERTIFICATION   \n",
-       "32  BASIS_OF_RECERTIFICATION   \n",
-       "33  BASIS_OF_RECERTIFICATION   \n",
-       "40                 BASIS_FOR   \n",
-       "41                 BASIS_FOR   \n",
-       "42                 BASIS_FOR   \n",
-       "43            COMPONENT_USED   \n",
-       "44            COMPONENT_USED   \n",
-       "45            COMPONENT_USED   \n",
-       "46            COMPONENT_USED   \n",
-       "47            COMPONENT_USED   \n",
+       "                dgst                  cert_id                     label  \\\n",
+       "1   0c7ef6c32cbdee47     BSI-DSZ-CC-1074-2019                 BASIS_FOR   \n",
+       "2   0e22fe4e4e58faf4  BSI-DSZ-CC-1052-V4-2021  BASIS_OF_RECERTIFICATION   \n",
+       "7   238f8edc5eda1358     BSI-DSZ-CC-0222-2003  BASIS_OF_RECERTIFICATION   \n",
+       "8   29964f32c68b0ce8  BSI-DSZ-CC-0519-V3-2021  BASIS_OF_RECERTIFICATION   \n",
+       "13  a6fac58198296194     BSI-DSZ-CC-0555-2009  BASIS_OF_RECERTIFICATION   \n",
        "\n",
        "                                            sentences  \\\n",
-       "9              basierend auf BSI-DSZ-CC-1052-V4-2021.   \n",
-       "10  Specific results from\\nthe evaluation process ...   \n",
-       "11  This is a re-certification based on BSI-DSZ-CC...   \n",
-       "12  As the evaluation work performed for this cert...   \n",
-       "22  [13] Certification Report BSI-DSZ-CC-0312-2005...   \n",
-       "23  P5CT072V0N refer to the certification report B...   \n",
-       "28  This is a re-\\ncertification based on BSI-DSZ-...   \n",
-       "29  This certification is a re-certification of BS...   \n",
-       "31  Specific results from the evaluation process\\n...   \n",
-       "32  As the evaluation work performed for this cert...   \n",
-       "33  This is a re-certification\\nbased on BSI-DSZ-C...   \n",
-       "40  The BAC+PACE configuration is subject of the s...   \n",
-       "41  The further security mechanism Basic Access Co...   \n",
-       "42  [14] Certification Report BSI-DSZ-CC-1074-2019...   \n",
-       "43  For details\\nconcerning the CC evaluation of t...   \n",
-       "44  For development and production sites regarding...   \n",
-       "45  Elixir-2 Project, Certification ID ANSSI-CC-20...   \n",
-       "46  ® X – Security Target for\\ncomposition; STMicr...   \n",
-       "47  [12] Rapport de certification ANSSI-CC-2017/61...   \n",
+       "1   [The BAC+PACE configuration is subject of the ...   \n",
+       "2            [basierend auf BSI-DSZ-CC-1052-V4-2021.]   \n",
+       "7   [This certification is a re-certification of B...   \n",
+       "8   [This is a re-certification based on BSI-DSZ-C...   \n",
+       "13  [Specific results from the evaluation process\\...   \n",
        "\n",
        "                                      y_proba          y_pred  correct  \n",
-       "9    [0.9006353496264655, 0.0993646503735345]  COMPONENT_USED    False  \n",
-       "10  [0.9018767488720597, 0.09812325112794029]  COMPONENT_USED    False  \n",
-       "11   [0.694838178998959, 0.30516182100104094]  COMPONENT_USED    False  \n",
-       "12   [0.8982407935537878, 0.1017592064462122]  COMPONENT_USED    False  \n",
-       "22   [0.8947294027870802, 0.1052705972129198]  COMPONENT_USED     True  \n",
-       "23   [0.900841506996804, 0.09915849300319597]  COMPONENT_USED     True  \n",
-       "28   [0.6922405130591214, 0.3077594869408786]  COMPONENT_USED    False  \n",
-       "29   [0.7146859285928153, 0.2853140714071847]  COMPONENT_USED    False  \n",
-       "31  [0.9003949217128049, 0.09960507828719516]  COMPONENT_USED    False  \n",
-       "32  [0.8962471709210816, 0.10375282907891832]  COMPONENT_USED    False  \n",
-       "33   [0.6928894855518116, 0.3071105144481884]  COMPONENT_USED    False  \n",
-       "40   [0.8997694315622796, 0.1002305684377204]  COMPONENT_USED    False  \n",
-       "41  [0.8923570789488144, 0.10764292105118559]  COMPONENT_USED    False  \n",
-       "42  [0.9007422537659316, 0.09925774623406838]  COMPONENT_USED    False  \n",
-       "43  [0.8933002244736725, 0.10669977552632742]  COMPONENT_USED     True  \n",
-       "44   [0.9000976545847758, 0.0999023454152242]  COMPONENT_USED     True  \n",
-       "45  [0.8997425549440855, 0.10025744505591448]  COMPONENT_USED     True  \n",
-       "46  [0.8976230487254611, 0.10237695127453893]  COMPONENT_USED     True  \n",
-       "47  [0.8939784357781945, 0.10602156422180552]  COMPONENT_USED     True  "
-      ]
-     },
-     "execution_count": 36,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_valid"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "first = np.array([0.1,0.2,0.7])\n",
-    "second = np.array([0.2, 0.4, 0.4])\n",
-    "third = np.array([0.5, 0.5, 0])\n",
-    "preds = np.array([first, second, third])\n",
-    "\n",
-    "preds = np.power(preds, 2)\n",
-    "preds = preds.sum(axis=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([0.3 , 0.45, 0.65])"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "preds"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "2"
+       "1   [0.9330686268852108, 0.06693137311478929]  COMPONENT_USED    False  \n",
+       "2    [0.7070543956916182, 0.2929456043083818]  COMPONENT_USED    False  \n",
+       "7    [0.5998535578550636, 0.4001464421449364]  COMPONENT_USED    False  \n",
+       "8   [0.8727371952470533, 0.12726280475294682]  COMPONENT_USED    False  \n",
+       "13  [0.8670438280210987, 0.13295617197890133]  COMPONENT_USED    False  "
       ]
      },
-     "execution_count": 12,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "np.argmax(preds)"
+    "df_valid.loc[~df_valid.correct]"
    ]
   }
  ],
@@ -811,9 +481,10 @@
    "pygments_lexer": "ipython3",
    "version": "3.8.13"
   },
+  "orig_nbformat": 4,
   "vscode": {
    "interpreter": {
-    "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4"
+    "hash": "a2ed43df31f510d0b358bd0625493376557b0c4d37aa99c09b398809f951b6a5"
    }
   }
  },
author	adamjanovsky	2023-02-17 14:11:57 +0100
committer	adamjanovsky	2023-02-17 14:11:57 +0100
commit	76d032e53e4e84c1c0036daaadeab2821be0730a (patch)
tree	74e4c08f6022de4404563216c461b8d705df1037 /notebooks/cc/reference_annotations/prediction.ipynb
parent	7508e2b03444c70e44c0597ed333a074917024f7 (diff)
download	sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.gz sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.zst sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.zip