diff options
| author | adamjanovsky | 2023-02-17 14:11:57 +0100 |
|---|---|---|
| committer | adamjanovsky | 2023-02-17 14:11:57 +0100 |
| commit | 76d032e53e4e84c1c0036daaadeab2821be0730a (patch) | |
| tree | 74e4c08f6022de4404563216c461b8d705df1037 /notebooks/cc/reference_annotations/prediction.ipynb | |
| parent | 7508e2b03444c70e44c0597ed333a074917024f7 (diff) | |
| download | sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.gz sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.zst sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.zip | |
more work on PoC reference annotator
Diffstat (limited to 'notebooks/cc/reference_annotations/prediction.ipynb')
| -rw-r--r-- | notebooks/cc/reference_annotations/prediction.ipynb | 719 |
1 files changed, 195 insertions, 524 deletions
diff --git a/notebooks/cc/reference_annotations/prediction.ipynb b/notebooks/cc/reference_annotations/prediction.ipynb index 435fe684..29b63523 100644 --- a/notebooks/cc/reference_annotations/prediction.ipynb +++ b/notebooks/cc/reference_annotations/prediction.ipynb @@ -1,8 +1,20 @@ { "cells": [ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prediction of reference annotations in CC Certificates (Raw)\n", + "\n", + "This notebook:\n", + "- loads dataframe of a dataset with `(dgst, cert_id, sentences, label)`\n", + "- Trains a model to classify the sentences related to certificate reference to their common sentiment (meaning of reference)" + ] + }, + { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -14,7 +26,7 @@ } ], "source": [ - "# It is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook\n", + "# When on Aura, it is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook\n", "# For available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs\n", "\n", "import os\n", @@ -22,140 +34,38 @@ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"MIG-a5459e6a-b26d-5985-874c-528458a7728b\"\n", "print(os.getenv(\"CUDA_VISIBLE_DEVICES\"))\n", "\n", - "# import spacy\n", - "# from spacy_cld import LanguageDetector\n", - "\n", - "# nlp = spacy.load(\"en_core_web_sm\")\n", - "# language_detector = LanguageDetector()\n", - "# nlp.add_pipe(language_detector)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# installed packages: setfit\n", - "\n", "import pandas as pd\n", - "from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence\n", - "from sentence_transformers.losses import CosineSimilarityLoss\n", + "from sec_certs.utils.nlp import prec_recall_metric\n", "from ast import literal_eval\n", - "from setfit import SetFitModel, SetFitTrainer, sample_dataset\n", - "\n", - "from sklearn.metrics import precision_score, recall_score\n", "from pathlib import Path\n", + "from sec_certs.model.reference_classification import ReferenceClassifierTrainer\n", + "import numpy as np\n", "\n", "REPO_ROOT = Path(\"../../../\").resolve()\n", "\n", - "def prec_recall_metric(y_pred, y_test):\n", - " return {\"precision\": precision_score(y_test, y_pred, zero_division=\"warn\", average=\"micro\"), \"recall\": recall_score(y_test, y_pred, zero_division=\"warn\", average=\"micro\")}\n", "\n", - "def predict_and_fill_df(model, df, train_dataset) -> pd.DataFrame:\n", - " label_mapping = {index: x for index, x in enumerate(train_dataset.features[\"label\"].names)}\n", + "def predict_and_fill_df(clf, df, label_mapping):\n", + " \"\"\"\n", + " Given the classifier, dataframe and label mapping, will populate dataframe with predictions for simple inspection.\n", + " \"\"\"\n", " df_new = df.copy()\n", - "\n", - " y_train_proba = model.predict_proba(df_new.sentences.tolist())\n", - " df_new[\"y_proba\"] = y_train_proba.tolist()\n", - " df_new[\"y_pred\"] = df_new.y_proba.map(lambda x: label_mapping[x.index(max(x))])\n", - " df_new[\"correct\"] = df_new.y_pred == df_new.label\n", - " \n", - " return df_new\n" + " y_proba = clf.predict_proba(df_new.sentences)\n", + " df_new[\"y_proba\"] = y_proba\n", + " df_new[\"y_pred\"] = df_new.y_proba.map(lambda x: label_mapping[np.argmax(x)])\n", + " df_new[\"correct\"] = df_new.label == df_new.y_pred\n", + " return df_new" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# EXPLODED variant\n", - "# # Prepare dataset\n", - "\n", - "# df = pd.read_csv(REPO_ROOT / \"datasets/reference_classification_dataset_exploded.csv\", sep=\";\")\n", - "# df = df.loc[(df.label.notnull()) & ((df.location == \"report\"))]\n", - "\n", - "# # # Get language of the sentence, quite unreliable for the moment\n", - "# # df[\"lang\"] = df.sentence.map(lambda x: nlp(x)._.languages)\n", - "# # df[\"is_en\"] = df.lang.map(lambda x: x == [\"en\"])\n", - "\n", - "# # # Take suitable subset of the dataframe\n", - "# # df = df.loc[df.is_en] # only english\n", - "\n", - "# df = df.loc[df.label.isin({\"COMPONENT_USED\", \"BASIS_OF_RECERTIFICATION\", \"BASIS_FOR\"})] # only the most popular labels\n", - "\n", - "# # Split into train/valid\n", - "# df_train = df.loc[df.split == \"train\"].drop(columns=\"split\")\n", - "# df_valid = df.loc[df.split == \"valid\"].drop(columns=\"split\")\n", - "\n", - "# dataset_features = Features(\n", - "# {\n", - "# \"dgst\": Value(\"string\"),\n", - "# \"cert_id\": Value(\"string\"),\n", - "# \"location\": Value(\"string\"),\n", - "# \"sentences\": Value(\"string\"),\n", - "# \"label\": ClassLabel(names=list(df.label.unique())),\n", - "# }\n", - "# )\n", - "# train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split=\"train\", preserve_index=False)\n", - "# valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split=\"validation\", preserve_index=False)\n", - "\n", - "# dataset = DatasetDict()\n", - "# dataset['train'] = train_dataset\n", - "# dataset['validation'] = valid_dataset\n", - "\n", - "# train_dataset = sample_dataset(dataset[\"train\"], label_column=\"label\", num_samples=10)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "099029c554974aa1abdd9660eb8e2ece", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00<?, ?ba/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4b605cedf332432e8c0e055b1eb4aa54", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00<?, ?ba/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# GROUPED variant\n", "# Prepare dataset\n", "\n", "df = pd.read_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merged.csv\", sep=\";\")\n", "df = df.loc[(df.label.notnull())]\n", - "\n", - "# # Get language of the sentence, quite unreliable for the moment\n", - "# df[\"lang\"] = df.sentence.map(lambda x: nlp(x)._.languages)\n", - "# df[\"is_en\"] = df.lang.map(lambda x: x == [\"en\"])\n", - "\n", - "# # Take suitable subset of the dataframe\n", - "# df = df.loc[df.is_en] # only english\n", - "\n", "df = df.loc[df.label.isin({\"COMPONENT_USED\", \"BASIS_OF_RECERTIFICATION\", \"BASIS_FOR\"})] # only the most popular labels\n", "df.sentences = df.sentences.map(lambda x: list(literal_eval(x)))\n", "\n", @@ -163,27 +73,13 @@ "df_train = df.loc[df.split == \"train\"].drop(columns=\"split\")\n", "df_valid = df.loc[df.split == \"valid\"].drop(columns=\"split\")\n", "\n", - "dataset_features = Features(\n", - " {\n", - " \"dgst\": Value(\"string\"),\n", - " \"cert_id\": Value(\"string\"),\n", - " \"sentences\": Sequence(feature=Value(\"string\")),\n", - " \"label\": ClassLabel(names=list(df.label.unique())),\n", - " }\n", - ")\n", - "train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split=\"train\", preserve_index=False)\n", - "valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split=\"validation\", preserve_index=False)\n", - "\n", - "dataset = DatasetDict()\n", - "dataset['train'] = train_dataset\n", - "dataset['validation'] = valid_dataset\n", - "\n", - "train_dataset = sample_dataset(dataset[\"train\"], label_column=\"label\", num_samples=10)\n" + "# Use just few examples for learning\n", + "df_train = df_train.sample(n=10)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -210,83 +106,77 @@ " <th>dgst</th>\n", " <th>cert_id</th>\n", " <th>label</th>\n", - " <th>split</th>\n", " <th>sentences</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>0</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>ANSSI-CC-2017/61</td>\n", + " <th>12</th>\n", + " <td>99223aca5d9eb3b3</td>\n", + " <td>DCSSI-2009/11</td>\n", " <td>COMPONENT_USED</td>\n", - " <td>valid</td>\n", - " <td>{'Elixir-2 Project, Certification ID ANSSI-CC-...</td>\n", + " <td>[Toolbox Certificate DCSSI-2009/11\\nTable 1:]</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>BSI-DSZ-CC-1074-2019</td>\n", - " <td>BASIS_FOR</td>\n", - " <td>valid</td>\n", - " <td>{'The BAC+PACE configuration is subject of the...</td>\n", + " <th>4</th>\n", + " <td>0f3900cdcd0c7f3e</td>\n", + " <td>BSI-DSZ-CC-1072-V4-2021-MA-01</td>\n", + " <td>COMPONENT_USED</td>\n", + " <td>[Certification Report NXP Secure Smart Card Co...</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", - " <td>0e22fe4e4e58faf4</td>\n", - " <td>BSI-DSZ-CC-1052-V4-2021</td>\n", + " <th>9</th>\n", + " <td>6d6ade44dcc497dd</td>\n", + " <td>BSI-DSZ-CC-0227-2004</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>valid</td>\n", - " <td>{'basierend auf BSI-DSZ-CC-1052-V4-2021.'}</td>\n", + " <td>[This is a\\nre-certification based on BSI-DSZ-...</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>5</th>\n", " <td>0f3900cdcd0c7f3e</td>\n", - " <td>BSI-DSZ-CC-1072-V4-2021</td>\n", + " <td>NSCIB-CC-66030-CR5</td>\n", " <td>COMPONENT_USED</td>\n", - " <td>train</td>\n", - " <td>{'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra...</td>\n", + " <td>[certificate identification NSCIB-CC-66030-CR5...</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", - " <td>0f3900cdcd0c7f3e</td>\n", - " <td>BSI-DSZ-CC-1072-V4-2021-MA-01</td>\n", + " <th>6</th>\n", + " <td>1fb1564dfb0f0b04</td>\n", + " <td>ANSSI-CC-2020/34</td>\n", " <td>COMPONENT_USED</td>\n", - " <td>train</td>\n", - " <td>{'Certification Report NXP Secure Smart Card C...</td>\n", + " <td>[[CER_IC] Rapport de certification ANSSI-CC-20...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " dgst cert_id label \\\n", - "0 0c7ef6c32cbdee47 ANSSI-CC-2017/61 COMPONENT_USED \n", - "1 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 BASIS_FOR \n", - "2 0e22fe4e4e58faf4 BSI-DSZ-CC-1052-V4-2021 BASIS_OF_RECERTIFICATION \n", - "3 0f3900cdcd0c7f3e BSI-DSZ-CC-1072-V4-2021 COMPONENT_USED \n", - "4 0f3900cdcd0c7f3e BSI-DSZ-CC-1072-V4-2021-MA-01 COMPONENT_USED \n", + " dgst cert_id label \\\n", + "12 99223aca5d9eb3b3 DCSSI-2009/11 COMPONENT_USED \n", + "4 0f3900cdcd0c7f3e BSI-DSZ-CC-1072-V4-2021-MA-01 COMPONENT_USED \n", + "9 6d6ade44dcc497dd BSI-DSZ-CC-0227-2004 BASIS_OF_RECERTIFICATION \n", + "5 0f3900cdcd0c7f3e NSCIB-CC-66030-CR5 COMPONENT_USED \n", + "6 1fb1564dfb0f0b04 ANSSI-CC-2020/34 COMPONENT_USED \n", "\n", - " split sentences \n", - "0 valid {'Elixir-2 Project, Certification ID ANSSI-CC-... \n", - "1 valid {'The BAC+PACE configuration is subject of the... \n", - "2 valid {'basierend auf BSI-DSZ-CC-1052-V4-2021.'} \n", - "3 train {'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra... \n", - "4 train {'Certification Report NXP Secure Smart Card C... " + " sentences \n", + "12 [Toolbox Certificate DCSSI-2009/11\\nTable 1:] \n", + "4 [Certification Report NXP Secure Smart Card Co... \n", + "9 [This is a\\nre-certification based on BSI-DSZ-... \n", + "5 [certificate identification NSCIB-CC-66030-CR5... \n", + "6 [[CER_IC] Rapport de certification ANSSI-CC-20... " ] }, - "execution_count": 12, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.head()" + "df_train.head()" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -297,16 +187,16 @@ "model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.\n", "Applying column mapping to training dataset\n", "***** Running training *****\n", - " Num examples = 640\n", + " Num examples = 1760\n", " Num epochs = 1\n", - " Total optimization steps = 40\n", + " Total optimization steps = 110\n", " Total train batch size = 16\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a7118a8525ee47f78f29b74fb5e62117", + "model_id": "96e469ad1f984bf6ba2c819884a1c231", "version_major": 2, "version_minor": 0 }, @@ -320,12 +210,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "96f5685713554e05af4ff7b7ec4e2f31", + "model_id": "a9ef28c8c0314e7f831e6e35c2af75db", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Iteration: 0%| | 0/40 [00:00<?, ?it/s]" + "Iteration: 0%| | 0/110 [00:00<?, ?it/s]" ] }, "metadata": {}, @@ -343,49 +233,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'precision': 0.3684210526315789, 'recall': 0.3684210526315789}\n" + "Internal evaluation (of model working on individual sentences)\n", + "{'precision': 0.45454545454545453, 'recall': 0.45454545454545453}\n", + "Actual evaluation after ensemble soft voting\n", + "{'precision': 0.2857142857142857, 'recall': 0.2857142857142857}\n" ] } ], "source": [ - "# Load a SetFit model from Hub\n", - "model = SetFitModel.from_pretrained(\"all-mpnet-base-v2\")\n", - "\n", - "# Create trainer\n", - "trainer = SetFitTrainer(\n", - " model=model,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=valid_dataset,\n", - " loss_class=CosineSimilarityLoss,\n", - " metric=prec_recall_metric,\n", - " batch_size=16,\n", - " num_iterations=40, # The number of text pairs to generate for contrastive learning\n", - " num_epochs=1, # The number of epochs to use for contrastive learning\n", - " column_mapping={\"sentences\": \"text\", \"label\": \"label\"} # Map dataset columns to text/label expected by trainer\n", - ")\n", - "# trainer.unfreeze(keep_body_frozen=False)\n", - "\n", - "trainer.train(show_progress_bar=True)\n", - "metrics = trainer.evaluate()\n", - "print(metrics)" + "trainer = ReferenceClassifierTrainer(df_train, df_valid, prec_recall_metric, \"transformer\")\n", + "trainer.train()\n", + "trainer.evaluate()" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "# Can be validated with train_dataset.features[\"label\"].int2str(index) function\n", - "label_mapping = {index: x for index, x in enumerate(train_dataset.features[\"label\"].names)}\n", - "\n", - "df_train = predict_and_fill_df(model, df_train, train_dataset)\n", - "df_valid = predict_and_fill_df(model, df_valid, train_dataset)" + "# Take a look at misclassified instances\n", + "df_train = predict_and_fill_df(trainer.clf, df_train, trainer.label_mapping)\n", + "df_valid = predict_and_fill_df(trainer.clf, df_valid, trainer.label_mapping)" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -411,7 +285,6 @@ " <th></th>\n", " <th>dgst</th>\n", " <th>cert_id</th>\n", - " <th>location</th>\n", " <th>label</th>\n", " <th>sentences</th>\n", " <th>y_proba</th>\n", @@ -422,374 +295,171 @@ " <tbody>\n", " <tr>\n", " <th>9</th>\n", - " <td>0e22fe4e4e58faf4</td>\n", - " <td>BSI-DSZ-CC-1052-V4-2021</td>\n", - " <td>report</td>\n", + " <td>6d6ade44dcc497dd</td>\n", + " <td>BSI-DSZ-CC-0227-2004</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>basierend auf BSI-DSZ-CC-1052-V4-2021.</td>\n", - " <td>[0.9006353496264655, 0.0993646503735345]</td>\n", + " <td>[This is a\\nre-certification based on BSI-DSZ-...</td>\n", + " <td>[0.5461188093773812, 0.45388119062261884]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", - " <th>10</th>\n", - " <td>29964f32c68b0ce8</td>\n", - " <td>BSI-DSZ-CC-0519-V3-2021</td>\n", - " <td>report</td>\n", + " <th>19</th>\n", + " <td>ca5da2fe138af656</td>\n", + " <td>BSI-DSZ-CC-0413-2007</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>Specific results from\\nthe evaluation process ...</td>\n", - " <td>[0.9018767488720597, 0.09812325112794029]</td>\n", + " <td>[This is a re-certification based on\\nBSI-DSZ-...</td>\n", + " <td>[0.5465589745598575, 0.4534410254401425]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", - " <tr>\n", - " <th>11</th>\n", - " <td>29964f32c68b0ce8</td>\n", - " <td>BSI-DSZ-CC-0519-V3-2021</td>\n", - " <td>report</td>\n", - " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>This is a re-certification based on BSI-DSZ-CC...</td>\n", - " <td>[0.694838178998959, 0.30516182100104094]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>False</td>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " dgst cert_id label \\\n", + "9 6d6ade44dcc497dd BSI-DSZ-CC-0227-2004 BASIS_OF_RECERTIFICATION \n", + "19 ca5da2fe138af656 BSI-DSZ-CC-0413-2007 BASIS_OF_RECERTIFICATION \n", + "\n", + " sentences \\\n", + "9 [This is a\\nre-certification based on BSI-DSZ-... \n", + "19 [This is a re-certification based on\\nBSI-DSZ-... \n", + "\n", + " y_proba y_pred correct \n", + "9 [0.5461188093773812, 0.45388119062261884] COMPONENT_USED False \n", + "19 [0.5465589745598575, 0.4534410254401425] COMPONENT_USED False " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.loc[~df_train.correct]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>dgst</th>\n", + " <th>cert_id</th>\n", + " <th>label</th>\n", + " <th>sentences</th>\n", + " <th>y_proba</th>\n", + " <th>y_pred</th>\n", + " <th>correct</th>\n", " </tr>\n", + " </thead>\n", + " <tbody>\n", " <tr>\n", - " <th>12</th>\n", - " <td>29964f32c68b0ce8</td>\n", - " <td>BSI-DSZ-CC-0519-V3-2021</td>\n", - " <td>report</td>\n", - " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>As the evaluation work performed for this cert...</td>\n", - " <td>[0.8982407935537878, 0.1017592064462122]</td>\n", + " <th>1</th>\n", + " <td>0c7ef6c32cbdee47</td>\n", + " <td>BSI-DSZ-CC-1074-2019</td>\n", + " <td>BASIS_FOR</td>\n", + " <td>[The BAC+PACE configuration is subject of the ...</td>\n", + " <td>[0.9330686268852108, 0.06693137311478929]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", - " <th>22</th>\n", - " <td>c1d88ce9dadd7d2d</td>\n", - " <td>BSI-DSZ-CC-0312-2005</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>[13] Certification Report BSI-DSZ-CC-0312-2005...</td>\n", - " <td>[0.8947294027870802, 0.1052705972129198]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>23</th>\n", - " <td>c1d88ce9dadd7d2d</td>\n", - " <td>BSI-DSZ-CC-0312-2005</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>P5CT072V0N refer to the certification report B...</td>\n", - " <td>[0.900841506996804, 0.09915849300319597]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>28</th>\n", - " <td>238f8edc5eda1358</td>\n", - " <td>BSI-DSZ-CC-0222-2003</td>\n", - " <td>report</td>\n", + " <th>2</th>\n", + " <td>0e22fe4e4e58faf4</td>\n", + " <td>BSI-DSZ-CC-1052-V4-2021</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>This is a re-\\ncertification based on BSI-DSZ-...</td>\n", - " <td>[0.6922405130591214, 0.3077594869408786]</td>\n", + " <td>[basierend auf BSI-DSZ-CC-1052-V4-2021.]</td>\n", + " <td>[0.7070543956916182, 0.2929456043083818]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", - " <th>29</th>\n", + " <th>7</th>\n", " <td>238f8edc5eda1358</td>\n", " <td>BSI-DSZ-CC-0222-2003</td>\n", - " <td>report</td>\n", - " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>This certification is a re-certification of BS...</td>\n", - " <td>[0.7146859285928153, 0.2853140714071847]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>31</th>\n", - " <td>a6fac58198296194</td>\n", - " <td>BSI-DSZ-CC-0555-2009</td>\n", - " <td>report</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>Specific results from the evaluation process\\n...</td>\n", - " <td>[0.9003949217128049, 0.09960507828719516]</td>\n", + " <td>[This certification is a re-certification of B...</td>\n", + " <td>[0.5998535578550636, 0.4001464421449364]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", - " <th>32</th>\n", - " <td>a6fac58198296194</td>\n", - " <td>BSI-DSZ-CC-0555-2009</td>\n", - " <td>report</td>\n", + " <th>8</th>\n", + " <td>29964f32c68b0ce8</td>\n", + " <td>BSI-DSZ-CC-0519-V3-2021</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>As the evaluation work performed for this cert...</td>\n", - " <td>[0.8962471709210816, 0.10375282907891832]</td>\n", + " <td>[This is a re-certification based on BSI-DSZ-C...</td>\n", + " <td>[0.8727371952470533, 0.12726280475294682]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", - " <th>33</th>\n", + " <th>13</th>\n", " <td>a6fac58198296194</td>\n", " <td>BSI-DSZ-CC-0555-2009</td>\n", - " <td>report</td>\n", " <td>BASIS_OF_RECERTIFICATION</td>\n", - " <td>This is a re-certification\\nbased on BSI-DSZ-C...</td>\n", - " <td>[0.6928894855518116, 0.3071105144481884]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>40</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>BSI-DSZ-CC-1074-2019</td>\n", - " <td>report</td>\n", - " <td>BASIS_FOR</td>\n", - " <td>The BAC+PACE configuration is subject of the s...</td>\n", - " <td>[0.8997694315622796, 0.1002305684377204]</td>\n", + " <td>[Specific results from the evaluation process\\...</td>\n", + " <td>[0.8670438280210987, 0.13295617197890133]</td>\n", " <td>COMPONENT_USED</td>\n", " <td>False</td>\n", " </tr>\n", - " <tr>\n", - " <th>41</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>BSI-DSZ-CC-1074-2019</td>\n", - " <td>report</td>\n", - " <td>BASIS_FOR</td>\n", - " <td>The further security mechanism Basic Access Co...</td>\n", - " <td>[0.8923570789488144, 0.10764292105118559]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>42</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>BSI-DSZ-CC-1074-2019</td>\n", - " <td>report</td>\n", - " <td>BASIS_FOR</td>\n", - " <td>[14] Certification Report BSI-DSZ-CC-1074-2019...</td>\n", - " <td>[0.9007422537659316, 0.09925774623406838]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>43</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>ANSSI-CC-2017/61</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>For details\\nconcerning the CC evaluation of t...</td>\n", - " <td>[0.8933002244736725, 0.10669977552632742]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>44</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>ANSSI-CC-2017/61</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>For development and production sites regarding...</td>\n", - " <td>[0.9000976545847758, 0.0999023454152242]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>45</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>ANSSI-CC-2017/61</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>Elixir-2 Project, Certification ID ANSSI-CC-20...</td>\n", - " <td>[0.8997425549440855, 0.10025744505591448]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>46</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>ANSSI-CC-2017/61</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>® X – Security Target for\\ncomposition; STMicr...</td>\n", - " <td>[0.8976230487254611, 0.10237695127453893]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>47</th>\n", - " <td>0c7ef6c32cbdee47</td>\n", - " <td>ANSSI-CC-2017/61</td>\n", - " <td>report</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>[12] Rapport de certification ANSSI-CC-2017/61...</td>\n", - " <td>[0.8939784357781945, 0.10602156422180552]</td>\n", - " <td>COMPONENT_USED</td>\n", - " <td>True</td>\n", - " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " dgst cert_id location \\\n", - "9 0e22fe4e4e58faf4 BSI-DSZ-CC-1052-V4-2021 report \n", - "10 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 report \n", - "11 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 report \n", - "12 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 report \n", - "22 c1d88ce9dadd7d2d BSI-DSZ-CC-0312-2005 report \n", - "23 c1d88ce9dadd7d2d BSI-DSZ-CC-0312-2005 report \n", - "28 238f8edc5eda1358 BSI-DSZ-CC-0222-2003 report \n", - "29 238f8edc5eda1358 BSI-DSZ-CC-0222-2003 report \n", - "31 a6fac58198296194 BSI-DSZ-CC-0555-2009 report \n", - "32 a6fac58198296194 BSI-DSZ-CC-0555-2009 report \n", - "33 a6fac58198296194 BSI-DSZ-CC-0555-2009 report \n", - "40 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 report \n", - "41 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 report \n", - "42 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 report \n", - "43 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n", - "44 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n", - "45 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n", - "46 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n", - "47 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n", - "\n", - " label \\\n", - "9 BASIS_OF_RECERTIFICATION \n", - "10 BASIS_OF_RECERTIFICATION \n", - "11 BASIS_OF_RECERTIFICATION \n", - "12 BASIS_OF_RECERTIFICATION \n", - "22 COMPONENT_USED \n", - "23 COMPONENT_USED \n", - "28 BASIS_OF_RECERTIFICATION \n", - "29 BASIS_OF_RECERTIFICATION \n", - "31 BASIS_OF_RECERTIFICATION \n", - "32 BASIS_OF_RECERTIFICATION \n", - "33 BASIS_OF_RECERTIFICATION \n", - "40 BASIS_FOR \n", - "41 BASIS_FOR \n", - "42 BASIS_FOR \n", - "43 COMPONENT_USED \n", - "44 COMPONENT_USED \n", - "45 COMPONENT_USED \n", - "46 COMPONENT_USED \n", - "47 COMPONENT_USED \n", + " dgst cert_id label \\\n", + "1 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 BASIS_FOR \n", + "2 0e22fe4e4e58faf4 BSI-DSZ-CC-1052-V4-2021 BASIS_OF_RECERTIFICATION \n", + "7 238f8edc5eda1358 BSI-DSZ-CC-0222-2003 BASIS_OF_RECERTIFICATION \n", + "8 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 BASIS_OF_RECERTIFICATION \n", + "13 a6fac58198296194 BSI-DSZ-CC-0555-2009 BASIS_OF_RECERTIFICATION \n", "\n", " sentences \\\n", - "9 basierend auf BSI-DSZ-CC-1052-V4-2021. \n", - "10 Specific results from\\nthe evaluation process ... \n", - "11 This is a re-certification based on BSI-DSZ-CC... \n", - "12 As the evaluation work performed for this cert... \n", - "22 [13] Certification Report BSI-DSZ-CC-0312-2005... \n", - "23 P5CT072V0N refer to the certification report B... \n", - "28 This is a re-\\ncertification based on BSI-DSZ-... \n", - "29 This certification is a re-certification of BS... \n", - "31 Specific results from the evaluation process\\n... \n", - "32 As the evaluation work performed for this cert... \n", - "33 This is a re-certification\\nbased on BSI-DSZ-C... \n", - "40 The BAC+PACE configuration is subject of the s... \n", - "41 The further security mechanism Basic Access Co... \n", - "42 [14] Certification Report BSI-DSZ-CC-1074-2019... \n", - "43 For details\\nconcerning the CC evaluation of t... \n", - "44 For development and production sites regarding... \n", - "45 Elixir-2 Project, Certification ID ANSSI-CC-20... \n", - "46 ® X – Security Target for\\ncomposition; STMicr... \n", - "47 [12] Rapport de certification ANSSI-CC-2017/61... \n", + "1 [The BAC+PACE configuration is subject of the ... \n", + "2 [basierend auf BSI-DSZ-CC-1052-V4-2021.] \n", + "7 [This certification is a re-certification of B... \n", + "8 [This is a re-certification based on BSI-DSZ-C... \n", + "13 [Specific results from the evaluation process\\... \n", "\n", " y_proba y_pred correct \n", - "9 [0.9006353496264655, 0.0993646503735345] COMPONENT_USED False \n", - "10 [0.9018767488720597, 0.09812325112794029] COMPONENT_USED False \n", - "11 [0.694838178998959, 0.30516182100104094] COMPONENT_USED False \n", - "12 [0.8982407935537878, 0.1017592064462122] COMPONENT_USED False \n", - "22 [0.8947294027870802, 0.1052705972129198] COMPONENT_USED True \n", - "23 [0.900841506996804, 0.09915849300319597] COMPONENT_USED True \n", - "28 [0.6922405130591214, 0.3077594869408786] COMPONENT_USED False \n", - "29 [0.7146859285928153, 0.2853140714071847] COMPONENT_USED False \n", - "31 [0.9003949217128049, 0.09960507828719516] COMPONENT_USED False \n", - "32 [0.8962471709210816, 0.10375282907891832] COMPONENT_USED False \n", - "33 [0.6928894855518116, 0.3071105144481884] COMPONENT_USED False \n", - "40 [0.8997694315622796, 0.1002305684377204] COMPONENT_USED False \n", - "41 [0.8923570789488144, 0.10764292105118559] COMPONENT_USED False \n", - "42 [0.9007422537659316, 0.09925774623406838] COMPONENT_USED False \n", - "43 [0.8933002244736725, 0.10669977552632742] COMPONENT_USED True \n", - "44 [0.9000976545847758, 0.0999023454152242] COMPONENT_USED True \n", - "45 [0.8997425549440855, 0.10025744505591448] COMPONENT_USED True \n", - "46 [0.8976230487254611, 0.10237695127453893] COMPONENT_USED True \n", - "47 [0.8939784357781945, 0.10602156422180552] COMPONENT_USED True " - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_valid" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "first = np.array([0.1,0.2,0.7])\n", - "second = np.array([0.2, 0.4, 0.4])\n", - "third = np.array([0.5, 0.5, 0])\n", - "preds = np.array([first, second, third])\n", - "\n", - "preds = np.power(preds, 2)\n", - "preds = preds.sum(axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0.3 , 0.45, 0.65])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" + "1 [0.9330686268852108, 0.06693137311478929] COMPONENT_USED False \n", + "2 [0.7070543956916182, 0.2929456043083818] COMPONENT_USED False \n", + "7 [0.5998535578550636, 0.4001464421449364] COMPONENT_USED False \n", + "8 [0.8727371952470533, 0.12726280475294682] COMPONENT_USED False \n", + "13 [0.8670438280210987, 0.13295617197890133] COMPONENT_USED False " ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "np.argmax(preds)" + "df_valid.loc[~df_valid.correct]" ] } ], @@ -811,9 +481,10 @@ "pygments_lexer": "ipython3", "version": "3.8.13" }, + "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4" + "hash": "a2ed43df31f510d0b358bd0625493376557b0c4d37aa99c09b398809f951b6a5" } } }, |
