aboutsummaryrefslogtreecommitdiffhomepage
path: root/notebooks/cc/reference_annotations/prediction.ipynb
diff options
context:
space:
mode:
authoradamjanovsky2023-02-17 14:11:57 +0100
committeradamjanovsky2023-02-17 14:11:57 +0100
commit76d032e53e4e84c1c0036daaadeab2821be0730a (patch)
tree74e4c08f6022de4404563216c461b8d705df1037 /notebooks/cc/reference_annotations/prediction.ipynb
parent7508e2b03444c70e44c0597ed333a074917024f7 (diff)
downloadsec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.gz
sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.zst
sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.zip
more work on PoC reference annotator
Diffstat (limited to 'notebooks/cc/reference_annotations/prediction.ipynb')
-rw-r--r--notebooks/cc/reference_annotations/prediction.ipynb719
1 files changed, 195 insertions, 524 deletions
diff --git a/notebooks/cc/reference_annotations/prediction.ipynb b/notebooks/cc/reference_annotations/prediction.ipynb
index 435fe684..29b63523 100644
--- a/notebooks/cc/reference_annotations/prediction.ipynb
+++ b/notebooks/cc/reference_annotations/prediction.ipynb
@@ -1,8 +1,20 @@
{
"cells": [
{
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prediction of reference annotations in CC Certificates (Raw)\n",
+ "\n",
+ "This notebook:\n",
+ "- Loads a dataframe of the dataset with columns `(dgst, cert_id, sentences, label)`\n",
+ "- Trains a model that classifies the sentences surrounding a certificate reference by their common sentiment (i.e., the meaning of the reference)"
+ ]
+ },
+ {
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -14,7 +26,7 @@
}
],
"source": [
- "# It is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook\n",
+ "# When on Aura, it is important to first set the CUDA_VISIBLE_DEVICES environment variable directly from the notebook\n",
"# For available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs\n",
"\n",
"import os\n",
@@ -22,140 +34,38 @@
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"MIG-a5459e6a-b26d-5985-874c-528458a7728b\"\n",
"print(os.getenv(\"CUDA_VISIBLE_DEVICES\"))\n",
"\n",
- "# import spacy\n",
- "# from spacy_cld import LanguageDetector\n",
- "\n",
- "# nlp = spacy.load(\"en_core_web_sm\")\n",
- "# language_detector = LanguageDetector()\n",
- "# nlp.add_pipe(language_detector)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "# installed packages: setfit\n",
- "\n",
"import pandas as pd\n",
- "from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence\n",
- "from sentence_transformers.losses import CosineSimilarityLoss\n",
+ "from sec_certs.utils.nlp import prec_recall_metric\n",
"from ast import literal_eval\n",
- "from setfit import SetFitModel, SetFitTrainer, sample_dataset\n",
- "\n",
- "from sklearn.metrics import precision_score, recall_score\n",
"from pathlib import Path\n",
+ "from sec_certs.model.reference_classification import ReferenceClassifierTrainer\n",
+ "import numpy as np\n",
"\n",
"REPO_ROOT = Path(\"../../../\").resolve()\n",
"\n",
- "def prec_recall_metric(y_pred, y_test):\n",
- " return {\"precision\": precision_score(y_test, y_pred, zero_division=\"warn\", average=\"micro\"), \"recall\": recall_score(y_test, y_pred, zero_division=\"warn\", average=\"micro\")}\n",
"\n",
- "def predict_and_fill_df(model, df, train_dataset) -> pd.DataFrame:\n",
- " label_mapping = {index: x for index, x in enumerate(train_dataset.features[\"label\"].names)}\n",
+ "def predict_and_fill_df(clf, df, label_mapping):\n",
+ " \"\"\"\n",
+ " Given a classifier, a dataframe and a label mapping, returns a copy of the dataframe populated with predictions (columns `y_proba`, `y_pred` and `correct`) for simple inspection.\n",
+ " \"\"\"\n",
" df_new = df.copy()\n",
- "\n",
- " y_train_proba = model.predict_proba(df_new.sentences.tolist())\n",
- " df_new[\"y_proba\"] = y_train_proba.tolist()\n",
- " df_new[\"y_pred\"] = df_new.y_proba.map(lambda x: label_mapping[x.index(max(x))])\n",
- " df_new[\"correct\"] = df_new.y_pred == df_new.label\n",
- " \n",
- " return df_new\n"
+ " y_proba = clf.predict_proba(df_new.sentences)\n",
+ " df_new[\"y_proba\"] = y_proba\n",
+ " df_new[\"y_pred\"] = df_new.y_proba.map(lambda x: label_mapping[np.argmax(x)])\n",
+ " df_new[\"correct\"] = df_new.label == df_new.y_pred\n",
+ " return df_new"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "# EXPLODED variant\n",
- "# # Prepare dataset\n",
- "\n",
- "# df = pd.read_csv(REPO_ROOT / \"datasets/reference_classification_dataset_exploded.csv\", sep=\";\")\n",
- "# df = df.loc[(df.label.notnull()) & ((df.location == \"report\"))]\n",
- "\n",
- "# # # Get language of the sentence, quite unreliable for the moment\n",
- "# # df[\"lang\"] = df.sentence.map(lambda x: nlp(x)._.languages)\n",
- "# # df[\"is_en\"] = df.lang.map(lambda x: x == [\"en\"])\n",
- "\n",
- "# # # Take suitable subset of the dataframe\n",
- "# # df = df.loc[df.is_en] # only english\n",
- "\n",
- "# df = df.loc[df.label.isin({\"COMPONENT_USED\", \"BASIS_OF_RECERTIFICATION\", \"BASIS_FOR\"})] # only the most popular labels\n",
- "\n",
- "# # Split into train/valid\n",
- "# df_train = df.loc[df.split == \"train\"].drop(columns=\"split\")\n",
- "# df_valid = df.loc[df.split == \"valid\"].drop(columns=\"split\")\n",
- "\n",
- "# dataset_features = Features(\n",
- "# {\n",
- "# \"dgst\": Value(\"string\"),\n",
- "# \"cert_id\": Value(\"string\"),\n",
- "# \"location\": Value(\"string\"),\n",
- "# \"sentences\": Value(\"string\"),\n",
- "# \"label\": ClassLabel(names=list(df.label.unique())),\n",
- "# }\n",
- "# )\n",
- "# train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split=\"train\", preserve_index=False)\n",
- "# valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split=\"validation\", preserve_index=False)\n",
- "\n",
- "# dataset = DatasetDict()\n",
- "# dataset['train'] = train_dataset\n",
- "# dataset['validation'] = valid_dataset\n",
- "\n",
- "# train_dataset = sample_dataset(dataset[\"train\"], label_column=\"label\", num_samples=10)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "099029c554974aa1abdd9660eb8e2ece",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/1 [00:00<?, ?ba/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "4b605cedf332432e8c0e055b1eb4aa54",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/1 [00:00<?, ?ba/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# GROUPED variant\n",
"# Prepare dataset\n",
"\n",
"df = pd.read_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merged.csv\", sep=\";\")\n",
"df = df.loc[(df.label.notnull())]\n",
- "\n",
- "# # Get language of the sentence, quite unreliable for the moment\n",
- "# df[\"lang\"] = df.sentence.map(lambda x: nlp(x)._.languages)\n",
- "# df[\"is_en\"] = df.lang.map(lambda x: x == [\"en\"])\n",
- "\n",
- "# # Take suitable subset of the dataframe\n",
- "# df = df.loc[df.is_en] # only english\n",
- "\n",
"df = df.loc[df.label.isin({\"COMPONENT_USED\", \"BASIS_OF_RECERTIFICATION\", \"BASIS_FOR\"})] # only the most popular labels\n",
"df.sentences = df.sentences.map(lambda x: list(literal_eval(x)))\n",
"\n",
@@ -163,27 +73,13 @@
"df_train = df.loc[df.split == \"train\"].drop(columns=\"split\")\n",
"df_valid = df.loc[df.split == \"valid\"].drop(columns=\"split\")\n",
"\n",
- "dataset_features = Features(\n",
- " {\n",
- " \"dgst\": Value(\"string\"),\n",
- " \"cert_id\": Value(\"string\"),\n",
- " \"sentences\": Sequence(feature=Value(\"string\")),\n",
- " \"label\": ClassLabel(names=list(df.label.unique())),\n",
- " }\n",
- ")\n",
- "train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split=\"train\", preserve_index=False)\n",
- "valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split=\"validation\", preserve_index=False)\n",
- "\n",
- "dataset = DatasetDict()\n",
- "dataset['train'] = train_dataset\n",
- "dataset['validation'] = valid_dataset\n",
- "\n",
- "train_dataset = sample_dataset(dataset[\"train\"], label_column=\"label\", num_samples=10)\n"
+ "# Use just a few randomly sampled examples for training\n",
+ "df_train = df_train.sample(n=10)"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -210,83 +106,77 @@
" <th>dgst</th>\n",
" <th>cert_id</th>\n",
" <th>label</th>\n",
- " <th>split</th>\n",
" <th>sentences</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
- " <th>0</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>ANSSI-CC-2017/61</td>\n",
+ " <th>12</th>\n",
+ " <td>99223aca5d9eb3b3</td>\n",
+ " <td>DCSSI-2009/11</td>\n",
" <td>COMPONENT_USED</td>\n",
- " <td>valid</td>\n",
- " <td>{'Elixir-2 Project, Certification ID ANSSI-CC-...</td>\n",
+ " <td>[Toolbox Certificate DCSSI-2009/11\\nTable 1:]</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>1</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>BSI-DSZ-CC-1074-2019</td>\n",
- " <td>BASIS_FOR</td>\n",
- " <td>valid</td>\n",
- " <td>{'The BAC+PACE configuration is subject of the...</td>\n",
+ " <th>4</th>\n",
+ " <td>0f3900cdcd0c7f3e</td>\n",
+ " <td>BSI-DSZ-CC-1072-V4-2021-MA-01</td>\n",
+ " <td>COMPONENT_USED</td>\n",
+ " <td>[Certification Report NXP Secure Smart Card Co...</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>2</th>\n",
- " <td>0e22fe4e4e58faf4</td>\n",
- " <td>BSI-DSZ-CC-1052-V4-2021</td>\n",
+ " <th>9</th>\n",
+ " <td>6d6ade44dcc497dd</td>\n",
+ " <td>BSI-DSZ-CC-0227-2004</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>valid</td>\n",
- " <td>{'basierend auf BSI-DSZ-CC-1052-V4-2021.'}</td>\n",
+ " <td>[This is a\\nre-certification based on BSI-DSZ-...</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>3</th>\n",
+ " <th>5</th>\n",
" <td>0f3900cdcd0c7f3e</td>\n",
- " <td>BSI-DSZ-CC-1072-V4-2021</td>\n",
+ " <td>NSCIB-CC-66030-CR5</td>\n",
" <td>COMPONENT_USED</td>\n",
- " <td>train</td>\n",
- " <td>{'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra...</td>\n",
+ " <td>[certificate identification NSCIB-CC-66030-CR5...</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>4</th>\n",
- " <td>0f3900cdcd0c7f3e</td>\n",
- " <td>BSI-DSZ-CC-1072-V4-2021-MA-01</td>\n",
+ " <th>6</th>\n",
+ " <td>1fb1564dfb0f0b04</td>\n",
+ " <td>ANSSI-CC-2020/34</td>\n",
" <td>COMPONENT_USED</td>\n",
- " <td>train</td>\n",
- " <td>{'Certification Report NXP Secure Smart Card C...</td>\n",
+ " <td>[[CER_IC] Rapport de certification ANSSI-CC-20...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
- " dgst cert_id label \\\n",
- "0 0c7ef6c32cbdee47 ANSSI-CC-2017/61 COMPONENT_USED \n",
- "1 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 BASIS_FOR \n",
- "2 0e22fe4e4e58faf4 BSI-DSZ-CC-1052-V4-2021 BASIS_OF_RECERTIFICATION \n",
- "3 0f3900cdcd0c7f3e BSI-DSZ-CC-1072-V4-2021 COMPONENT_USED \n",
- "4 0f3900cdcd0c7f3e BSI-DSZ-CC-1072-V4-2021-MA-01 COMPONENT_USED \n",
+ " dgst cert_id label \\\n",
+ "12 99223aca5d9eb3b3 DCSSI-2009/11 COMPONENT_USED \n",
+ "4 0f3900cdcd0c7f3e BSI-DSZ-CC-1072-V4-2021-MA-01 COMPONENT_USED \n",
+ "9 6d6ade44dcc497dd BSI-DSZ-CC-0227-2004 BASIS_OF_RECERTIFICATION \n",
+ "5 0f3900cdcd0c7f3e NSCIB-CC-66030-CR5 COMPONENT_USED \n",
+ "6 1fb1564dfb0f0b04 ANSSI-CC-2020/34 COMPONENT_USED \n",
"\n",
- " split sentences \n",
- "0 valid {'Elixir-2 Project, Certification ID ANSSI-CC-... \n",
- "1 valid {'The BAC+PACE configuration is subject of the... \n",
- "2 valid {'basierend auf BSI-DSZ-CC-1052-V4-2021.'} \n",
- "3 train {'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra... \n",
- "4 train {'Certification Report NXP Secure Smart Card C... "
+ " sentences \n",
+ "12 [Toolbox Certificate DCSSI-2009/11\\nTable 1:] \n",
+ "4 [Certification Report NXP Secure Smart Card Co... \n",
+ "9 [This is a\\nre-certification based on BSI-DSZ-... \n",
+ "5 [certificate identification NSCIB-CC-66030-CR5... \n",
+ "6 [[CER_IC] Rapport de certification ANSSI-CC-20... "
]
},
- "execution_count": 12,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df.head()"
+ "df_train.head()"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -297,16 +187,16 @@
"model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.\n",
"Applying column mapping to training dataset\n",
"***** Running training *****\n",
- " Num examples = 640\n",
+ " Num examples = 1760\n",
" Num epochs = 1\n",
- " Total optimization steps = 40\n",
+ " Total optimization steps = 110\n",
" Total train batch size = 16\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "a7118a8525ee47f78f29b74fb5e62117",
+ "model_id": "96e469ad1f984bf6ba2c819884a1c231",
"version_major": 2,
"version_minor": 0
},
@@ -320,12 +210,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "96f5685713554e05af4ff7b7ec4e2f31",
+ "model_id": "a9ef28c8c0314e7f831e6e35c2af75db",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
- "Iteration: 0%| | 0/40 [00:00<?, ?it/s]"
+ "Iteration: 0%| | 0/110 [00:00<?, ?it/s]"
]
},
"metadata": {},
@@ -343,49 +233,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "{'precision': 0.3684210526315789, 'recall': 0.3684210526315789}\n"
+ "Internal evaluation (of model working on individual sentences)\n",
+ "{'precision': 0.45454545454545453, 'recall': 0.45454545454545453}\n",
+ "Actual evaluation after ensemble soft voting\n",
+ "{'precision': 0.2857142857142857, 'recall': 0.2857142857142857}\n"
]
}
],
"source": [
- "# Load a SetFit model from Hub\n",
- "model = SetFitModel.from_pretrained(\"all-mpnet-base-v2\")\n",
- "\n",
- "# Create trainer\n",
- "trainer = SetFitTrainer(\n",
- " model=model,\n",
- " train_dataset=train_dataset,\n",
- " eval_dataset=valid_dataset,\n",
- " loss_class=CosineSimilarityLoss,\n",
- " metric=prec_recall_metric,\n",
- " batch_size=16,\n",
- " num_iterations=40, # The number of text pairs to generate for contrastive learning\n",
- " num_epochs=1, # The number of epochs to use for contrastive learning\n",
- " column_mapping={\"sentences\": \"text\", \"label\": \"label\"} # Map dataset columns to text/label expected by trainer\n",
- ")\n",
- "# trainer.unfreeze(keep_body_frozen=False)\n",
- "\n",
- "trainer.train(show_progress_bar=True)\n",
- "metrics = trainer.evaluate()\n",
- "print(metrics)"
+ "trainer = ReferenceClassifierTrainer(df_train, df_valid, prec_recall_metric, \"transformer\")\n",
+ "trainer.train()\n",
+ "trainer.evaluate()"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "# Can be validated with train_dataset.features[\"label\"].int2str(index) function\n",
- "label_mapping = {index: x for index, x in enumerate(train_dataset.features[\"label\"].names)}\n",
- "\n",
- "df_train = predict_and_fill_df(model, df_train, train_dataset)\n",
- "df_valid = predict_and_fill_df(model, df_valid, train_dataset)"
+ "# Populate both dataframes with predictions so that misclassified instances can be inspected\n",
+ "df_train = predict_and_fill_df(trainer.clf, df_train, trainer.label_mapping)\n",
+ "df_valid = predict_and_fill_df(trainer.clf, df_valid, trainer.label_mapping)"
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -411,7 +285,6 @@
" <th></th>\n",
" <th>dgst</th>\n",
" <th>cert_id</th>\n",
- " <th>location</th>\n",
" <th>label</th>\n",
" <th>sentences</th>\n",
" <th>y_proba</th>\n",
@@ -422,374 +295,171 @@
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
- " <td>0e22fe4e4e58faf4</td>\n",
- " <td>BSI-DSZ-CC-1052-V4-2021</td>\n",
- " <td>report</td>\n",
+ " <td>6d6ade44dcc497dd</td>\n",
+ " <td>BSI-DSZ-CC-0227-2004</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>basierend auf BSI-DSZ-CC-1052-V4-2021.</td>\n",
- " <td>[0.9006353496264655, 0.0993646503735345]</td>\n",
+ " <td>[This is a\\nre-certification based on BSI-DSZ-...</td>\n",
+ " <td>[0.5461188093773812, 0.45388119062261884]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>10</th>\n",
- " <td>29964f32c68b0ce8</td>\n",
- " <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
- " <td>report</td>\n",
+ " <th>19</th>\n",
+ " <td>ca5da2fe138af656</td>\n",
+ " <td>BSI-DSZ-CC-0413-2007</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>Specific results from\\nthe evaluation process ...</td>\n",
- " <td>[0.9018767488720597, 0.09812325112794029]</td>\n",
+ " <td>[This is a re-certification based on\\nBSI-DSZ-...</td>\n",
+ " <td>[0.5465589745598575, 0.4534410254401425]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
- " <tr>\n",
- " <th>11</th>\n",
- " <td>29964f32c68b0ce8</td>\n",
- " <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
- " <td>report</td>\n",
- " <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>This is a re-certification based on BSI-DSZ-CC...</td>\n",
- " <td>[0.694838178998959, 0.30516182100104094]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>False</td>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " dgst cert_id label \\\n",
+ "9 6d6ade44dcc497dd BSI-DSZ-CC-0227-2004 BASIS_OF_RECERTIFICATION \n",
+ "19 ca5da2fe138af656 BSI-DSZ-CC-0413-2007 BASIS_OF_RECERTIFICATION \n",
+ "\n",
+ " sentences \\\n",
+ "9 [This is a\\nre-certification based on BSI-DSZ-... \n",
+ "19 [This is a re-certification based on\\nBSI-DSZ-... \n",
+ "\n",
+ " y_proba y_pred correct \n",
+ "9 [0.5461188093773812, 0.45388119062261884] COMPONENT_USED False \n",
+ "19 [0.5465589745598575, 0.4534410254401425] COMPONENT_USED False "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_train.loc[~df_train.correct]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>dgst</th>\n",
+ " <th>cert_id</th>\n",
+ " <th>label</th>\n",
+ " <th>sentences</th>\n",
+ " <th>y_proba</th>\n",
+ " <th>y_pred</th>\n",
+ " <th>correct</th>\n",
" </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
" <tr>\n",
- " <th>12</th>\n",
- " <td>29964f32c68b0ce8</td>\n",
- " <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
- " <td>report</td>\n",
- " <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>As the evaluation work performed for this cert...</td>\n",
- " <td>[0.8982407935537878, 0.1017592064462122]</td>\n",
+ " <th>1</th>\n",
+ " <td>0c7ef6c32cbdee47</td>\n",
+ " <td>BSI-DSZ-CC-1074-2019</td>\n",
+ " <td>BASIS_FOR</td>\n",
+ " <td>[The BAC+PACE configuration is subject of the ...</td>\n",
+ " <td>[0.9330686268852108, 0.06693137311478929]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>22</th>\n",
- " <td>c1d88ce9dadd7d2d</td>\n",
- " <td>BSI-DSZ-CC-0312-2005</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>[13] Certification Report BSI-DSZ-CC-0312-2005...</td>\n",
- " <td>[0.8947294027870802, 0.1052705972129198]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>23</th>\n",
- " <td>c1d88ce9dadd7d2d</td>\n",
- " <td>BSI-DSZ-CC-0312-2005</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>P5CT072V0N refer to the certification report B...</td>\n",
- " <td>[0.900841506996804, 0.09915849300319597]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>28</th>\n",
- " <td>238f8edc5eda1358</td>\n",
- " <td>BSI-DSZ-CC-0222-2003</td>\n",
- " <td>report</td>\n",
+ " <th>2</th>\n",
+ " <td>0e22fe4e4e58faf4</td>\n",
+ " <td>BSI-DSZ-CC-1052-V4-2021</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>This is a re-\\ncertification based on BSI-DSZ-...</td>\n",
- " <td>[0.6922405130591214, 0.3077594869408786]</td>\n",
+ " <td>[basierend auf BSI-DSZ-CC-1052-V4-2021.]</td>\n",
+ " <td>[0.7070543956916182, 0.2929456043083818]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>29</th>\n",
+ " <th>7</th>\n",
" <td>238f8edc5eda1358</td>\n",
" <td>BSI-DSZ-CC-0222-2003</td>\n",
- " <td>report</td>\n",
- " <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>This certification is a re-certification of BS...</td>\n",
- " <td>[0.7146859285928153, 0.2853140714071847]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>False</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>31</th>\n",
- " <td>a6fac58198296194</td>\n",
- " <td>BSI-DSZ-CC-0555-2009</td>\n",
- " <td>report</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>Specific results from the evaluation process\\n...</td>\n",
- " <td>[0.9003949217128049, 0.09960507828719516]</td>\n",
+ " <td>[This certification is a re-certification of B...</td>\n",
+ " <td>[0.5998535578550636, 0.4001464421449364]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>32</th>\n",
- " <td>a6fac58198296194</td>\n",
- " <td>BSI-DSZ-CC-0555-2009</td>\n",
- " <td>report</td>\n",
+ " <th>8</th>\n",
+ " <td>29964f32c68b0ce8</td>\n",
+ " <td>BSI-DSZ-CC-0519-V3-2021</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>As the evaluation work performed for this cert...</td>\n",
- " <td>[0.8962471709210816, 0.10375282907891832]</td>\n",
+ " <td>[This is a re-certification based on BSI-DSZ-C...</td>\n",
+ " <td>[0.8727371952470533, 0.12726280475294682]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>33</th>\n",
+ " <th>13</th>\n",
" <td>a6fac58198296194</td>\n",
" <td>BSI-DSZ-CC-0555-2009</td>\n",
- " <td>report</td>\n",
" <td>BASIS_OF_RECERTIFICATION</td>\n",
- " <td>This is a re-certification\\nbased on BSI-DSZ-C...</td>\n",
- " <td>[0.6928894855518116, 0.3071105144481884]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>False</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>40</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>BSI-DSZ-CC-1074-2019</td>\n",
- " <td>report</td>\n",
- " <td>BASIS_FOR</td>\n",
- " <td>The BAC+PACE configuration is subject of the s...</td>\n",
- " <td>[0.8997694315622796, 0.1002305684377204]</td>\n",
+ " <td>[Specific results from the evaluation process\\...</td>\n",
+ " <td>[0.8670438280210987, 0.13295617197890133]</td>\n",
" <td>COMPONENT_USED</td>\n",
" <td>False</td>\n",
" </tr>\n",
- " <tr>\n",
- " <th>41</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>BSI-DSZ-CC-1074-2019</td>\n",
- " <td>report</td>\n",
- " <td>BASIS_FOR</td>\n",
- " <td>The further security mechanism Basic Access Co...</td>\n",
- " <td>[0.8923570789488144, 0.10764292105118559]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>False</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>42</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>BSI-DSZ-CC-1074-2019</td>\n",
- " <td>report</td>\n",
- " <td>BASIS_FOR</td>\n",
- " <td>[14] Certification Report BSI-DSZ-CC-1074-2019...</td>\n",
- " <td>[0.9007422537659316, 0.09925774623406838]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>False</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>43</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>ANSSI-CC-2017/61</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>For details\\nconcerning the CC evaluation of t...</td>\n",
- " <td>[0.8933002244736725, 0.10669977552632742]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>44</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>ANSSI-CC-2017/61</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>For development and production sites regarding...</td>\n",
- " <td>[0.9000976545847758, 0.0999023454152242]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>45</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>ANSSI-CC-2017/61</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>Elixir-2 Project, Certification ID ANSSI-CC-20...</td>\n",
- " <td>[0.8997425549440855, 0.10025744505591448]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>46</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>ANSSI-CC-2017/61</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>® X – Security Target for\\ncomposition; STMicr...</td>\n",
- " <td>[0.8976230487254611, 0.10237695127453893]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>47</th>\n",
- " <td>0c7ef6c32cbdee47</td>\n",
- " <td>ANSSI-CC-2017/61</td>\n",
- " <td>report</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>[12] Rapport de certification ANSSI-CC-2017/61...</td>\n",
- " <td>[0.8939784357781945, 0.10602156422180552]</td>\n",
- " <td>COMPONENT_USED</td>\n",
- " <td>True</td>\n",
- " </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
- " dgst cert_id location \\\n",
- "9 0e22fe4e4e58faf4 BSI-DSZ-CC-1052-V4-2021 report \n",
- "10 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 report \n",
- "11 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 report \n",
- "12 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 report \n",
- "22 c1d88ce9dadd7d2d BSI-DSZ-CC-0312-2005 report \n",
- "23 c1d88ce9dadd7d2d BSI-DSZ-CC-0312-2005 report \n",
- "28 238f8edc5eda1358 BSI-DSZ-CC-0222-2003 report \n",
- "29 238f8edc5eda1358 BSI-DSZ-CC-0222-2003 report \n",
- "31 a6fac58198296194 BSI-DSZ-CC-0555-2009 report \n",
- "32 a6fac58198296194 BSI-DSZ-CC-0555-2009 report \n",
- "33 a6fac58198296194 BSI-DSZ-CC-0555-2009 report \n",
- "40 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 report \n",
- "41 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 report \n",
- "42 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 report \n",
- "43 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n",
- "44 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n",
- "45 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n",
- "46 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n",
- "47 0c7ef6c32cbdee47 ANSSI-CC-2017/61 report \n",
- "\n",
- " label \\\n",
- "9 BASIS_OF_RECERTIFICATION \n",
- "10 BASIS_OF_RECERTIFICATION \n",
- "11 BASIS_OF_RECERTIFICATION \n",
- "12 BASIS_OF_RECERTIFICATION \n",
- "22 COMPONENT_USED \n",
- "23 COMPONENT_USED \n",
- "28 BASIS_OF_RECERTIFICATION \n",
- "29 BASIS_OF_RECERTIFICATION \n",
- "31 BASIS_OF_RECERTIFICATION \n",
- "32 BASIS_OF_RECERTIFICATION \n",
- "33 BASIS_OF_RECERTIFICATION \n",
- "40 BASIS_FOR \n",
- "41 BASIS_FOR \n",
- "42 BASIS_FOR \n",
- "43 COMPONENT_USED \n",
- "44 COMPONENT_USED \n",
- "45 COMPONENT_USED \n",
- "46 COMPONENT_USED \n",
- "47 COMPONENT_USED \n",
+ " dgst cert_id label \\\n",
+ "1 0c7ef6c32cbdee47 BSI-DSZ-CC-1074-2019 BASIS_FOR \n",
+ "2 0e22fe4e4e58faf4 BSI-DSZ-CC-1052-V4-2021 BASIS_OF_RECERTIFICATION \n",
+ "7 238f8edc5eda1358 BSI-DSZ-CC-0222-2003 BASIS_OF_RECERTIFICATION \n",
+ "8 29964f32c68b0ce8 BSI-DSZ-CC-0519-V3-2021 BASIS_OF_RECERTIFICATION \n",
+ "13 a6fac58198296194 BSI-DSZ-CC-0555-2009 BASIS_OF_RECERTIFICATION \n",
"\n",
" sentences \\\n",
- "9 basierend auf BSI-DSZ-CC-1052-V4-2021. \n",
- "10 Specific results from\\nthe evaluation process ... \n",
- "11 This is a re-certification based on BSI-DSZ-CC... \n",
- "12 As the evaluation work performed for this cert... \n",
- "22 [13] Certification Report BSI-DSZ-CC-0312-2005... \n",
- "23 P5CT072V0N refer to the certification report B... \n",
- "28 This is a re-\\ncertification based on BSI-DSZ-... \n",
- "29 This certification is a re-certification of BS... \n",
- "31 Specific results from the evaluation process\\n... \n",
- "32 As the evaluation work performed for this cert... \n",
- "33 This is a re-certification\\nbased on BSI-DSZ-C... \n",
- "40 The BAC+PACE configuration is subject of the s... \n",
- "41 The further security mechanism Basic Access Co... \n",
- "42 [14] Certification Report BSI-DSZ-CC-1074-2019... \n",
- "43 For details\\nconcerning the CC evaluation of t... \n",
- "44 For development and production sites regarding... \n",
- "45 Elixir-2 Project, Certification ID ANSSI-CC-20... \n",
- "46 ® X – Security Target for\\ncomposition; STMicr... \n",
- "47 [12] Rapport de certification ANSSI-CC-2017/61... \n",
+ "1 [The BAC+PACE configuration is subject of the ... \n",
+ "2 [basierend auf BSI-DSZ-CC-1052-V4-2021.] \n",
+ "7 [This certification is a re-certification of B... \n",
+ "8 [This is a re-certification based on BSI-DSZ-C... \n",
+ "13 [Specific results from the evaluation process\\... \n",
"\n",
" y_proba y_pred correct \n",
- "9 [0.9006353496264655, 0.0993646503735345] COMPONENT_USED False \n",
- "10 [0.9018767488720597, 0.09812325112794029] COMPONENT_USED False \n",
- "11 [0.694838178998959, 0.30516182100104094] COMPONENT_USED False \n",
- "12 [0.8982407935537878, 0.1017592064462122] COMPONENT_USED False \n",
- "22 [0.8947294027870802, 0.1052705972129198] COMPONENT_USED True \n",
- "23 [0.900841506996804, 0.09915849300319597] COMPONENT_USED True \n",
- "28 [0.6922405130591214, 0.3077594869408786] COMPONENT_USED False \n",
- "29 [0.7146859285928153, 0.2853140714071847] COMPONENT_USED False \n",
- "31 [0.9003949217128049, 0.09960507828719516] COMPONENT_USED False \n",
- "32 [0.8962471709210816, 0.10375282907891832] COMPONENT_USED False \n",
- "33 [0.6928894855518116, 0.3071105144481884] COMPONENT_USED False \n",
- "40 [0.8997694315622796, 0.1002305684377204] COMPONENT_USED False \n",
- "41 [0.8923570789488144, 0.10764292105118559] COMPONENT_USED False \n",
- "42 [0.9007422537659316, 0.09925774623406838] COMPONENT_USED False \n",
- "43 [0.8933002244736725, 0.10669977552632742] COMPONENT_USED True \n",
- "44 [0.9000976545847758, 0.0999023454152242] COMPONENT_USED True \n",
- "45 [0.8997425549440855, 0.10025744505591448] COMPONENT_USED True \n",
- "46 [0.8976230487254611, 0.10237695127453893] COMPONENT_USED True \n",
- "47 [0.8939784357781945, 0.10602156422180552] COMPONENT_USED True "
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_valid"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "first = np.array([0.1,0.2,0.7])\n",
- "second = np.array([0.2, 0.4, 0.4])\n",
- "third = np.array([0.5, 0.5, 0])\n",
- "preds = np.array([first, second, third])\n",
- "\n",
- "preds = np.power(preds, 2)\n",
- "preds = preds.sum(axis=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0.3 , 0.45, 0.65])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "preds"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2"
+ "1 [0.9330686268852108, 0.06693137311478929] COMPONENT_USED False \n",
+ "2 [0.7070543956916182, 0.2929456043083818] COMPONENT_USED False \n",
+ "7 [0.5998535578550636, 0.4001464421449364] COMPONENT_USED False \n",
+ "8 [0.8727371952470533, 0.12726280475294682] COMPONENT_USED False \n",
+ "13 [0.8670438280210987, 0.13295617197890133] COMPONENT_USED False "
]
},
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "np.argmax(preds)"
+ "df_valid.loc[~df_valid.correct]"
]
}
],
@@ -811,9 +481,10 @@
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
+ "orig_nbformat": 4,
"vscode": {
"interpreter": {
- "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4"
+ "hash": "a2ed43df31f510d0b358bd0625493376557b0c4d37aa99c09b398809f951b6a5"
}
}
},