1 files changed, 25 insertions, 32 deletions
diff --git a/notebooks/cc/reference_annotations/data_preprocessing.ipynb b/notebooks/cc/reference_annotations/data_preprocessing.ipynb
index 5e478205..7e6e05ba 100644
--- a/notebooks/cc/reference_annotations/data_preprocessing.ipynb
+++ b/notebooks/cc/reference_annotations/data_preprocessing.ipynb
@@ -11,22 +11,15 @@
     "\n",
     "1. Recover text segments that surround certificate ID for all references in CC dataset\n",
     "2. Create a DataFrame `(dgst, cert_id, label, text_segments)` out of the objects\n",
-    "    - two versions, in fact: one with single row per segment, second with all segments from all sources (target, report) merged into single row\n",
     "3. Clean and dump into csv\n",
     "4. Check for label noise"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": []
-    }
-   ],
+   "outputs": [],
    "source": [
     "from __future__ import annotations\n",
     "\n",
@@ -37,8 +30,6 @@
     "from sec_certs.utils.parallel_processing import process_parallel\n",
     "import pandas as pd\n",
     "import json\n",
-    "from tqdm import tqdm\n",
-    "\n",
     "\n",
     "nlp = spacy.load(\"en_core_web_sm\")\n",
     "from pathlib import Path\n",
@@ -89,7 +80,7 @@
     "    Builds dataframe with [dgst,cert_id,location,reason,sentences] with references from list of ReferenceRecords.\n",
     "    Reason set to None if not defined. \n",
     "    \"\"\"\n",
-    "    results =  process_parallel(ReferenceRecord.get_cert_references_with_sentences, records, max_workers=200, use_threading=False, progress_bar=True)\n",
+    "    results =  process_parallel(ReferenceRecord.get_cert_references_with_sentences, records, use_threading=False, progress_bar=True)\n",
     "    return pd.DataFrame.from_records([x.to_pandas_tuple() for x in results], columns=[\"dgst\", \"cert_id\", \"location\", \"label\", \"sentences\"])"
    ]
   },
@@ -103,17 +94,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      " 69%|██████▉   | 654/944 [22:33<10:00,  2.07s/it]\n",
-      "100%|██████████| 58/58 [00:07<00:00,  8.26it/s]\n",
-      "100%|██████████| 944/944 [01:04<00:00, 14.59it/s]\n",
-      "100%|██████████| 2259/2259 [00:30<00:00, 75.10it/s]\n"
+      "100%|██████████| 58/58 [00:07<00:00,  8.27it/s]\n",
+      "100%|██████████| 944/944 [01:06<00:00, 14.12it/s]\n",
+      "100%|██████████| 2259/2259 [00:32<00:00, 69.22it/s]\n"
      ]
     }
    ],
@@ -147,17 +137,21 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 49,
+   "attachments": {},
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "df = df_labeled.copy()"
+    "## Process Dataframes and dump two versions into csv\n",
+    "\n",
+    "1. Version with `dgst, cert_id, location, single_sentence` as `*_exploded.csv`\n",
+    "2. Version where all sentences tied to the `(dgst, cert_id)` key are merged into `sentences`, saved as `*_grouped.csv`\n",
+    "\n",
+    "*Note*: So far, we don't work with the test dataset."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,28 +164,27 @@
     "\n",
     "split_dct = {**dict.fromkeys(train_digests, \"train\"), **dict.fromkeys(valid_digests, \"valid\")}\n",
     "\n",
-    "\n",
     "# Apply filtering\n",
-    "df = df.loc[df.sentences.notnull()] # TODO: We should investigate the cases when we match no sentence\n",
-    "df[\"split\"] = df.dgst.map(split_dct)  # Annotate with splits\n",
+    "# TODO: We should investigate the cases when we match no sentence\n",
+    "df = df.loc[df.sentences.notnull()] \n",
+    "df[\"split\"] = df.dgst.map(split_dct)\n",
     "df = df.loc[df.split.notnull()]  # Discard test samples\n",
-    "df.explode(\"sentences\").to_csv(REPO_ROOT / \"datasets/reference_classification_dataset_exploded.csv\", sep=';', index=False)\n",
     "\n",
     "# TODO: Add language detection\n",
     "\n",
     "# Aggregate sentences from different sources (target, report) into one row\n",
-    "df_grouped = df.groupby([\"dgst\", \"cert_id\", \"label\", \"split\"], as_index=False)[\"sentences\"].agg({\"sentences\": lambda x: set.union(*x)})\n",
-    "df_grouped.to_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merrged.csv\", sep=';', index=False)"
+    "df = df.groupby([\"dgst\", \"cert_id\", \"label\", \"split\"], as_index=False)[\"sentences\"].agg({\"sentences\": lambda x: set.union(*x)})\n",
+    "df.to_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merrged.csv\", sep=';', index=False)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Check for label noise\n",
-    "duplicates_df = df_grouped[df_grouped.duplicated(subset=[\"dgst\", \"cert_id\"], keep=False)]\n",
+    "# Check for label noise, i.e., search for (dgst, cert_id) pairs that appear with more than one label.\n",
+    "duplicates_df = df[df.duplicated(subset=[\"dgst\", \"cert_id\"], keep=False)]\n",
     "if not duplicates_df.empty:\n",
     "    print(\"Warning, label noise in dataset. I.e. tuples (dgst, cert_id) with inconsistent reason. See `duplicates_df` frame.\")"
    ]
@@ -218,7 +211,7 @@
   "orig_nbformat": 4,
   "vscode": {
    "interpreter": {
-    "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4"
+    "hash": "a2ed43df31f510d0b358bd0625493376557b0c4d37aa99c09b398809f951b6a5"
    }
   }
  },