diff options
Diffstat (limited to 'notebooks/cc/reference_annotations/data_preprocessing.ipynb')
| -rw-r--r-- | notebooks/cc/reference_annotations/data_preprocessing.ipynb | 57 |
1 files changed, 25 insertions, 32 deletions
diff --git a/notebooks/cc/reference_annotations/data_preprocessing.ipynb b/notebooks/cc/reference_annotations/data_preprocessing.ipynb index 5e478205..7e6e05ba 100644 --- a/notebooks/cc/reference_annotations/data_preprocessing.ipynb +++ b/notebooks/cc/reference_annotations/data_preprocessing.ipynb @@ -11,22 +11,15 @@ "\n", "1. Recover text segments that surround certificate ID for all references in CC dataset\n", "2. Create a DataFrame `(dgst, cert_id, label, text_segments)` out of the objects\n", - " - two versions, in fact: one with single row per segment, second with all segments from all sources (target, report) merged into single row\n", "3. Clean and dump into csv\n", "4. Check for label noise" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [] - } - ], + "outputs": [], "source": [ "from __future__ import annotations\n", "\n", @@ -37,8 +30,6 @@ "from sec_certs.utils.parallel_processing import process_parallel\n", "import pandas as pd\n", "import json\n", - "from tqdm import tqdm\n", - "\n", "\n", "nlp = spacy.load(\"en_core_web_sm\")\n", "from pathlib import Path\n", @@ -89,7 +80,7 @@ " Builds dataframe with [dgst,cert_id,location,reason,sentences] with references from list of ReferenceRecords.\n", " Reason set to None if not defined. \n", " \"\"\"\n", - " results = process_parallel(ReferenceRecord.get_cert_references_with_sentences, records, max_workers=200, use_threading=False, progress_bar=True)\n", + " results = process_parallel(ReferenceRecord.get_cert_references_with_sentences, records, use_threading=False, progress_bar=True)\n", " return pd.DataFrame.from_records([x.to_pandas_tuple() for x in results], columns=[\"dgst\", \"cert_id\", \"location\", \"label\", \"sentences\"])" ] }, @@ -103,17 +94,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 69%|██████▉ | 654/944 [22:33<10:00, 2.07s/it]\n", - "100%|██████████| 58/58 [00:07<00:00, 8.26it/s]\n", - "100%|██████████| 944/944 [01:04<00:00, 14.59it/s]\n", - "100%|██████████| 2259/2259 [00:30<00:00, 75.10it/s]\n" + "100%|██████████| 58/58 [00:07<00:00, 8.27it/s]\n", + "100%|██████████| 944/944 [01:06<00:00, 14.12it/s]\n", + "100%|██████████| 2259/2259 [00:32<00:00, 69.22it/s]\n" ] } ], @@ -147,17 +137,21 @@ ] }, { - "cell_type": "code", - "execution_count": 49, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "df = df_labeled.copy()" + "## Process Dataframes and dump two versions into csv\n", + "\n", + "1. Version with `dgst, cert_id, location, single_sentence` as `*_exploded.csv`\n", + "2. Version where all sentences tied to `(dgst, cert_id)` key are merged into `sentences`. Saved as `*_grouped.csv`\n", + "\n", + "*Note*: So far don't work with test dataset" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -170,28 +164,27 @@ "\n", "split_dct = {**dict.fromkeys(train_digests, \"train\"), **dict.fromkeys(valid_digests, \"valid\")}\n", "\n", - "\n", "# Apply filtering\n", - "df = df.loc[df.sentences.notnull()] # TODO: We should investigate the cases when we match no sentence\n", - "df[\"split\"] = df.dgst.map(split_dct) # Annotate with splits\n", + "# TODO: We should investigate the cases when we match no sentence\n", + "df = df.loc[df.sentences.notnull()] \n", + "df[\"split\"] = df.dgst.map(split_dct)\n", "df = df.loc[df.split.notnull()] # Discard test samples\n", - "df.explode(\"sentences\").to_csv(REPO_ROOT / \"datasets/reference_classification_dataset_exploded.csv\", sep=';', index=False)\n", "\n", "# TODO: Add language detection\n", "\n", "# Aggregate sentences from different sources (target, report) into one row\n", - "df_grouped = df.groupby([\"dgst\", \"cert_id\", \"label\", \"split\"], as_index=False)[\"sentences\"].agg({\"sentences\": lambda x: set.union(*x)})\n", - "df_grouped.to_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merrged.csv\", sep=';', index=False)" + "df = df.groupby([\"dgst\", \"cert_id\", \"label\", \"split\"], as_index=False)[\"sentences\"].agg({\"sentences\": lambda x: set.union(*x)})\n", + "df.to_csv(REPO_ROOT / \"datasets/reference_classification_dataset_merrged.csv\", sep=';', index=False)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# Check for label noise\n", - "duplicates_df = df_grouped[df_grouped.duplicated(subset=[\"dgst\", \"cert_id\"], keep=False)]\n", + "# Check for label noise, i.e., search for instances that have different label of a reference.\n", + "duplicates_df = df[df.duplicated(subset=[\"dgst\", \"cert_id\"], keep=False)]\n", "if not duplicates_df.empty:\n", " print(\"Warning, label noise in dataset. I.e. tuples (dgst, cert_id) with inconsistent reason. See `duplicates_df` frame.\")" ] @@ -218,7 +211,7 @@ "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4" + "hash": "a2ed43df31f510d0b358bd0625493376557b0c4d37aa99c09b398809f951b6a5" } } }, |
