about | summary | refs | log | tree | commit | diff | homepage
path: root/src
diff options
context:
space:
mode:
author: adamjanovsky 2023-09-29 13:20:13 +0200
committer: adamjanovsky 2023-09-29 13:20:13 +0200
commit: ee00fb283d8c1685abdddcbf4e097ea6ca911f09 (patch)
tree: 78f415e4f3817d320b122cb1fc7fafe215502110 /src
parent: f59e517490112c8baeb56c81a5869774ad1b821b (diff)
download: sec-certs-ee00fb283d8c1685abdddcbf4e097ea6ca911f09.tar.gz
sec-certs-ee00fb283d8c1685abdddcbf4e097ea6ca911f09.tar.zst
sec-certs-ee00fb283d8c1685abdddcbf4e097ea6ca911f09.zip
ditch lang, fix groupby
Diffstat (limited to 'src')
-rw-r--r-- src/sec_certs/model/references/segment_extractor.py | 17
-rw-r--r-- src/sec_certs/utils/nlp.py | 6
2 files changed, 13 insertions, 10 deletions
diff --git a/src/sec_certs/model/references/segment_extractor.py b/src/sec_certs/model/references/segment_extractor.py
index 27fa6f19..4839a0b4 100644
--- a/src/sec_certs/model/references/segment_extractor.py
+++ b/src/sec_certs/model/references/segment_extractor.py
@@ -9,7 +9,7 @@ from importlib.resources import files
from pathlib import Path
from typing import Any, Iterable, Literal
-import langdetect
+# import langdetect
import numpy as np
import pandas as pd
import spacy
@@ -47,7 +47,7 @@ def fill_reference_segments(record: ReferenceRecord, n_sent_before: int = 1, n_s
"""
lower = max(0, hit_index - n_before)
upper = min(max_index, hit_index + n_after)
- return range(lower, upper)
+ return range(lower, upper + 1)
with record.processed_data_source_path.open("r") as handle:
data = handle.read()
@@ -188,6 +188,7 @@ class ReferenceSegmentExtractor:
df_reports = self._build_df(report_certs, "report")
print(f"df_targets shape: {df_targets.shape}")
print(f"df_reports shape: {df_reports.shape}")
+
return ReferenceSegmentExtractor._process_df(pd.concat([df_targets, df_reports]), certs)
def _build_records(self, certs: list[CCCertificate], source: Literal["target", "report"]) -> list[ReferenceRecord]:
@@ -312,6 +313,10 @@ class ReferenceSegmentExtractor:
segment = segment.replace(ref_id, "REFERENCED_CERTIFICATE_ID")
return segment
+ def unique_elements(series):
+ combined = [item for sublist in series for item in sublist]
+ return list(set(combined))
+
"""
Fully processes the dataframe.
"""
@@ -336,14 +341,14 @@ class ReferenceSegmentExtractor:
df_processed = (
df.loc[df.segments.notnull()]
.explode("segments")
- .assign(lang=lambda df_: df_.segments.map(langdetect.detect))
- .loc[lambda df_: df_.lang.isin({"en", "fr", "de"})]
+ # .assign(lang=lambda df_: df_.segments.map(langdetect.detect))
+ # .loc[lambda df_: df_.lang.isin({"en", "fr", "de"})] # This could get disabled possibly.
.groupby(
- ["dgst", "canonical_reference_keyword", "actual_reference_keywords", "source"],
+ ["dgst", "canonical_reference_keyword"],
as_index=False,
dropna=False,
)
- .agg({"segments": list, "lang": list})
+ .agg({"segments": list, "actual_reference_keywords": unique_elements})
.assign(
split=lambda df_: df_.dgst.map(split_dct),
label=lambda df_: [
diff --git a/src/sec_certs/utils/nlp.py b/src/sec_certs/utils/nlp.py
index cfa4aaf9..1e8dc911 100644
--- a/src/sec_certs/utils/nlp.py
+++ b/src/sec_certs/utils/nlp.py
@@ -29,10 +29,8 @@ def filter_short_sentences(sentences, actual_reference_keywords):
def prepare_reference_annotations_df(df: pd.DataFrame):
if df.loc[(df.label != "SELF") & (df.label.notnull())].empty:
raise ValueError("No expert annotations found in the dataset of references.")
- df = (
- df.loc[lambda df_: (df_.label != "SELF") & (df_.label.notnull())]
- .assign(segments=lambda df_: eval_strings_if_necessary(df_.segments))
- .drop(columns="lang")
+ df = df.loc[lambda df_: (df_.label != "SELF") & (df_.label.notnull())].assign(
+ segments=lambda df_: eval_strings_if_necessary(df_.segments)
)
df.segments = df.apply(
lambda row: filter_short_sentences(row["segments"], row["actual_reference_keywords"]), axis=1