aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorAdam Janovsky2023-07-19 15:23:27 +0200
committerAdam Janovsky2023-07-19 15:23:27 +0200
commita288efcf961e4b79fe1172961eee6440a20bbbaa (patch)
treef3a67398b2cfc9f296dc3bab29a9c3c9b1ade7b6
parentf5289b61d880d29a4fc64689c4309048fca6635d (diff)
downloadsec-certs-a288efcf961e4b79fe1172961eee6440a20bbbaa.tar.gz
sec-certs-a288efcf961e4b79fe1172961eee6440a20bbbaa.tar.zst
sec-certs-a288efcf961e4b79fe1172961eee6440a20bbbaa.zip
add function to turn dataframe from ReferenceSegmentExtractor to LabelStudio input
-rw-r--r--src/sec_certs/model/references/segment_extractor.py16
1 files changed, 16 insertions, 0 deletions
diff --git a/src/sec_certs/model/references/segment_extractor.py b/src/sec_certs/model/references/segment_extractor.py
index 97bababa..430730df 100644
--- a/src/sec_certs/model/references/segment_extractor.py
+++ b/src/sec_certs/model/references/segment_extractor.py
@@ -16,6 +16,22 @@ from sec_certs.utils import parallel_processing
nlp = spacy.load("en_core_web_sm")
def references_to_label_studio(df: pd.DataFrame, filepath: Path) -> None:
    """
    Prepares a DataFrame obtained from ReferenceSegmentExtractor to be used in Label Studio for manual annotation.
    It then suffices to use "Natural Language Processing" -> "Text Classification" task in Label Studio.

    The passed DataFrame is not modified; the export is assembled on a copy.

    :param df: DataFrame with columns `dgst`, `referenced_cert_id` and `segments`, where each `segments`
               cell is an iterable of strings.
    :param filepath: Path of the JSON file to write. The file holds a list of records with keys
                     `dgst`, `referenced_cert_id` and `text`.
    """

    def split(segments: list[str]) -> str:
        # One bullet-like entry per segment, each followed by a blank line.
        return "".join("* Segment: " + x + "\n\n" for x in segments)

    # Work on a copy of the exported columns so that the caller's DataFrame does not
    # silently gain (or get overwritten) a "text" column.
    export = df.loc[:, ["dgst", "referenced_cert_id"]].copy()
    export["text"] = df["segments"].apply(split)
    export.to_json(filepath, indent=4, orient="records")
+
+
def fill_reference_segments_spacy(record: ReferenceRecord) -> ReferenceRecord:
"""
Open file, read text and extract sentences with `referenced_cert_id` match.