diff options
| author | Adam Janovsky | 2023-07-19 15:23:27 +0200 |
|---|---|---|
| committer | Adam Janovsky | 2023-07-19 15:23:27 +0200 |
| commit | a288efcf961e4b79fe1172961eee6440a20bbbaa (patch) | |
| tree | f3a67398b2cfc9f296dc3bab29a9c3c9b1ade7b6 | |
| parent | f5289b61d880d29a4fc64689c4309048fca6635d (diff) | |
| download | sec-certs-a288efcf961e4b79fe1172961eee6440a20bbbaa.tar.gz sec-certs-a288efcf961e4b79fe1172961eee6440a20bbbaa.tar.zst sec-certs-a288efcf961e4b79fe1172961eee6440a20bbbaa.zip | |
add function to turn dataframe from ReferenceSegmentExtractor to LabelStudio input
| -rw-r--r-- | src/sec_certs/model/references/segment_extractor.py | 16 |
1 files changed, 16 insertions, 0 deletions
```diff
diff --git a/src/sec_certs/model/references/segment_extractor.py b/src/sec_certs/model/references/segment_extractor.py
index 97bababa..430730df 100644
--- a/src/sec_certs/model/references/segment_extractor.py
+++ b/src/sec_certs/model/references/segment_extractor.py
@@ -16,6 +16,22 @@ from sec_certs.utils import parallel_processing
 nlp = spacy.load("en_core_web_sm")
 
 
+def references_to_label_studio(df: pd.DataFrame, filepath: Path) -> None:
+    """
+    Prepares a DataFrame obtained from ReferenceSegmentExtractor to be used in Label Studio for manual annotation.
+    It then suffices to use "Natural Language Processing" -> "Text Classification" task in Label Studio.
+    """
+
+    def split(segments: list[str]) -> str:
+        res = ""
+        for x in segments:
+            res += "* Segment: " + x + "\n\n"
+        return res
+
+    df["text"] = df["segments"].apply(split)
+    df.loc[:, ["dgst", "referenced_cert_id", "text"]].to_json(filepath, indent=4, orient="records")
+
+
 def fill_reference_segments_spacy(record: ReferenceRecord) -> ReferenceRecord:
     """
     Open file, read text and extract sentences with `referenced_cert_id` match.
```
