diff options
| author | adamjanovsky | 2023-02-17 14:11:57 +0100 |
|---|---|---|
| committer | adamjanovsky | 2023-02-17 14:11:57 +0100 |
| commit | 76d032e53e4e84c1c0036daaadeab2821be0730a (patch) | |
| tree | 74e4c08f6022de4404563216c461b8d705df1037 /src | |
| parent | 7508e2b03444c70e44c0597ed333a074917024f7 (diff) | |
| download | sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.gz sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.tar.zst sec-certs-76d032e53e4e84c1c0036daaadeab2821be0730a.zip | |
more work on PoC reference annotator
Diffstat (limited to 'src')
| -rw-r--r-- | src/sec_certs/model/reference_classification.py | 92 | ||||
| -rw-r--r-- | src/sec_certs/utils/nlp.py | 11 |
2 files changed, 77 insertions, 26 deletions
diff --git a/src/sec_certs/model/reference_classification.py b/src/sec_certs/model/reference_classification.py index 1ca60c0b..9f1d4ff1 100644 --- a/src/sec_certs/model/reference_classification.py +++ b/src/sec_certs/model/reference_classification.py @@ -1,26 +1,30 @@ from __future__ import annotations -from typing import Callable, Literal +from dataclasses import dataclass +from typing import Any, Callable, Literal import numpy as np -from datasets import Dataset +import pandas as pd +from datasets import ClassLabel, Dataset, Features, NamedSplit, Value from sentence_transformers.losses import CosineSimilarityLoss from setfit import SetFitModel, SetFitTrainer +from sec_certs.utils.nlp import softmax + class ReferenceClassifierTrainer: def __init__( self, - train_dataset: Dataset, - eval_dataset: Dataset, + train_dataset: pd.DataFrame, + eval_dataset: pd.DataFrame, metric: Callable, mode: Literal["transformer", "baseline"] = "transformer", ): - self._model, self._trainer = self._init_trainer(mode) - self.clf = RerefenceClassifier(self._model, self.get_label_mapping()) self._train_dataset = train_dataset - self._eval_dataset = eval_dataset + self._validation_dataset = eval_dataset self._metric = metric + self._model, self._trainer, self.label_mapping = self._init_trainer(mode) + self.clf = RerefenceClassifier(self._model, self.label_mapping) def _init_trainer(self, mode: Literal["transformer", "baseline"]): return ( @@ -31,55 +35,97 @@ class ReferenceClassifierTrainer: def _init_transformer_model_and_trainer(self): model = SetFitModel.from_pretrained("all-mpnet-base-v2") + + internal_train_dataset = self._get_hugging_face_datasets_from_df(self._train_dataset, "train") + internal_validation_dataset = self._get_hugging_face_datasets_from_df(self._validation_dataset, "validation") + trainer = SetFitTrainer( model=model, - train_dataset=self._train_dataset, - eval_dataset=self._valid_dataset, + train_dataset=internal_train_dataset, + eval_dataset=internal_validation_dataset, loss_class=CosineSimilarityLoss, metric=self._metric, batch_size=16, num_iterations=40, # The number of text pairs to generate for contrastive learning num_epochs=1, # The number of epochs to use for contrastive learning column_mapping={ - "sentences": "text", + "sentence": "text", "label": "label", }, # Map dataset columns to text/label expected by trainer ) - return model, trainer + label_mapping = {index: x for index, x in enumerate(internal_train_dataset.features["label"].names)} + return model, trainer, label_mapping + + @staticmethod + def _get_hugging_face_datasets_from_df(df: pd.DataFrame, split: NamedSplit) -> Dataset: + df_to_use = df.explode("sentences").rename(columns={"sentences": "sentence"}) + features = Features( + { + "dgst": Value("string"), + "cert_id": Value("string"), + "sentence": Value("string"), + "label": ClassLabel(names=list(df_to_use.label.unique())), + } + ) + return Dataset.from_pandas(df_to_use, features=features, split=split, preserve_index=False) def _init_baseline_model_and_trainer(self): + # Process the datasets so that BaselineTrainer can work with them and init the trainer. raise NotImplementedError("Not yet implemented.") def train(self): self._trainer.train(show_progress_bar=True) def evaluate(self): - self._trainer.evaluate() + print("Internal evaluation (of model working on individual sentences)") + print(self._evaluate_raw()) + print("Actual evaluation after ensemble soft voting") + print(self._evaluate_stacked()) + + def _evaluate_raw(self): + return self._trainer.evaluate() - def get_label_mapping(self) -> dict[int, str]: - return {index: x for index, x in enumerate(self._train_dataset.features["label"].names)} + def _evaluate_stacked(self): + y_pred = self.clf.predict(self._validation_dataset.sentences) + y_true = self._validation_dataset.label + return self._metric(y_pred, y_true) +# TODO: Implement me class BaselineTrainer: - pass + """ + This is where baseline method shall be implemented. It should accept the classifier and fit it on train_dataset. + It should then use eval_dataset to evaluate the classifier. + """ + def __init__(self, model, train_dataset, eval_dataset, metric): + pass + def train(self): + pass + + def evaluate(self): + pass + + +@dataclass class RerefenceClassifier: - def __init__(self, model, label_mapping: dict[int, str]): - self._model = model - self._label_mapping = label_mapping + _model: Any + _label_mapping: dict[int, str] def predict(self, X: list[list[str]]) -> list[str]: return [self._predict_single(x) for x in X] def _predict_single(self, sample: list[str]) -> str: + return self._label_mapping[int(np.argmax(self._predict_proba_single(sample)))] + + def predict_proba(self, X: list[list[str]]) -> list[list[float]]: + return [self._predict_proba_single(x) for x in X] + + def _predict_proba_single(self, sample: list[str]) -> list[float]: """ 1. Get predictions for each sentence 2. Square every prediction to reward confidence 3. Sum probabilities for each label - 4. Map to label and return """ - preds = np.array(x for x in self._model.predict_proba(sample)) - preds = np.power(preds, 2) - preds = preds.sum(axis=0) - return self._label_mapping[int(np.argmax(preds))] + return softmax(np.power(self._model.predict_proba(sample), 2).sum(axis=0)) diff --git a/src/sec_certs/utils/nlp.py b/src/sec_certs/utils/nlp.py index 67ae2869..496b71b8 100644 --- a/src/sec_certs/utils/nlp.py +++ b/src/sec_certs/utils/nlp.py @@ -1,8 +1,13 @@ +import numpy as np from sklearn.metrics import precision_score, recall_score -def prec_recall_metric(y_pred, y_test): +def prec_recall_metric(y_pred, y_true): return { - "precision": precision_score(y_test, y_pred, zero_division="warn", average="micro"), - "recall": recall_score(y_test, y_pred, zero_division="warn", average="micro"), + "precision": precision_score(y_true, y_pred, zero_division="warn", average="micro"), + "recall": recall_score(y_true, y_pred, zero_division="warn", average="micro"), } + + +def softmax(x): + return np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum() |
