# Reference-prediction training pipeline (notebook source, cells flattened).
#
# --- Cell 1: imports, path constants, logging configuration -----------------
import logging
import sys
from ast import literal_eval
from pathlib import Path

import pandas as pd
import torch
from catboost import Pool

from sec_certs.dataset import CCDataset
from sec_certs.model.references_nlp.evaluation import (
    evaluate_model,
)
from sec_certs.model.references_nlp.feature_extraction import (
    build_embeddings,
    dataframe_to_training_arrays,
    extract_geometrical_features,
    extract_language_features,
    extract_prediction_features,
    extract_segments,
    perform_dimensionality_reduction,
)
from sec_certs.model.references_nlp.training import train_model

REPO_ROOT = Path().resolve()
DATASET_PATH = REPO_ROOT / "dataset/cc_november_23/dataset.json"
TENSORBOARD_DATA_DIR = REPO_ROOT / "dataset/tensorboard_visualisation/"
TRAINED_MODEL_PATH = REPO_ROOT / "dataset/reference_prediction/final_model"

print(f"GPU available: {torch.cuda.is_available()}")

logger = logging.getLogger(__name__)
logging.getLogger("setfit").setLevel(logging.CRITICAL)
logging.getLogger("sentence_transformers").setLevel(logging.CRITICAL)
# NOTE: this is a StreamHandler writing to stderr, not a FileHandler — renamed
# from the misleading `file_handler`.
stream_handler = logging.StreamHandler(sys.stderr)
stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
logging.basicConfig(level=logging.INFO, handlers=[stream_handler])

# --- Cell 2: extract segments, build features, train + evaluate -------------
mode = "evaluation"
cc_dset = CCDataset.from_json(DATASET_PATH)

# Extract reference segments and round-trip through CSV so the expensive
# extraction is persisted; literal_eval restores the list-valued columns.
df = extract_segments(cc_dset, mode=mode)
df.to_csv(REPO_ROOT / "dataset/reference_prediction/dataset.csv", index=False)

df = (
    pd.read_csv(REPO_ROOT / "dataset/reference_prediction/dataset.csv")
    .assign(
        segments=lambda df_: df_.segments.apply(literal_eval),
        actual_reference_keywords=lambda df_: df_.actual_reference_keywords.apply(literal_eval),
    )
    .loc[lambda df_: (df_.label != "IRRELEVANT")]
)

# Collapse the fine-grained annotation labels into two target classes.
label_mapping = {
    "COMPONENT_USED": "COMPONENT_USED",
    "RE-EVALUATION": "PREVIOUS_VERSION",
    "EVALUATION_REUSED": "COMPONENT_USED",
    "PREVIOUS_VERSION": "PREVIOUS_VERSION",
    "COMPONENT_SHARED": "COMPONENT_USED",
}
df.label = df.label.map(label_mapping)

# BUGFIX: the original passed REPO_ROOT / "/dataset/reference_prediction/final_model".
# A leading slash makes the right-hand operand absolute, so pathlib silently
# discards REPO_ROOT and resolves to /dataset/... . Reuse the constant instead.
df, annotator = build_embeddings(
    df,
    mode=mode,
    method="transformer",
    model_path=TRAINED_MODEL_PATH,
)
df = perform_dimensionality_reduction(
    df,
    mode,
)
df = extract_language_features(df, cc_dset)
df = extract_prediction_features(df, annotator._model)
df = extract_geometrical_features(df)

# Obtained from running the feature selection algorithm below
features_to_use = [
    "pca_mean_x",
    "pca_mean_y",
    "pca_var_y",
    "pca_cov_xy",
    "pca_median_x",
    "pca_median_y",
    "pca_std_distance_to_centroid",
    "pca_point_density",
    "umap_mean_x",
    "umap_mean_y",
    "umap_skew_y",
    "umap_cov_xy",
    "umap_median_x",
    "umap_median_y",
    "umap_max_distance_to_centroid",
    "umap_aspect_ratio",
    "lang_partial_ratio",
    "lang_token_sort_ratio",
    "lang_n_segments",
    "lang_matches_recertification",
    "lang_n_intersection_versions",
    "lang_common_words",
    "lang_bigram_overlap",
    "lang_common_suffix_len",
    "lang_character_trigram_overlap",
    "lang_len_difference",
    "pred_0",
    "pred_2",
    "pred_3",
    "pred_4",
]
df_ = df[features_to_use + ["label", "split"]]

x_train, y_train, x_valid, y_valid, features = dataframe_to_training_arrays(
    df_, mode=mode, use_pca=True, use_umap=True, use_pred=True, use_lang=True
)

clf = train_model(
    mode,
    x_train,
    y_train,
    x_valid,
    y_valid,
    train_baseline=False,
)
evaluate_model(
    clf,
    x_valid,
    y_valid,
    features,
    output_path=None,
)

# Classify the whole dataset and serialize the result
x_all = df[features_to_use].values
df["y_pred"] = clf.predict(x_all)
# Keep the human annotation where one exists; fall back to the prediction.
df["reference_label"] = df.label.fillna(df.y_pred)
# BUGFIX: same absolute-path bug as above — the original
# REPO_ROOT / "/dataset/reference_prediction/predictions.csv" ignored REPO_ROOT.
df[["dgst", "canonical_reference_keyword", "reference_label"]].to_csv(
    REPO_ROOT / "dataset/reference_prediction/predictions.csv"
)

# --- Cell 3 (markdown): ## Run feature selection algorithm ------------------

# --- Cell 4: CatBoost recursive feature selection ---------------------------
train_pool = Pool(x_train, y_train, feature_names=features)
valid_pool = Pool(x_valid, y_valid, feature_names=features)

dct = clf.select_features(
    train_pool,
    eval_set=valid_pool,
    features_for_select=features,
    num_features_to_select=30,
    train_final_model=False,
    verbose=False,
)

features_to_use = dct["selected_features_names"]
df_lim_features = df[features_to_use + ["label", "split"]]
x_train, y_train, x_valid, y_valid, features = dataframe_to_training_arrays(
    df_lim_features, mode=mode, use_pca=True, use_umap=True, use_pred=True, use_lang=True
)

# BUGFIX: the original call omitted the leading `mode` argument that the first
# train_model() call passes, shifting every positional argument by one.
clf = train_model(mode, x_train, y_train, x_valid, y_valid, train_baseline=False)
evaluate_model(
    clf,
    x_valid,
    y_valid,
    features,
    output_path=REPO_ROOT / "dataset/cc_ref_annotator_evaluation/embeddings",
)

# --- Cell 5 (markdown): ## Serialize misclassified instances ----------------

# --- Cell 6: dump misclassified validation instances for manual review ------
scheme_mapping = {x.dgst: x.scheme for x in cc_dset}
all_classified_instances = df.loc[(df.label.notnull())].assign(scheme=lambda df_: df_.dgst.map(scheme_mapping))
misclassified_instances = df.loc[
    (df.y_pred != df.label) & (df.label.notnull()),
    [
        "dgst",
        "canonical_reference_keyword",
        "actual_reference_keywords",
        "label",
        "y_pred",
        "split",
        "segments",
        "referenced_cert_name",
        "cert_versions",
        "referenced_cert_versions",
        "lang_partial_ratio",
        "lang_token_sort_ratio",
    ],
].assign(
    report_link=lambda df_: df_.dgst.map(lambda x: f"https://seccerts.org/cc/{x}/report.pdf"),
    st_link=lambda df_: df_.dgst.map(lambda x: f"https://seccerts.org/cc/{x}/target.pdf"),
    scheme=lambda df_: df_.dgst.map(scheme_mapping),
)

# Then replace all \\/ with / in the corresponding json, as the pandas to_json method escapes the slashes.
misclassified_instances.to_json(
    REPO_ROOT / "dataset/misclassified_references_validation_set.json",
    orient="records",
    indent=4,
)

# --- Cell 7: misclassification rate per certification scheme ----------------
# Display proportion of misclassifications per scheme. Only DE and FR have sufficient support to make any conclusions.
# FR 4times more likely to be misclassified than DE
misclassified_instances.scheme.value_counts() * 100 / all_classified_instances.scheme.value_counts()