{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Manual CPE matching evaluation\n",
    "\n",
    "This notebook assists the manual matching evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sec_certs.dataset import CCDataset\n",
    "import pandas as pd\n",
    "import json\n",
    "import tempfile\n",
    "from sec_certs.utils.label_studio_utils import to_label_studio_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the input data for label studio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading CC Dataset: 100%|██████████| 139M/139M [00:15<00:00, 9.61MB/s] \n"
     ]
    }
   ],
   "source": [
    "dset = CCDataset.from_web()\n",
    "df = dset.to_pandas()\n",
    "\n",
    "eval_digests = pd.read_csv(\"./../../data/cpe_eval/random.csv\", sep=\";\").set_index(\"dgst\").index\n",
    "eval_certs = df.loc[(df.index.isin(eval_digests)) & (df.cpe_matches.notnull())].copy()\n",
    "\n",
    "# It may be handy to display max number of assigned cpe matches here\n",
    "eval_certs[\"n_cpes\"] = eval_certs.cpe_matches.map(len)\n",
    "max_n_cpes = eval_certs.n_cpes.max()\n",
    "print(f\"Max CPE matches: {max_n_cpes}\")\n",
    "\n",
    "# Now you may want to adjust the key `cpe_n_max_matches` config in sec_certs/config/settings.yml according to max_n_cpes\n",
    "# This helps to avoid clutter in label studio interface\n",
    "with tempfile.TemporaryDirectory() as tmp_dir:\n",
    "    dset.root_dir = tmp_dir\n",
    "    dset.certs = {x.dgst: x for x in dset if x.dgst in eval_certs.index.tolist()}\n",
    "    to_label_studio_json(dset, \"./label_studio_input_data.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`Now you import this data to label studio and label it`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data from label studio and show the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluated 607 CPE matches in 100 certificates\n",
      "In total, 546 (89.95%) are correct (precision of the positive class).\n",
      "Also, 81 (81.00%) certificates have perfect matches.\n",
      "\\newcommand{\\evalCcPrecision}{$89.95\\%$}\n",
      "\\newcommand{\\evalCcRatioErrorFree}{$81\\%$}\n"
     ]
    }
   ],
   "source": [
    "with open(\"./../../data/cpe_eval/manual_cpe_labels.json\", \"r\") as handle:\n",
    "    data = json.load(handle)\n",
    "\n",
    "results = []\n",
    "for sample in data:\n",
    "    option_keys = [key for key in sample.keys() if \"option_\" in key]\n",
    "    n_cpe_matches = len([sample[key] for key in option_keys if sample[key] != \"No good match\"])\n",
    "\n",
    "    if not \"verified_cpe_match\" in sample.keys():\n",
    "        n_wrong = 0\n",
    "    elif isinstance(sample[\"verified_cpe_match\"], str):\n",
    "        n_wrong = 1\n",
    "    else:\n",
    "        n_wrong = len(sample[\"verified_cpe_match\"][\"choices\"])\n",
    "\n",
    "    results.append((n_cpe_matches, n_wrong))\n",
    "\n",
    "correct = [x[0] - x[1] for x in results]\n",
    "wrong = [x[1] for x in results]\n",
    "n_candidates = [x[0] for x in results]\n",
    "completely_right = [x == 0 for x in wrong]\n",
    "\n",
    "precision = 100 * sum(correct) / sum(n_candidates)\n",
    "completely_right_ratio = 100 * sum(completely_right) / len(n_candidates)\n",
    "\n",
    "print(f\"Evaluated {sum(n_candidates)} CPE matches in {len(results)} certificates\")\n",
    "print(f\"In total, {sum(correct)} ({precision:.2f}%) are correct (precision of the positive class).\")\n",
    "print(f\"Also, {sum(completely_right)} ({completely_right_ratio:.2f}%) certificates have perfect matches.\")\n",
    "\n",
    "print(f\"\\\\newcommand{{\\\\evalCcPrecision}}{{${precision:.2f}\\%$}}\")\n",
    "print(f\"\\\\newcommand{{\\\\evalCcRatioErrorFree}}{{${completely_right_ratio:.0f}\\%$}}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.13 ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "a5b8c5b127d2cfe5bc3a1c933e197485eb9eba25154c3661362401503b4ef9d4"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}