{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import glob\n",
    "from pathlib import Path\n",
    "from matplotlib import pyplot as plt"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exploration of :garbage: PDFs"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "bad_reports = [\n",
    "    \"c5e25f90c7006546\", # ANSSI spacing\n",
    "    \"03bce905b71945aa\", # ANSSI spacing\n",
    "    \"3477723044183b31\", # ANSSI empty?\n",
    "    \"7e32023021d5aad2\", # empty?\n",
    "    \"4c9468f20fdb04f7\", # empty?\n",
    "    \"82c24f729c2e0092\", # ANSSI spacing\n",
    "    \"e1daa354ae5a61fd\", # ANSSI spacing\n",
    "    \"c80801f9a71b030e\", # ANSSI spacing\n",
    "]\n",
    "\n",
    "good_reports = [\n",
    "    \"2544ffa2d8eef431\", # Japan, short but OK\n",
    "    \"a0aa53cad9c5d049\", # Korea, OK, but low avg\n",
    "    \"10f1399a27470345\", # Korea, OK, but low avg\n",
    "    \"60c49ab7f7d33501\", # Korea, OK, but low avg\n",
    "    \"e133881d7203a6e4\", # Spain, OK\n",
    "    \"4ff70fb16691d53c\", # India, OK\n",
    "]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def average_line_length(text: str) -> float:\n",
    "    length = 0\n",
    "    lines = 0\n",
    "    for line in text.splitlines():\n",
    "        length += len(line)\n",
    "        lines += 1\n",
    "    if lines:\n",
    "        return length/lines\n",
    "    else:\n",
    "        return 0\n",
    "\n",
    "def overall_size(text: str) -> float:\n",
    "    return len(text)\n",
    "\n",
    "def num_lines(text: str) -> float:\n",
    "    return len(text.splitlines())\n",
    "\n",
    "def every_second_char(text: str) -> float:\n",
    "    c = 0\n",
    "    for line in text.splitlines():\n",
    "        if len(set(line[1::2])) > 1:\n",
    "            c += 1\n",
    "    return c\n",
    "\n",
    "def alpha_chars(text: str) -> float:\n",
    "    tl = len(text)\n",
    "    if tl == 0:\n",
    "        return 0\n",
    "    return len(\"\".join(filter(str.isalpha, text))) / tl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "LINES_THRESHOLD = 30\n",
    "SIZE_THRESHOLD = 1000\n",
    "AVG_LLEN_THRESHOLD = 10\n",
    "EVERY_SECOND_CHAR_THRESHOLD = 15\n",
    "ALPHA_CHARS_THRESHOLD = 0.5\n",
    "\n",
    "def garbage(text: str) -> bool:\n",
    "    size = len(text)\n",
    "    content_len = 0\n",
    "    lines = 0\n",
    "    every_second = 0\n",
    "    alpha_len = len(\"\".join(filter(str.isalpha, text)))\n",
    "    for line in text.splitlines():\n",
    "        content_len += len(line)\n",
    "        lines += 1\n",
    "        if len(set(line[1::2])) > 1:\n",
    "            every_second += 1\n",
    "\n",
    "    if lines:\n",
    "        avg_line_len = content_len / lines\n",
    "    else:\n",
    "        avg_line_len = 0\n",
    "    if size:\n",
    "        alpha = alpha_len / size\n",
    "    else:\n",
    "        alpha = 0\n",
    "\n",
    "    # If number of lines is small, this is garbage.\n",
    "    if lines < LINES_THRESHOLD:\n",
    "        return True\n",
    "    # If the file size is small, this is garbage.\n",
    "    if size < SIZE_THRESHOLD:\n",
    "        return True\n",
    "    # If the average length of a line is small, this is garbage.\n",
    "    if avg_line_len < AVG_LLEN_THRESHOLD:\n",
    "        return True\n",
    "    # If there a small amount of lines that have more than one character at every second character, this is garbage.\n",
    "    # This detects the ANSSI spacing issues.\n",
    "    if every_second < EVERY_SECOND_CHAR_THRESHOLD:\n",
    "        return True\n",
    "    # If there is a small ratio of alphanumeric chars to all chars, this is garbage.\n",
    "    if alpha < ALPHA_CHARS_THRESHOLD:\n",
    "        return True\n",
    "    return False\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "avgs = {}\n",
    "sizes = {}\n",
    "line_counts = {}\n",
    "schars = {}\n",
    "alphas = {}\n",
    "bad = set()\n",
    "for fname in glob.glob(\"../cc_dset/certs/reports/txt/*\"):\n",
    "    path = Path(fname)\n",
    "\n",
    "    with path.open(\"r\") as f:\n",
    "        text = f.read()\n",
    "    dgst = path.stem\n",
    "\n",
    "    avg = average_line_length(text)\n",
    "    size = overall_size(text)\n",
    "    nlines = num_lines(text)\n",
    "    schar = every_second_char(text)\n",
    "    alpha = alpha_chars(text)\n",
    "\n",
    "    avgs[dgst] = avg\n",
    "    sizes[dgst] = size\n",
    "    line_counts[dgst] = nlines\n",
    "    schars[dgst] = schar\n",
    "    alphas[dgst] = alpha\n",
    "\n",
    "    if nlines < 30:\n",
    "        print(f\"{dgst}:  nlines: {nlines:.2f}\")\n",
    "        bad.add(dgst)\n",
    "    if size < 1000:\n",
    "        print(f\"{dgst}:    size: {size:.2f}\")\n",
    "        bad.add(dgst)\n",
    "    if avg < 10:\n",
    "        print(f\"{dgst}:     avg: {avg:.2f}\")\n",
    "        bad.add(dgst)\n",
    "    if schar < 15:\n",
    "        print(f\"{dgst}:   schar: {schar:.2f}\")\n",
    "        bad.add(dgst)\n",
    "    if alpha < 0.5:\n",
    "        print(f\"{dgst}:   alpha: {alpha:.2f}\")\n",
    "        bad.add(dgst)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "print(len(bad))\n",
    "print(\"                  nlines,   size,   lavg, schar, alpha\")\n",
    "for b in bad:\n",
    "    print(f\"{b}: {line_counts[b]:>6}, {sizes[b]:>7}, {avgs[b]:>5.02f}, {schars[b]:>5}, {alphas[b]:>5.02f}\")\n",
    "for b in bad_reports:\n",
    "    print(b in bad)\n",
    "\n",
    "for b in good_reports:\n",
    "    print(b not in bad)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "plt.hist(line_counts.values(), bins=30);"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "plt.hist(sizes.values(), bins=30);"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "plt.hist(avgs.values(), bins=30);"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "plt.hist(schars.values(), bins=30);"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "plt.hist(alphas.values(), bins=30);"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "for pdf_name in glob.glob(\"../cc_dset/certs/reports/pdf/*.pdf\"):\n",
    "    pdf_path = Path(pdf_name)\n",
    "    dgst = pdf_path.stem\n",
    "\n",
    "    txt_path = Path(\"../cc_dset/certs/reports/txt\") / (dgst + \".txt\")\n",
    "    if not txt_path.exists():\n",
    "        print(dgst)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}