# %%
# Hand-labelled report digests used to sanity-check the heuristics below.
bad_reports = [
    "c5e25f90c7006546",  # ANSSI spacing
    "03bce905b71945aa",  # ANSSI spacing
    "3477723044183b31",  # ANSSI empty?
    "7e32023021d5aad2",  # empty?
    "4c9468f20fdb04f7",  # empty?
    "82c24f729c2e0092",  # ANSSI spacing
    "e1daa354ae5a61fd",  # ANSSI spacing
    "c80801f9a71b030e",  # ANSSI spacing
]

good_reports = [
    "2544ffa2d8eef431",  # Japan, short but OK
    "a0aa53cad9c5d049",  # Korea, OK, but low avg
    "10f1399a27470345",  # Korea, OK, but low avg
    "60c49ab7f7d33501",  # Korea, OK, but low avg
    "e133881d7203a6e4",  # Spain, OK
    "4ff70fb16691d53c",  # India, OK
]


# %%
def average_line_length(text: str) -> float:
    """Return the mean number of characters per line of ``text``.

    Lines are obtained with ``str.splitlines``, so line terminators are not
    counted. Text without any lines yields ``0.0``.
    """
    lines = text.splitlines()
    if not lines:
        return 0.0
    return sum(map(len, lines)) / len(lines)


def overall_size(text: str) -> int:
    """Return the total number of characters in ``text`` (terminators included)."""
    return len(text)


def num_lines(text: str) -> int:
    """Return the number of lines in ``text`` as split by ``str.splitlines``."""
    return len(text.splitlines())


def every_second_char(text: str) -> int:
    """Count lines whose odd-indexed characters are not all the same.

    For each line the characters at indices 1, 3, 5, ... are collected; the
    line is counted when that sample holds more than one distinct character.
    A line in which every other character is the same filler (e.g. a space)
    is therefore not counted.
    """
    return sum(1 for line in text.splitlines() if len(set(line[1::2])) > 1)


def alpha_chars(text: str) -> float:
    """Return the fraction of characters in ``text`` for which ``str.isalpha`` holds.

    Empty text yields ``0.0``.
    """
    if not text:
        return 0.0
    # Summing the booleans counts the alphabetic characters without building
    # an intermediate string.
    return sum(map(str.isalpha, text)) / len(text)


# %%
LINES_THRESHOLD = 30
SIZE_THRESHOLD = 1000
AVG_LLEN_THRESHOLD = 10
EVERY_SECOND_CHAR_THRESHOLD = 15
ALPHA_CHARS_THRESHOLD = 0.5


def garbage(text: str) -> bool:
    """Heuristically decide whether extracted report text is unusable.

    The text is classified as garbage as soon as any single metric falls
    below its module-level threshold. The metrics are the same helper
    functions used for the exploratory statistics, so both always agree.

    :param text: Full text extracted from one report.
    :return: ``True`` if any criterion flags the text, ``False`` otherwise.
    """
    return (
        # If the number of lines is small, this is garbage.
        num_lines(text) < LINES_THRESHOLD
        # If the file size is small, this is garbage.
        or overall_size(text) < SIZE_THRESHOLD
        # If the average length of a line is small, this is garbage.
        or average_line_length(text) < AVG_LLEN_THRESHOLD
        # If there is only a small number of lines that have more than one
        # distinct character at every second position, this is garbage.
        # This detects the ANSSI spacing issues.
        or every_second_char(text) < EVERY_SECOND_CHAR_THRESHOLD
        # If there is a small ratio of alphabetic chars to all chars, this is garbage.
        or alpha_chars(text) < ALPHA_CHARS_THRESHOLD
    )
# %%
# Locations of the extracted report texts and of the source PDFs.
TXT_DIR = Path("../cc_dset/certs/reports/txt")
PDF_DIR = Path("../cc_dset/certs/reports/pdf")

# Per-report metrics, keyed by the report digest (the file stem).
avgs = {}
sizes = {}
line_counts = {}
schars = {}
alphas = {}
# Digests of reports flagged by at least one metric.
bad = set()

# Sorted so the printed output is the same on every run.
for fname in sorted(glob.glob(str(TXT_DIR / "*"))):
    path = Path(fname)
    text = path.read_text()
    dgst = path.stem

    avg = average_line_length(text)
    size = overall_size(text)
    nlines = num_lines(text)
    schar = every_second_char(text)
    alpha = alpha_chars(text)

    avgs[dgst] = avg
    sizes[dgst] = size
    line_counts[dgst] = nlines
    schars[dgst] = schar
    alphas[dgst] = alpha

    # Compare every metric against the same thresholds that garbage() uses,
    # reporting each metric that flags the report.
    checks = (
        ("nlines", nlines, LINES_THRESHOLD),
        ("size", size, SIZE_THRESHOLD),
        ("avg", avg, AVG_LLEN_THRESHOLD),
        ("schar", schar, EVERY_SECOND_CHAR_THRESHOLD),
        ("alpha", alpha, ALPHA_CHARS_THRESHOLD),
    )
    for label, value, threshold in checks:
        if value < threshold:
            print(f"{dgst}: {label}: {value:.2f}")
            bad.add(dgst)

# %%
# Summary of the flagged reports, then a check against the hand-labelled lists:
# every printed value below the table should be True.
print(len(bad))
print(" nlines, size, lavg, schar, alpha")
for b in sorted(bad):
    print(f"{b}: {line_counts[b]:>6}, {sizes[b]:>7}, {avgs[b]:>5.02f}, {schars[b]:>5}, {alphas[b]:>5.02f}")
for b in bad_reports:
    print(b in bad)

for b in good_reports:
    print(b not in bad)

# %%
def plot_metric_hist(values, title, bins=30):
    """Draw a labelled histogram of one per-report metric.

    :param values: Iterable of metric values, one per report.
    :param title: Metric name, used as the plot title and the x-axis label.
    :param bins: Number of histogram bins.
    :return: The matplotlib ``Axes`` the histogram was drawn on.
    """
    fig, ax = plt.subplots()
    ax.hist(list(values), bins=bins)
    ax.set(title=title, xlabel=title, ylabel="number of reports")
    return ax

# %%
plot_metric_hist(line_counts.values(), "number of lines");

# %%
plot_metric_hist(sizes.values(), "size (characters)");

# %%
plot_metric_hist(avgs.values(), "average line length");

# %%
plot_metric_hist(schars.values(), "lines with varying every-second character");

# %%
plot_metric_hist(alphas.values(), "ratio of alphabetic characters");

# %%
# List the digests of PDFs that have no corresponding extracted text file.
for pdf_name in sorted(glob.glob(str(PDF_DIR / "*.pdf"))):
    dgst = Path(pdf_name).stem

    txt_path = TXT_DIR / (dgst + ".txt")
    if not txt_path.exists():
        print(dgst)