{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sec_certs.dataset import CCDataset\n", "from pathlib import Path\n", "import shutil\n", "import subprocess\n", "from sec_certs.utils import helpers\n", "from urllib.parse import unquote_plus, urlparse" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def hash_file(path):\n", "    \"\"\"Hash the file at `path` by delegating to `helpers.get_sha256_filepath`.\"\"\"\n", "    return helpers.get_sha256_filepath(path)\n", "\n", "\n", "def extract_filename(link):\n", "    \"\"\"Return the URL-decoded last path segment of `link`, or None when `link` is empty or None.\n", "\n", "    The query string and fragment are not part of the result; `+` and %XX escapes are decoded.\n", "    \"\"\"\n", "    if not link:\n", "        # urlparse(None) returns a bytes result; str() of its empty path is the literal b'', not an empty name.\n", "        return None\n", "    return unquote_plus(urlparse(link).path.split(\"/\")[-1])" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n", "True\n" ] } ], "source": [ "# A directory with a dump of the documents that will enrich the dataset.\n", "doc_dump_dir = Path(\"cc_certs_09_10_2022\")\n", "print(doc_dump_dir.exists())\n", "\n", "# An output directory for the dataset.\n", "dataset_dir = Path(\"cc_09_10_2022\")\n", "print(dataset_dir.exists())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "95d1ec0\n" ] }, { "data": { "text/plain": [ "CompletedProcess(args=['git', 'rev-parse', '--short', 'HEAD'], returncode=0)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Print the current commit hash\n", "subprocess.run([\"git\", \"rev-parse\", \"--short\", \"HEAD\"])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tesseract 5.2.0\n", " leptonica-1.82.0\n", " libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 2.1.3) : libpng 1.6.38 : libtiff 4.4.0 : zlib 1.2.12 : libwebp 1.2.4 : libopenjp2 2.5.0\n", " Found AVX2\n", " Found AVX\n", " Found FMA\n", " Found SSE4.1\n", " Found OpenMP 201511\n", " Found libarchive 3.6.1 zlib/1.2.12
liblzma/5.2.5 bz2lib/1.0.8 liblz4/1.9.3 libzstd/1.5.2\n", " Found libcurl/7.85.0 OpenSSL/1.1.1q zlib/1.2.12 brotli/1.0.9 zstd/1.5.2 libidn2/2.3.3 libpsl/0.21.1 (+libidn2/2.3.0) libssh2/1.10.0 nghttp2/1.50.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "pdftotext version 22.09.0\n", "Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org\n", "Copyright 1996-2011, 2022 Glyph & Cog, LLC\n" ] } ], "source": [ "# Print tool versions\n", "subprocess.run([\"pdftotext\", \"-v\"])\n", "subprocess.run([\"tesseract\", \"-v\"]);" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Create the dataset\n", "dset = CCDataset(root_dir=dataset_dir, name=\"cc\", description=\"Final run on 09.10.2022\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00, 7.01s/it]\n", "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.00s/it]\n", "The CSV cc_09_10_2022/web/cc_products_active.csv contains 8 duplicates by the primary key.\n", "The CSV cc_09_10_2022/web/cc_products_archived.csv contains 10 duplicates by the primary key.\n", "When merging certificates with dgst b26ce64e0c677e3d, the following mismatch occured: Attribute=security_level, self[security_level]={'ALC_DVS.1', 'EAL2+'}, other[security_level]={'EAL2'}\n", "When merging certificates with dgst 7f043162f88a1c3a, the following mismatch occured: Attribute=not_valid_after, self[not_valid_after]=2024-09-19, other[not_valid_after]=2024-09-18\n", "When merging certificates with dgst 109eb2158ca6a2f9, the following mismatch occured: Attribute=security_level, 
self[security_level]={'ADV_IMP.2', 'AVA_VLA.4', 'AVA_MSU.3', 'EAL4+', 'ATE_DPT.2'}, other[security_level]={'ADV_DVS.2', 'AVA_VLA.4', 'AVA_MSU.3', 'EAL5+'}\n", "When merging certificates with dgst c437d26a62a22c39, the following mismatch occured: Attribute=not_valid_after, self[not_valid_after]=2019-11-19, other[not_valid_after]=2018-05-07\n" ] } ], "source": [ "# Get the metadata\n", "dset.get_certs_from_web()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Duplicate entry in PP dataset: ('Protection Profile for Enterprise Security Management Policy Management', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ESM_PM_V1.4.pdf')\n", "Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')\n", "Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')\n", "Duplicate entry in PP dataset: ('Public Key-Enabled Application Family of Protection Profiles', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_VID3004-PP.pdf')\n", "Duplicate entry in PP dataset: ('Protection Profile for Wireless Local Area Network (WLAN) Access Systems', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_WLAN_AS_V1.0.pdf')\n", "Duplicate entry in PP dataset: ('PC Client Specific Trusted Platform Module Family 1.2; Level 2, Version 1.1', 'https://www.commoncriteriaportal.org/files/ppfiles/pp0030b.pdf')\n", "Duplicate entry in PP dataset: ('Protection Profile for Enterprise Security Management Access Control', 'https://www.commoncriteriaportal.org/files/ppfiles/pp_esm_ac_v2.1.pdf')\n", "Duplicate entry in PP dataset: ('U.S. 
Government Protection Profile Intrusion Detection System - System for Basic Robustness Environments, Version 1.7', 'https://www.commoncriteriaportal.org/files/ppfiles/pp_ids_sys_br_v1.7.pdf')\n", "Duplicate entry in PP dataset: ('Network Device Protection Profile Extended Package SIP Server', 'https://www.commoncriteriaportal.org/files/ppfiles/pp_nd_sip_ep_v1.1.pdf')\n", "Duplicate entry in PP dataset: ('Network Device Protection Profile Extended Package VPN Gateway', 'https://www.commoncriteriaportal.org/files/ppfiles/pp_nd_vpn_gw_ep_v1.1.pdf')\n", "Duplicate entry in PP dataset: ('Extended Package for Secure Shell (SSH), Version 1.0', 'https://www.commoncriteriaportal.org/files/ppfiles/pp_ssh_ep_v1.0.pdf')\n", "Duplicate entry in PP dataset: ('Protection Profile for Software Full Disk Encryption', 'https://www.commoncriteriaportal.org/files/ppfiles/pp_swfde_v1.1.pdf')\n", "Duplicate entry in PP dataset: ('Smart Card Security User Group - Smart Card Protection Profile, Version 3.0', 'https://www.commoncriteriaportal.org/files/ppfiles/scsugpp.pdf')\n" ] } ], "source": [ "# Process the protection profiles\n", "dset.process_protection_profiles()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Enrich the dataset with the pre-downloaded PDFs (collected over multiple runs to fill in the timeouts and disappearances).\n", "def copy_from_dump(kind, dgst, destination):\n", "    \"\"\"Copy doc_dump_dir/{kind}/pdf/{dgst}.pdf to `destination`; return True only if that dump file exists.\"\"\"\n", "    dumped_pdf = doc_dump_dir / kind / \"pdf\" / f\"{dgst}.pdf\"\n", "    if not dumped_pdf.exists():\n", "        return False\n", "    shutil.copy(dumped_pdf, destination)\n", "    return True\n", "\n", "\n", "dset.reports_pdf_dir.mkdir(parents=True, exist_ok=True)\n", "dset.targets_pdf_dir.mkdir(parents=True, exist_ok=True)\n", "for cert in dset:\n", "    # State is only touched for documents actually found in the dump; everything else keeps its previous values.\n", "    if copy_from_dump(\"report\", cert.dgst, cert.state.report_pdf_path):\n", "        cert.state.report_download_ok = True\n", "        cert.state.report_pdf_hash = hash_file(cert.state.report_pdf_path)\n", "        cert.pdf_data.report_filename = extract_filename(cert.report_link)\n", "\n", "    if copy_from_dump(\"target\", cert.dgst, cert.state.st_pdf_path):\n", "        cert.state.st_download_ok = True\n", "        cert.state.st_pdf_hash = hash_file(cert.state.st_pdf_path)\n", "        cert.pdf_data.st_filename = extract_filename(cert.st_link)" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading reports: 0%| | 0/14 [00:00txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: bfc86f14e2484335 failed to convert report pdf->txt\n", "Converting reports to txt: 3%|███ | 169/5115 [00:09<04:21, 18.89it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/1ee7ecee9e7e131c.pdf\n", "Converting reports to txt: 4%|███▍ | 194/5115 [00:10<04:03, 20.19it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/d8c205b4924f91b3.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/f8a79eb20ade29af.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/3477723044183b31.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/82c24f729c2e0092.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/20874d4fa6e4c878.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/b3757b6bb5f884d4.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/0f9468a2e263d3a3.pdf\n", "Converting reports to txt: 4%|███▋ | 209/5115 [00:21<04:52, 16.79it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/66bbd8fb83f0abbb.pdf\n", "Converting reports to txt: 4%|███▋ | 210/5115 [00:32<41:11, 1.98it/s]Detected garbage during conversion of
cc_09_10_2022/certs/reports/pdf/e1daa354ae5a61fd.pdf\n", "Converting reports to txt: 4%|███▋ | 211/5115 [03:29<7:43:05, 5.67s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/7a0e44767b51f0c4.pdf\n", "Converting reports to txt: 4%|███▋ | 212/5115 [03:32<7:34:16, 5.56s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/051066781c36ad55.pdf\n", "Converting reports to txt: 4%|███▋ | 214/5115 [03:54<8:12:36, 6.03s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/03bce905b71945aa.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/0286c223fc4e626a.pdf\n", "Converting reports to txt: 4%|███▊ | 216/5115 [03:57<7:05:01, 5.21s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/559e4549a852aab2.pdf\n", "Converting reports to txt: 4%|███▊ | 220/5115 [03:58<4:22:42, 3.22s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/c5e25f90c7006546.pdf\n", "Converting reports to txt: 4%|███▊ | 221/5115 [05:32<19:48:13, 14.57s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/744a7a202d909323.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/449c74a92ebb61a4.pdf\n", "Converting reports to txt: 4%|███▊ | 222/5115 [07:49<45:00:22, 33.11s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/c80801f9a71b030e.pdf\n", "Converting reports to txt: 7%|██████ | 339/5115 [08:27<19:08, 4.16it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/852cad2f53bda148.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/b0deb8e9d0026b64.pdf\n", "Converting reports to txt: 7%|██████▏ | 351/5115 [08:28<15:46, 5.03it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/56be1d745bbb4994.pdf\n", "Converting reports to txt: 9%|███████▉ | 446/5115 [08:50<08:50, 8.80it/s]Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/af6767c887ae930a.pdf\n", "Converting reports to txt: 9%|████████ | 455/5115 [09:02<08:47, 8.84it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/d2f75cdaeb322e7f.pdf\n", "Converting reports to txt: 9%|███████▉ | 456/5115 [09:12<1:06:21, 1.17it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/d7300e1bfafed26e.pdf\n", "Converting reports to txt: 9%|███████▉ | 459/5115 [10:47<6:22:07, 4.92s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/7015af58766db091.pdf\n", "Converting reports to txt: 9%|████████ | 467/5115 [12:48<11:23:24, 8.82s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/85328590c0d7273f.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/e56e14e8cbf89e3c.pdf\n", "Converting reports to txt: 9%|████████▏ | 474/5115 [13:11<8:54:27, 6.91s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/4844355021ea22f1.pdf\n", "Converting reports to txt: 9%|████████▎ | 476/5115 [13:19<8:26:32, 6.55s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/21a1deda31c3e213.pdf\n", "Converting reports to txt: 9%|████████▎ | 477/5115 [13:20<7:51:36, 6.10s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/d8aff63d56506605.pdf\n", "Converting reports to txt: 9%|████████▏ | 479/5115 [14:07<13:56:49, 10.83s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/aca070da96add3b6.pdf\n", "Converting reports to txt: 9%|████████▎ | 480/5115 [14:39<18:14:05, 14.16s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/c8982f2e8de39b22.pdf\n", "Converting reports to txt: 10%|████████▌ | 489/5115 [14:40<6:09:04, 4.79s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/977b04d04def60a7.pdf\n", "Converting reports to txt: 10%|████████▌ | 494/5115 [15:01<5:53:31, 4.59s/it]Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/6eeed6fefb1f0243.pdf\n", "Converting reports to txt: 10%|████████▌ | 496/5115 [17:12<18:45:44, 14.62s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/62313881916550ca.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/6f8d7a6a1dea6a3a.pdf\n", "Converting reports to txt: 10%|████████▌ | 497/5115 [17:17<17:27:33, 13.61s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/e6cffd14e732a030.pdf\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Converting reports to txt: 10%|████████▌ | 498/5115 [17:18<15:20:52, 11.97s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/47913a485c3c8a18.pdf\n", "Converting reports to txt: 10%|████████▊ | 503/5115 [17:29<9:17:26, 7.25s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/b0b7b073ca2dfe5f.pdf\n", "Converting reports to txt: 10%|████████▋ | 504/5115 [18:31<17:41:13, 13.81s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/60f3bd10ee9be85b.pdf\n", "Converting reports to txt: 10%|████████▋ | 505/5115 [19:14<23:14:43, 18.15s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/5c3806bf79eeab7f.pdf\n", "Converting reports to txt: 10%|████████▊ | 514/5115 [21:40<21:48:26, 17.06s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/4c9468f20fdb04f7.pdf\n", "Converting reports to txt: 10%|████████▉ | 518/5115 [21:41<14:52:15, 11.65s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/9ee9f3c598318ae5.pdf\n", "Converting reports to txt: 11%|█████████▌ | 547/5115 [21:54<2:06:01, 1.66s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/afde7a688dcc5c83.pdf\n", "Converting reports to txt: 17%|███████████████ | 849/5115 [22:42<09:07, 7.79it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/4de4a9f436958574.pdf\n", "Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/98668cafb8dda26f.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/98668cafb8dda26f.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 98668cafb8dda26f failed to convert report pdf->txt\n", "Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/02958808b57334f0.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/02958808b57334f0.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 02958808b57334f0 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/4de4a9f436958574.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 4de4a9f436958574 failed to convert report pdf->txt\n", "Converting reports to txt: 22%|███████████████████▉ | 1132/5115 [23:06<04:36, 14.41it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/9903c89cbb35eec1.pdf\n", "Converting reports to txt: 24%|█████████████████████▉ | 1245/5115 [23:15<05:08, 12.54it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/1faf7bf1f74a1851.pdf\n", "Converting reports to txt: 28%|█████████████████████████ | 1423/5115 [23:30<04:34, 13.46it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/422dc5758723c7d1.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/422dc5758723c7d1.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 422dc5758723c7d1 failed to convert report pdf->txt\n", "Converting reports to txt: 31%|████████████████████████████▎ | 1608/5115 [23:43<04:03, 14.41it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/b93dfa0a8ec3609c.pdf\n", "Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/6ed5861434d5fed9.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/3df01446a2b92093.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/209e2d18e1fa0cca.pdf\n", "Converting reports to txt: 35%|███████████████████████████████▏ | 1769/5115 [24:02<07:50, 7.11it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/3e5591d086a54905.pdf\n", "Converting reports to txt: 36%|████████████████████████████████▌ | 1852/5115 [24:10<05:03, 10.74it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/d3568613c552f9e8.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/d3568613c552f9e8.pdf, using garbage: pdftoppm failed: 99\n", "Converting reports to txt: 38%|██████████████████████████████████▎ | 1951/5115 [24:19<04:47, 11.00it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/822e5f25e7c4631e.pdf\n", "Converting reports to txt: 38%|██████████████████████████████████▍ | 1960/5115 [24:21<06:34, 8.00it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/2d05e976e1adf696.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/2d05e976e1adf696.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 2d05e976e1adf696 failed to convert report pdf->txt\n", "Converting reports to txt: 41%|█████████████████████████████████████▏ | 2110/5115 [24:33<04:05, 12.26it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/69d373903e9635ef.pdf\n", "Converting reports to txt: 42%|█████████████████████████████████████▉ | 2153/5115 [24:37<04:17, 11.51it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/ae1b80e9cccec491.pdf\n", "Converting reports to txt: 42%|██████████████████████████████████████▏ | 2172/5115 [24:42<07:16, 6.74it/s]Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/21d5f1b05e1ba491.pdf\n", "Converting reports to txt: 47%|██████████████████████████████████████████ | 2390/5115 [25:09<03:33, 12.75it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/3f8cee6d5bee5397.pdf\n", "Converting reports to txt: 56%|██████████████████████████████████████████████████▋ | 2884/5115 [26:17<04:55, 7.54it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/7e32023021d5aad2.pdf\n", "Converting reports to txt: 59%|████████████████████████████████████████████████████▉ | 3008/5115 [26:49<03:53, 9.02it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/b43100f226a61f53.pdf\n", "Converting reports to txt: 59%|█████████████████████████████████████████████████████▏ | 3023/5115 [26:50<03:20, 10.43it/s]Error during OCR of cc_09_10_2022/certs/reports/pdf/b43100f226a61f53.pdf, using garbage: pdftoppm failed: 99\n", "Converting reports to txt: 75%|███████████████████████████████████████████████████████████████████▎ | 3827/5115 [28:03<01:36, 13.34it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf\n", "Converting reports to txt: 75%|███████████████████████████████████████████████████████████████████▌ | 3843/5115 [28:04<01:31, 13.88it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/6db03eff148934a4.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf, using garbage: pdftoppm failed: 1\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/03cc150ce40cf816.pdf\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cert dgst: f9c8da9deff77ab5 failed to convert report pdf->txt\n", "Converting reports to txt: 76%|████████████████████████████████████████████████████████████████████▌ | 3900/5115 [28:07<01:29, 13.63it/s]Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/3570791ff9c92912.pdf\n", "Converting reports to txt: 77%|█████████████████████████████████████████████████████████████████████▍ | 3948/5115 [28:12<01:59, 9.80it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/ed886279d0d61096.pdf\n", "Converting reports to txt: 86%|█████████████████████████████████████████████████████████████████████████████▌ | 4407/5115 [29:04<01:00, 11.64it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/c763b37bb95517e9.pdf\n", "Converting reports to txt: 87%|█████████████████████████████████████████████████████████████████████████████▉ | 4430/5115 [29:05<00:53, 12.70it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/40640508ac3a37f2.pdf\n", "Error when converting pdf->txt: poppler error creating document\n", "Converting reports to txt: 87%|██████████████████████████████████████████████████████████████████████████████▏ | 4444/5115 [29:07<00:57, 11.59it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/7147c2f70d983d57.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 7147c2f70d983d57 failed to convert report pdf->txt\n", "Converting reports to txt: 95%|█████████████████████████████████████████████████████████████████████████████████████ | 4835/5115 [29:41<00:25, 11.10it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/3ad7749ef678ce95.pdf\n", "Converting reports to txt: 95%|█████████████████████████████████████████████████████████████████████████████████████▊ | 4877/5115 [29:44<00:16, 14.32it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/1683817e5db17d9c.pdf\n", "Converting reports to txt: 96%|██████████████████████████████████████████████████████████████████████████████████████▏ | 4899/5115 [29:45<00:13, 16.09it/s]Detected garbage during conversion of 
cc_09_10_2022/certs/reports/pdf/b762fcb2983aa148.pdf\n", "Converting reports to txt: 96%|██████████████████████████████████████████████████████████████████████████████████████▌ | 4919/5115 [29:47<00:15, 12.37it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/cc681ca95621a6db.pdf\n", "Converting reports to txt: 96%|██████████████████████████████████████████████████████████████████████████████████████▊ | 4931/5115 [29:48<00:15, 11.60it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/889693d2b11bdfaf.pdf\n", "Converting reports to txt: 97%|███████████████████████████████████████████████████████████████████████████████████████ | 4945/5115 [29:49<00:14, 12.02it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/876cb2fe02017f3f.pdf\n", "Converting reports to txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5115/5115 [30:08<00:00, 2.83it/s]\n", "Converting targets to txt: 2%|█▉ | 109/5038 [00:28<23:55, 3.43it/s]Error when converting pdf->txt: poppler error creating document\n", "Converting targets to txt: 2%|██ | 117/5038 [00:29<20:18, 4.04it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/bfc86f14e2484335.pdf\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: bfc86f14e2484335 failed to convert security target pdf->txt\n", "Converting targets to txt: 10%|█████████▍ | 522/5038 [02:45<19:55, 3.78it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/4c9468f20fdb04f7.pdf\n", "Converting targets to txt: 11%|█████████▌ | 529/5038 [02:49<23:50, 3.15it/s]Error during OCR of cc_09_10_2022/certs/targets/pdf/4c9468f20fdb04f7.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 4c9468f20fdb04f7 failed to convert security target pdf->txt\n", "Converting targets to txt: 
21%|███████████████████ | 1064/5038 [05:14<11:32, 5.74it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf\n", "Converting targets to txt: 21%|███████████████████▏ | 1072/5038 [05:15<11:35, 5.70it/s]Error during OCR of cc_09_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: a2b962c7f1d2bc56 failed to convert security target pdf->txt\n", "Converting targets to txt: 28%|█████████████████████████▎ | 1415/5038 [06:46<13:42, 4.41it/s]Error when converting pdf->txt: poppler error creating document\n", "Converting targets to txt: 28%|█████████████████████████▍ | 1422/5038 [06:49<17:28, 3.45it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/422dc5758723c7d1.pdf\n", "Converting targets to txt: 28%|█████████████████████████▌ | 1429/5038 [06:50<15:03, 4.00it/s]Error during OCR of cc_09_10_2022/certs/targets/pdf/422dc5758723c7d1.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 422dc5758723c7d1 failed to convert security target pdf->txt\n", "Converting targets to txt: 31%|████████████████████████████▏ | 1576/5038 [07:30<17:18, 3.33it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/063d3805a3b12930.pdf\n", "Converting targets to txt: 33%|█████████████████████████████▋ | 1660/5038 [07:48<10:10, 5.53it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/2f4a41e8a5dfac6a.pdf\n", "Converting targets to txt: 33%|█████████████████████████████▉ | 1678/5038 [07:52<10:57, 5.11it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/504258938266b87f.pdf\n", "Converting targets to txt: 35%|███████████████████████████████▉ | 1785/5038 [08:17<12:26, 4.36it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/de85ff4c31069f12.pdf\n", "Converting targets to txt: 37%|█████████████████████████████████▏ | 1855/5038 [08:33<10:24, 
5.10it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/d3568613c552f9e8.pdf\n", "Converting targets to txt: 37%|█████████████████████████████████▏ | 1858/5038 [08:38<22:57, 2.31it/s]Error during OCR of cc_09_10_2022/certs/targets/pdf/d3568613c552f9e8.pdf, using garbage: pdftoppm failed: 99\n", "Converting targets to txt: 40%|███████████████████████████████████▉ | 2015/5038 [09:13<13:29, 3.73it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/b4bc79d8a558c7cf.pdf\n", "Converting targets to txt: 40%|████████████████████████████████████ | 2020/5038 [09:14<13:55, 3.61it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/09b43147f1307352.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/c16a92b40550193d.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/ce987167d42db722.pdf\n", "Converting targets to txt: 40%|███████████████████████████████████▌ | 2034/5038 [10:29<2:33:49, 3.07s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/2ff761edd4ed9b72.pdf\n", "Converting targets to txt: 41%|███████████████████████████████████▉ | 2054/5038 [12:42<3:10:14, 3.83s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/9e4d3347efd95ec9.pdf\n", "Converting targets to txt: 43%|███████████████████████████████████████ | 2184/5038 [13:32<15:16, 3.11it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/ad9ea5a4acf9ac53.pdf\n", "Converting targets to txt: 44%|██████████████████████████████████████▎ | 2193/5038 [15:35<6:11:17, 7.83s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/28922ef97b2dfaa0.pdf\n", "Converting targets to txt: 44%|██████████████████████████████████████▎ | 2195/5038 [15:36<5:00:31, 6.34s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/5c715225c805354f.pdf\n", "Converting targets to txt: 
68%|█████████████████████████████████████████████████████████████▌ | 3446/5038 [25:27<06:21, 4.18it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/731102b849e8d104.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/863d91c6a2b8e77a.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/f65cab49ea5422dd.pdf\n", "Converting targets to txt: 77%|████████████████████████████████████████████████████████████████████▉ | 3861/5038 [27:30<04:53, 4.01it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/bc31fd0273ab72f8.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/18e5179ba124af1d.pdf\n", "Converting targets to txt: 79%|██████████████████████████████████████████████████████████████████████▉ | 3974/5038 [28:05<05:39, 3.13it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/5dfeb0219325989b.pdf\n", "Converting targets to txt: 79%|███████████████████████████████████████████████████████████████████████ | 3980/5038 [28:08<06:39, 2.65it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/22617e916bad6f53.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/6344672e3178f63e.pdf\n", "Converting targets to txt: 80%|███████████████████████████████████████████████████████████████████████▊ | 4017/5038 [36:55<48:39, 2.86s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/483dc36fcf11b257.pdf\n", "Converting targets to txt: 80%|██████████████████████████████████████████████████████████████████████▎ | 4027/5038 [38:47<2:20:12, 8.32s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/5885f28f042038c1.pdf\n", "Converting targets to txt: 80%|████████████████████████████████████████████████████████████████████████▍ | 4052/5038 [38:58<19:47, 1.20s/it]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/7b08ad74a882337c.pdf\n", "Converting 
targets to txt: 87%|██████████████████████████████████████████████████████████████████████████████▎ | 4386/5038 [42:02<02:25, 4.47it/s]Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 7147c2f70d983d57 failed to convert security target pdf->txt\n", "Converting targets to txt: 90%|████████████████████████████████████████████████████████████████████████████████▉ | 4529/5038 [42:55<02:40, 3.17it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/931e1b2a68a1bcd2.pdf\n", "Converting targets to txt: 92%|██████████████████████████████████████████████████████████████████████████████████▌ | 4622/5038 [43:16<01:33, 4.44it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/16229c97ce083e8c.pdf\n", "Converting targets to txt: 99%|████████████████████████████████████████████████████████████████████████████████████████▉ | 4979/5038 [44:47<00:13, 4.31it/s]Error when converting pdf->txt: poppler error creating page\n", "Converting targets to txt: 99%|█████████████████████████████████████████████████████████████████████████████████████████ | 4984/5038 [44:49<00:13, 3.94it/s]Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/a552244ec3ca8f60.pdf\n", "Converting targets to txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5038/5038 [46:48<00:00, 1.79it/s]\n", "Converting reports to txt: 0%| | 0/8 [00:00txt: poppler error creating document\n", "Error when converting pdf->txt: poppler error creating document\n", "Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/02958808b57334f0.pdf\n", "Error when converting pdf->txt: poppler error creating 
document\n", "Error when converting pdf->txt: poppler error creating document\n", "Error when converting pdf->txt: poppler error creating document\n", "Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/7147c2f70d983d57.pdf\n", "Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/98668cafb8dda26f.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/4de4a9f436958574.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/422dc5758723c7d1.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/2d05e976e1adf696.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/02958808b57334f0.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 02958808b57334f0 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 7147c2f70d983d57 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/98668cafb8dda26f.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 98668cafb8dda26f failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: bfc86f14e2484335 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/422dc5758723c7d1.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 422dc5758723c7d1 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 
f9c8da9deff77ab5 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/2d05e976e1adf696.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 2d05e976e1adf696 failed to convert report pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/reports/pdf/4de4a9f436958574.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 4de4a9f436958574 failed to convert report pdf->txt\n", "Converting reports to txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00, 7.98it/s]\n", "Converting targets to txt: 0%| | 0/5 [00:00txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error when converting pdf->txt: poppler error creating document\n", "Error when converting pdf->txt: poppler error creating document\n", "Error when converting pdf->txt: poppler error creating document\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1\n", "Error when converting pdf->txt: poppler error creating document\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/422dc5758723c7d1.pdf\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/bfc86f14e2484335.pdf\n", "Cert dgst: 7147c2f70d983d57 failed to convert security target pdf->txt\n", "Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/4c9468f20fdb04f7.pdf\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: a2b962c7f1d2bc56 failed to convert security target pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: bfc86f14e2484335 
failed to convert security target pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/422dc5758723c7d1.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 422dc5758723c7d1 failed to convert security target pdf->txt\n", "Error during OCR of cc_09_10_2022/certs/targets/pdf/4c9468f20fdb04f7.pdf, using garbage: pdftoppm failed: 1\n", "Cert dgst: 4c9468f20fdb04f7 failed to convert security target pdf->txt\n", "Converting targets to txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00, 4.99it/s]\n" ] } ], "source": [ "# Convert all\n", "dset.convert_all_pdfs()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" }, "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Extracting report metadata: 0%| | 0/5107 [00:00txt: poppler error creating document\n", "WARNING:sec_certs.utils.pdf:Detected garbage during conversion of cc_09_10_2022/certs/maintenances/reports/pdf/cert_8ba5d4d02ea73d3d_update_d879046e9be61396.pdf\n", "Converting reports to txt: 95%|███████████████████████████████████████████████████████████████████████████████████████▍ | 436/459 [00:10<00:00, 41.44it/s]ERROR:sec_certs.utils.pdf:Error during OCR of cc_09_10_2022/certs/maintenances/reports/pdf/cert_8ba5d4d02ea73d3d_update_d879046e9be61396.pdf, using garbage: pdftoppm failed: 1\n", "ERROR:sec_certs.sample.certificate:Cert dgst: cert_8ba5d4d02ea73d3d_update_d879046e9be61396 failed to convert report pdf->txt\n", "Converting reports to txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 459/459 [00:22<00:00, 20.30it/s]\n", "Converting targets to txt: 36%|█████████████████████████████████▌ | 167/458 [00:45<00:53, 5.49it/s]WARNING:sec_certs.utils.pdf:Detected garbage during conversion of cc_09_10_2022/certs/maintenances/targets/pdf/cert_09b43147f1307352_update_a1ff8f3d472941f7.pdf\n", "Converting 
targets to txt: 75%|█████████████████████████████████████████████████████████████████████ | 344/458 [01:27<00:27, 4.19it/s]WARNING:sec_certs.utils.pdf:Detected garbage during conversion of cc_09_10_2022/certs/maintenances/targets/pdf/cert_bd5c7bb29151de01_update_070f8e523f3deba1.pdf\n", "WARNING:sec_certs.utils.pdf:Detected garbage during conversion of cc_09_10_2022/certs/maintenances/targets/pdf/cert_ef9565620c399b7e_update_b169fd5fee2eb72b.pdf\n", "Converting targets to txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 458/458 [05:10<00:00, 1.47it/s]\n", "Converting reports to txt: 0%| | 0/1 [00:00txt: poppler error creating document\n", "WARNING:sec_certs.utils.pdf:Detected garbage during conversion of cc_09_10_2022/certs/maintenances/reports/pdf/cert_8ba5d4d02ea73d3d_update_d879046e9be61396.pdf\n", "ERROR:sec_certs.utils.pdf:Error during OCR of cc_09_10_2022/certs/maintenances/reports/pdf/cert_8ba5d4d02ea73d3d_update_d879046e9be61396.pdf, using garbage: pdftoppm failed: 1\n", "ERROR:sec_certs.sample.certificate:Cert dgst: cert_8ba5d4d02ea73d3d_update_d879046e9be61396 failed to convert report pdf->txt\n", "Converting reports to txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00, 1.00s/it]\n", "Extracting report metadata: 81%|█████████████████████████████████████████████████████████████████████████ | 372/458 [00:01<00:00, 369.89it/s]ERROR:sec_certs.utils.pdf:Failed to read metadata of maintenances/reports/pdf/cert_d455e5408b744b44_update_16f77dbd34439401.pdf, error: PDF metadata unavailable\n", "Extracting report metadata: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:02<00:00, 227.06it/s]\n", "Extracting target metadata: 63%|█████████████████████████████████████████████████████████▍ | 289/458 [00:09<00:05, 28.98it/s]ERROR:sec_certs.utils.pdf:Failed to read metadata of 
maintenances/targets/pdf/cert_bbc41d7d09e40c0c_update_784535cca61b58a7.pdf, error: too many values to unpack (expected 2)\n", "Extracting target metadata: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:12<00:00, 37.95it/s]\n", "Extracting report frontpages: 100%|████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:01<00:00, 455.49it/s]\n", "Extracting target frontpages: 100%|████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:02<00:00, 227.51it/s]\n", "Extracting report keywords: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 458/458 [00:15<00:00, 30.39it/s]\n", "Extracting target keywords: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 458/458 [03:48<00:00, 2.01it/s]\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# And do maintenance updates as well\n", "dset.process_maintenance_updates()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Finally, dump it all\n", "dset.to_json()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.7" } }, "nbformat": 4, "nbformat_minor": 1 }