src/sec_certs/utils/tables.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

from __future__ import annotations

import logging
import re
from pathlib import Path

from sec_certs.cert_rules import FIPS_LIST_OF_TABLES

logger = logging.getLogger(__name__)


def parse_list_of_tables(txt: str) -> set[int]:
    """
    Collects page numbers from the text of a list of tables. Only lines that mention
    a function or an algorithm and end with a number contribute to the result.
    """
    entry_pattern = re.compile(
        r"^.+?(?:[Ff]unction|[Aa]lgorithm|[Ss]ecurity [Ff]unctions?).+?(?P<page_num>\d+)$", re.MULTILINE
    )

    page_numbers: set[int] = set()
    # MULTILINE anchors ^ and $ to each line, so every list entry is evaluated on its own
    for entry in entry_pattern.finditer(txt):
        page_numbers.add(int(entry.group("page_num")))
    return page_numbers


def get_table_rich_page_numbers_from_footer(file_text: str) -> set[int]:
    """
    Parses page numbers of policy txt pages that may contain tables with algorithm data.

    Pages are counted with form feed characters: every line that contains a form feed
    advances the page counter by one. For each line starting with "Table " or "Exhibit",
    the current page and the page after it are collected, together with the page before it
    when the current page is greater than 2.

    :param file_text: text of the policy document
    :return: collected page numbers that do not exceed the final page counter minus one
    """
    current_page = 1
    pages: set[int] = set()

    for line in file_text.split("\n"):
        if "\f" in line:
            current_page += 1
        if line.startswith(("Table ", "Exhibit")):
            # A table may spill over to the neighbouring pages, so they are collected as well
            pages.update((current_page, current_page + 1))
            if current_page > 2:
                pages.add(current_page - 1)

    # Presumably the text ends with a form feed, so the counter overshoots the real page
    # count by one -- TODO confirm against the txt conversion output.
    last_valid_page = current_page - 1

    # Drop every page above the threshold. Returning after the first hit would leave another
    # out-of-range page in the result whenever both `current_page` and `current_page + 1`
    # were collected.
    return {page for page in pages if page <= last_valid_page}


def find_pages_with_tables(txt_filepath: Path) -> set[int]:
    """
    Returns page numbers of those pages of the given txt file that may contain tables.
    """
    content = txt_filepath.read_text(encoding="utf-8")

    # The list of tables is the preferred source of page numbers. Without it, fall back to
    # scanning the text for table captions and counting form feeds.
    toc_match = FIPS_LIST_OF_TABLES.search(content)
    if not toc_match:
        page_numbers = get_table_rich_page_numbers_from_footer(content)
    else:
        page_numbers = parse_list_of_tables(toc_match.group())

    return page_numbers or set()


def get_algs_from_table(dataframe_text: str) -> set[str]:
    """
    Finds algorithm certificate references in the text of a table. Each result is the whole
    matched substring, i.e. including an optional "#" or "Cert" prefix, not only the id.
    """
    cert_pattern = re.compile(r"(?:#?\s?|(?:Cert)\.?[^. ]*?\s?)(?:[CcAa]\s)?(?P<id>[CcAa]? ?\d+)")

    found: set[str] = set()
    for hit in cert_pattern.finditer(dataframe_text):
        found.add(hit.group())
    return found