aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/sec_certs/sample/document_state.py
blob: ef598c5e330ac8d4c27f35742d38180a7ad5e67b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

from sec_certs.serialization.json import ComplexSerializableType


@dataclass
class DocumentState(ComplexSerializableType):
    """
    Processing state of a single document.

    Holds one success flag per processing stage (download, conversion, extraction),
    the `pdf_hash` and `txt_hash` values, and the paths of the pdf and txt files.
    Only the flags and the hashes are listed in `serialized_attributes`; the two
    path fields are not.
    """

    # Per-stage outcome flags; every one of them defaults to False.
    download_ok: bool = False  # Download finished OK
    convert_garbage: bool = False  # The initial conversion produced garbage
    convert_ok: bool = False  # Conversion as a whole went OK (pdftotext or OCR)
    extract_ok: bool = False  # Extraction finished OK

    pdf_hash: str | None = None
    txt_hash: str | None = None

    # Backing storage for the `pdf_path` / `txt_path` properties below.
    _pdf_path: Path | None = None
    _txt_path: Path | None = None

    def is_ok_to_download(self, fresh: bool = True) -> bool:
        """
        Tell whether the document may be downloaded.

        :param fresh: if true, the answer is always True; otherwise the download is
                      allowed only when `download_ok` is not set yet.
        :return: True if the download step may run.
        """
        if fresh:
            return True
        return not self.download_ok

    def is_ok_to_convert(self, fresh: bool = True) -> bool:
        """
        Tell whether the document may be converted.

        :param fresh: if true, only a successful download is required; otherwise
                      `convert_ok` must additionally be unset.
        :return: True if the conversion step may run.
        """
        if fresh:
            return self.download_ok
        return self.download_ok and not self.convert_ok

    def is_ok_to_analyze(self, fresh: bool = True) -> bool:
        """
        Tell whether the document may be analyzed.

        :param fresh: if true, only successful download and conversion are required;
                      otherwise `extract_ok` must additionally be unset.
        :return: True if the analysis step may run.
        """
        prerequisites_met = self.download_ok and self.convert_ok
        if fresh:
            return prerequisites_met
        return prerequisites_met and not self.extract_ok

    @property
    def pdf_path(self) -> Path:
        """
        Path of the pdf file.

        :raises ValueError: if no pdf path is stored on this object.
        """
        stored = self._pdf_path
        if stored:
            return stored
        raise ValueError(f"pdf_path not set on {type(self)}")

    @pdf_path.setter
    def pdf_path(self, pth: str | Path | None) -> None:
        # Any falsy value (None, empty string) clears the stored path.
        if pth:
            self._pdf_path = Path(pth)
        else:
            self._pdf_path = None

    @property
    def txt_path(self) -> Path:
        """
        Path of the txt file.

        :raises ValueError: if no txt path is stored on this object.
        """
        stored = self._txt_path
        if stored:
            return stored
        raise ValueError(f"txt_path not set on {type(self)}")

    @txt_path.setter
    def txt_path(self, pth: str | Path | None) -> None:
        # Any falsy value (None, empty string) clears the stored path.
        if pth:
            self._txt_path = Path(pth)
        else:
            self._txt_path = None

    @property
    def serialized_attributes(self) -> list[str]:
        """
        Names of the attributes to serialize: the stage flags followed by the hashes.

        The path fields are intentionally absent from the list. A new list is built
        on every access.
        """
        stage_flags = ["download_ok", "convert_garbage", "convert_ok", "extract_ok"]
        hashes = ["pdf_hash", "txt_hash"]
        return [*stage_flags, *hashes]