import argparse
import html
import sqlite3
import struct
import sys
import urllib.parse
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path

import nltk
import sqlite_vec
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from sec_certs.dataset import CCDataset
from sec_certs.sample import CCCertificate

# initialize NLP resources - one-time downloads (no-ops on later runs)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

TOKEN_LIMIT = 512
OVERLAP = 128
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
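
# Chunks advance by TOKEN_LIMIT - OVERLAP = 384 tokens, so consecutive
# chunks share 128 tokens of context: a 1000-token document yields
# chunks starting at tokens 0, 384 and 768.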


def serialize_f32(vector: list[float]) -> bytes:
    """Serialize a list of floats to raw bytes for sqlite-vec."""
    return struct.pack(f"{len(vector)}f", *vector)
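
# struct packs each float as 4 bytes in native byte order, so a
# 384-dimensional embedding becomes a 1536-byte BLOB:
#   len(serialize_f32([0.0] * 384)) == 1536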


class CertProcessor:
    def __init__(self, model=None, fresh_db=False):
        # reuse a shared model when one is passed in; loading it is expensive
        self.model = model if model is not None else SentenceTransformer(MODEL_NAME)
        self.db = self.init_db(fresh_db)
        self.cursor = self.db.cursor()

    def init_db(self, fresh_db) -> sqlite3.Connection:
        """Initialize the database with the sqlite-vec extension loaded."""
        conn = sqlite3.connect(DB_PATH)  # DB_PATH is set in __main__
        conn.enable_load_extension(True)
        sqlite_vec.load(conn)
        conn.enable_load_extension(False)
        if fresh_db:
            conn.execute("DROP TABLE IF EXISTS cert_chunks;")
            conn.execute("DROP TABLE IF EXISTS metadata;")
            conn.execute("DROP TABLE IF EXISTS chunk_texts;")
        # the embedding dimension must match the model output
        # (all-MiniLM-L6-v2 produces 384-dimensional vectors)
        conn.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS cert_chunks USING vec0(
                embedding float[384],
                dgst TEXT,
                chunk_index INTEGER,
                source_type TEXT
            )
        """)
        conn.execute("""
            CREATE TABLE IF NOT EXISTS metadata (
                dgst TEXT PRIMARY KEY,
                name TEXT,
                category TEXT,
                manufacturer TEXT,
                security_level TEXT,
                valid_from TEXT,
                valid_to TEXT,
                report_link TEXT,
                st_link TEXT,
                cert_link TEXT,
                manufacturer_web TEXT
            )
        """)
        conn.execute("""
            CREATE TABLE IF NOT EXISTS chunk_texts (
                dgst TEXT,
                source_type TEXT,
                chunk_index INTEGER,
                text TEXT,
                PRIMARY KEY (dgst, source_type, chunk_index)
            )
        """)
        # create indices for chunk-text lookups
        conn.execute("CREATE INDEX IF NOT EXISTS idx_chunk_texts_dgst ON chunk_texts(dgst);")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_chunk_texts_source ON chunk_texts(source_type);")
        return conn

    def store_metadata(self, meta: dict) -> None:
        """Store certificate metadata in the metadata table."""
        # validity dates can be missing, so guard before calling isoformat()
        valid_from = meta.get("not_valid_before")
        valid_to = meta.get("not_valid_after")
        params = {
            "dgst": meta["dgst"],
            "name": meta.get("name", "Unknown"),
            "category": meta.get("category", "unknown"),
            "manufacturer": meta.get("manufacturer", "Unknown"),
            "security_level": ",".join(meta.get("security_level", [])),
            "valid_from": valid_from.isoformat() if valid_from else "",
            "valid_to": valid_to.isoformat() if valid_to else "",
            "report_link": meta.get("report_link", ""),
            "st_link": meta.get("st_link", ""),
            "cert_link": meta.get("cert_link", ""),
            "manufacturer_web": meta.get("manufacturer_web", ""),
        }
        self.cursor.execute(
            """
            INSERT INTO metadata
            (dgst, name, category, manufacturer, security_level,
             valid_from, valid_to, report_link, st_link, cert_link, manufacturer_web)
            VALUES (:dgst, :name, :category, :manufacturer, :security_level,
                    :valid_from, :valid_to, :report_link, :st_link, :cert_link, :manufacturer_web)
            """,
            params,
        )

    def chunk_text(self, text: str) -> list[str]:
        """Split text into overlapping chunks of TOKEN_LIMIT tokens."""
        words = word_tokenize(text)
        return [" ".join(words[i : i + TOKEN_LIMIT]) for i in range(0, len(words), TOKEN_LIMIT - OVERLAP)]

    def process_cert(self, cert: CCCertificate) -> None:
        """Process a single certificate with all of its documents"""
        meta = cert.to_dict()
        file_paths = {
            "report": cert.state.report._txt_path,
            "targets": cert.state.st._txt_path,
            "cert": cert.state.cert._txt_path,
        }
        for doc_type, path in file_paths.items():
            if path is None or not path.exists():
                continue
            with path.open(encoding="utf-8") as f:
                text = f.read()
            chunks = self.chunk_text(text)
            self.store_chunks(chunks, meta, doc_type)

    def store_chunks(self, chunks: list[str], meta: dict, doc_type: str) -> None:
        """Store chunks with metadata in database"""
        for idx, chunk in enumerate(tqdm(chunks, desc=f"Chunking {doc_type}", leave=False)):
            # generate embedding
            embedding = self.model.encode(chunk).flatten().tolist()
            # insert into cert_chunks table
            self.cursor.execute(
                """
                INSERT INTO cert_chunks
                (embedding, dgst, chunk_index, source_type)
                VALUES (?, ?, ?, ?)
                """,
                [serialize_f32(embedding), meta["dgst"], idx, doc_type],
            )
            self.cursor.execute(
                """
                INSERT INTO chunk_texts
                (dgst, source_type, chunk_index, text)
                VALUES (?, ?, ?, ?)
                """,
                [meta["dgst"], doc_type, idx, chunk],
            )
        # store metadata if not already present
        self.cursor.execute("SELECT dgst FROM metadata WHERE dgst = ?", [meta["dgst"]])
        if not self.cursor.fetchone():
            self.store_metadata(meta)
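
    # Performance note (untested here): SentenceTransformer.encode also
    # accepts a list of strings and batches internally, so encoding all
    # chunks of a document in one call would likely be noticeably faster
    # than this per-chunk loop.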

    def process_all(self, dataset) -> None:
        """Process the entire dataset"""
        for cert in tqdm(dataset, desc="Processing certificates", unit="cert"):
            self.process_cert(cert)
        self.db.commit()

    def close(self) -> None:
        """Close the database connection"""
        self.cursor.close()
        self.db.close()


def query_similar_chunks(
    query_text: str, processor: CertProcessor, k: int = 5
) -> list[tuple[str, str, str, float, int, str]]:
    """
    Query the database for similar chunks and include metadata.

    Args:
        query_text: The free-text query to embed and search for.
        processor: An instance of CertProcessor.
        k: The number of results to return.

    Returns:
        A list of tuples (dgst, name, source_type, weighted_score, best_chunk_index, text).
    """
    query_embedding = processor.model.encode(query_text).flatten().tolist()
    serialized_query = serialize_f32(query_embedding)
    # fetch the 2000 nearest chunks, then aggregate per document and
    # source: the score blends the best match with the average match so
    # that documents with many relevant chunks outrank a single lucky hit
    query = """
        SELECT
            m.dgst,
            m.name,
            c.source_type,
            0.7 * MIN(c.distance) + 0.3 * AVG(c.distance) AS weighted_score,
            first_value(c.chunk_index) OVER (
                PARTITION BY m.dgst, c.source_type
                ORDER BY c.distance ASC
            ) AS best_chunk_index
        FROM cert_chunks c
        JOIN metadata m ON c.dgst = m.dgst
        WHERE c.embedding MATCH ? AND k = 2000
        GROUP BY m.dgst, c.source_type
        ORDER BY weighted_score ASC
        LIMIT ?
    """
    doc_results = processor.db.execute(query, [serialized_query, k]).fetchall()
    # fetch the text for each best chunk
    results_with_text = []
    for dgst, name, source_type, weighted_score, best_chunk_idx in doc_results:
        text_query = """
            SELECT text
            FROM chunk_texts
            WHERE dgst = ? AND source_type = ? AND chunk_index = ?
        """
        text_result = processor.db.execute(text_query, [dgst, source_type, best_chunk_idx]).fetchone()
        chunk_text = text_result[0] if text_result else "Text not found"
        results_with_text.append((dgst, name, source_type, weighted_score, best_chunk_idx, chunk_text))
    return results_with_text
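
# A rough usage sketch (names as defined above):
#   hits = query_similar_chunks("smartcard side-channel resistance", processor)
#   for dgst, name, src, score, idx, text in hits:
#       print(f"{score:.4f}  {name} ({src})")
# Lower scores are better: sqlite-vec MATCH results expose a distance
# column, so the weighted score is itself a distance.
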
source_map = {"targets": "Security target", "report": "Certification report", "cert": "Certificate"}


class Server(BaseHTTPRequestHandler):
    def do_GET(self) -> None:
        self.send_response(200)
        self.send_header("Content-type", "text/html; charset=utf-8")
        self.end_headers()
        # parse query string
        query = ""
        results = []
        if "?" in self.path:
            query = urllib.parse.parse_qs(self.path.split("?")[1]).get("q", [""])[0]
        if query:
            results = query_similar_chunks(query, processor, k=5)
        # build response; escape user input and chunk text before
        # interpolating them into the page
        page = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="UTF-8">
            <title>Cert Search</title>
            <style>
                body {{ font-family: sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; }}
                .result {{ border: 1px solid #ddd; margin: 15px 0; padding: 15px; border-radius: 5px; }}
            </style>
        </head>
        <body>
            <h1>Certificate Vector Search</h1>
            <form method="get">
                <input type="text" name="q" placeholder="Search query..." value="{html.escape(query)}">
                <input type="submit" value="Search">
            </form>
        """
        if results:
            page += "<h2>Results</h2>"
            for dgst, name, source_type, score, chunk_idx, text in results:
                page += f"""
                <div class="result">
                    <h3>{html.escape(name)}</h3>
                    <p>Source: {source_map.get(source_type, "Unknown")} | Score: {score:.4f}</p>
                    <p>{html.escape(text[:500])}...</p>
                </div>
                """
        page += "</body></html>"
        self.wfile.write(page.encode())
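
# Quick manual check from a shell (assuming the default port):
#   curl "http://localhost:8000/?q=elliptic+curve"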


def check_dataset_path(path_dset: Path) -> bool:
    """Verify that the dataset path exists and contains dataset.json."""
    if not path_dset.exists():
        print(f"Error: Dataset path '{path_dset}' does not exist")
        sys.exit(1)
    if not (path_dset / "dataset.json").exists():
        print(f"Error: dataset.json not found in '{path_dset}'")
        sys.exit(1)
    return True


def check_certificate_count(dset: CCDataset) -> int:
    """Check and report the number of certificates in the dataset."""
    cert_count = len(list(dset.certs))
    print(f"Dataset contains {cert_count} certificates")
    if cert_count == 0:
        print("Error: Dataset is empty")
        sys.exit(1)
    return cert_count


def check_txt_files(path_dset: Path) -> int:
    """Verify that text files exist in the expected directories."""
    txt_paths = [
        path_dset / "certs/certificates/txt",
        path_dset / "certs/reports/txt",
        path_dset / "certs/targets/txt",
    ]
    txt_counts = []
    for path in txt_paths:
        if path.exists():
            count = len(list(path.glob("*.txt")))
            txt_counts.append(count)
            print(f"Found {count} .txt files in {path}")
        else:
            print(f"Warning: Path {path} does not exist")
            txt_counts.append(0)
    total = sum(txt_counts)
    if total == 0:
        print("Error: No .txt files found in any of the expected directories")
        sys.exit(1)
    return total


def manage_database(db_path: Path, model: SentenceTransformer, force_rebuild: bool, dset: CCDataset) -> bool:
    """Create or rebuild the database if needed."""
    if force_rebuild or not db_path.exists():
        print(f"{'Rebuilding' if force_rebuild else 'Creating'} database at {db_path}")
        if db_path.exists() and force_rebuild:
            db_path.unlink()
        processor = CertProcessor(model, fresh_db=True)
        processor.process_all(dset)
        processor.close()
        print("Database initialization complete.")
        return True
    return False
# pip install sec_certs sentence-transformers nltk sqlite-vec
# Usage example: python vector_search.py --data-path ../../cc_data_gemini --db-path vector_db.sqlite --force-rebuild --port 8080
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Certificate Vector Search")
    parser.add_argument("--data-path", type=str, default="../../cc_data_gemini", help="Path to the dataset directory")
    parser.add_argument("--db-path", type=str, default="vector_db.sqlite", help="Path to the SQLite database file")
    parser.add_argument("--force-rebuild", action="store_true", help="Force database rebuild even if it exists")
    parser.add_argument("--port", type=int, default=8000, help="Port to run the server on")
    args = parser.parse_args()
    path_dset = Path(args.data_path)
    DB_PATH = Path(args.db_path)
    check_dataset_path(path_dset)
    dset = CCDataset.from_json(path_dset / "dataset.json")
    cert_count = check_certificate_count(dset)
    txt_count = check_txt_files(path_dset)
    model = SentenceTransformer(MODEL_NAME)
    db_rebuilt = manage_database(DB_PATH, model, args.force_rebuild, dset)
    processor = CertProcessor(model, fresh_db=False)
    # honor the --port flag instead of a hard-coded port
    server = HTTPServer(("localhost", args.port), Server)
    print(f"Server started at http://localhost:{args.port}")
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        server.server_close()
        processor.close()
        print("Server stopped.")