| author | J08nY | 2024-01-29 14:27:10 +0100 |
|---|---|---|
| committer | J08nY | 2024-01-29 14:27:10 +0100 |
| commit | 9333147035a317a2a8b6529612435bee7e3a66ea (patch) | |
| tree | 768483d5f3672eb07dc707d8734fc2dfb7b07c1c | |
| parent | 3720d55c229d62b848f90d8fcd6db5c3b937e6d1 (diff) | |
| download | pyecsca-9333147035a317a2a8b6529612435bee7e3a66ea.tar.gz pyecsca-9333147035a317a2a8b6529612435bee7e3a66ea.tar.zst pyecsca-9333147035a317a2a8b6529612435bee7e3a66ea.zip | |
Add dmap deduplication.
| -rw-r--r-- | pyecsca/sca/re/tree.py | 21 |
| -rw-r--r-- | test/sca/test_tree.py | 33 |
2 files changed, 38 insertions, 16 deletions
diff --git a/pyecsca/sca/re/tree.py b/pyecsca/sca/re/tree.py
index 6aae1ea..2955631 100644
--- a/pyecsca/sca/re/tree.py
+++ b/pyecsca/sca/re/tree.py
@@ -152,10 +152,17 @@ class Map:
 
     def deduplicate(self):
         """Deduplicate the configs of this distinguishing map based on the rows."""
-        for row, data in self.mapping.groupby(
-            self.mapping.columns.tolist(), as_index=False
-        ):
-            pass
+        indices = []
+
+        def agg(thing):
+            indices.append(thing.index)
+            return thing.iloc[0]
+
+        self.mapping = self.mapping.groupby(self.mapping.columns.tolist(), as_index=False, dropna=False).agg(agg)
+        new_cfg_map = self.cfg_map.copy()
+        for i, index in enumerate(indices):
+            new_cfg_map.loc[self.cfg_map["vals"].isin(index), "vals"] = i
+        self.cfg_map = new_cfg_map
 
     def merge(self, other: "Map"):
         """Merge in another distinguishing map operating on different configs."""
@@ -167,7 +174,9 @@ class Map:
         last = max(self.cfg_map["vals"])
         # Offset the other cfg_map and mapping index by last + 1
         other_cfg_map = other.cfg_map + (last + 1)
-        other_mapping = other.mapping[reordering].set_index(other.mapping.index + (last + 1))
+        other_mapping = other.mapping[reordering].set_index(
+            other.mapping.index + (last + 1)
+        )
         # Now concat the cfg_map and mapping
         self.cfg_map = pd.concat([self.cfg_map, other_cfg_map], copy=False)
         self.mapping = pd.concat([self.mapping, other_mapping], copy=False)
@@ -350,7 +359,7 @@ def _build_tree(
         )
         log(pad + f"Split {len(group_cfgs)} via dmap {best_i}.")
         # And build the tree recursively
-        child = _build_tree(group_cfgs, maps, response=output, depth=depth+1)
+        child = _build_tree(group_cfgs, maps, response=output, depth=depth + 1)
         child.parent = result
     return result
 
diff --git a/test/sca/test_tree.py b/test/sca/test_tree.py
index ce16f97..89a9f61 100644
--- a/test/sca/test_tree.py
+++ b/test/sca/test_tree.py
@@ -1,5 +1,6 @@
 import random
 import time
+from copy import deepcopy
 
 from pyecsca.sca.re.tree import Tree, Map
 import pandas as pd
@@ -35,7 +36,19 @@ def test_map_merge():
     assert len(dmap1.cfg_map) == 4
     assert len(dmap1.codomain) == 2
     assert not dmap1["c", 3]
-    assert dmap1["a", 0]
+    assert dmap1["a", 1]
+
+
+def test_map_deduplicate():
+    cfgs = {"a", "b", "c", "d"}
+    binary_sets = {"a": {1, 2, 3}, "b": {2, 3, 4}, "c": {1, 2, 3}, "d": {4, 2}}
+    dmap = Map.from_sets(cfgs, binary_sets)
+    original = deepcopy(dmap)
+    dmap.deduplicate()
+    for cfg in cfgs:
+        for i in [1, 2, 3, 4]:
+            assert dmap[cfg, i] == original[cfg, i]
+    assert len(dmap.mapping) < len(original.mapping)
 
 
 def test_build_tree():
@@ -75,13 +88,13 @@ def test_expand_tree():
 def test_df():
     nrows = 12_000_000
     ncols = 5
-    index = list(range(nrows))
-    df = pd.DataFrame(
-        [random.choices((True, False), k=ncols) for _ in index], index=index
+    df = pd.DataFrame([random.choices((True, False), k=ncols) for _ in range(nrows)])
+    cfg_map = pd.DataFrame(
+        [(i,) for i in range(nrows)],
+        index=[str(i) for i in range(nrows)],
+        columns=["vals"],
     )
-    print(df.memory_usage().sum())
-    start = time.perf_counter()
-    for row, data in df.groupby(df.columns.tolist(), as_index=False):
-        pass
-    end = time.perf_counter()
-    print(end - start)
+    dmap = Map(df, cfg_map, list(range(ncols)), {True, False})
+    # start = time.perf_counter()
+    dmap.deduplicate()
+    # end = time.perf_counter()
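For context, the new `Map.deduplicate` collapses configurations whose rows in the distinguishing map are identical: one representative row per group is kept and `cfg_map["vals"]` is redirected to the surviving row, which is what `test_map_deduplicate` checks (lookups unchanged, `mapping` shrinks). Below is a minimal, self-contained sketch of that grouping-and-remapping idea using plain pandas. The toy `mapping`/`cfg_map` frames and the `remap` dictionary are illustrative assumptions, not the pyecsca data structures; the patch itself uses a side-effecting aggregation function, whereas this sketch takes the closely related `groupby(...).groups` route for readability.

```python
# Sketch only: assumed toy data, not the pyecsca Map class. Collapse identical
# rows of a distinguishing map and redirect config references to the survivors.
import pandas as pd

# Each row holds the outputs observed for one row label; configs ("a".."d")
# point at row labels through cfg_map["vals"].
mapping = pd.DataFrame(
    {"o1": [True, False, True, False], "o2": [1, 0, 1, 2]},
    index=[0, 1, 2, 3],
)
cfg_map = pd.DataFrame({"vals": [0, 1, 2, 3]}, index=["a", "b", "c", "d"])

# Group identical rows: `groups` maps each distinct row (as a tuple of its
# values) to the Index of original row labels that share it.
groups = mapping.groupby(mapping.columns.tolist(), sort=False, dropna=False).groups

representatives = []
remap = {}
for new_label, row_labels in enumerate(groups.values()):
    representatives.append(mapping.loc[row_labels[0]])  # keep one row per group
    for old_label in row_labels:
        remap[old_label] = new_label                     # redirect the others

mapping = pd.DataFrame(representatives).reset_index(drop=True)
cfg_map["vals"] = cfg_map["vals"].map(remap)

print(mapping)  # three distinct rows remain ("a" and "c" collapsed into one)
print(cfg_map)  # "a" and "c" now share row 0; "b" -> 1, "d" -> 2
```

The remapping step corresponds to the patch's `new_cfg_map.loc[self.cfg_map["vals"].isin(index), "vals"] = i` loop: every config that referenced a collapsed row ends up pointing at the group's surviving index, so lookups through the map give the same answers with fewer stored rows.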
