Add customizable splitting criterion in tree building.

author: J08nY 2025-03-19 13:56:43 +0100
committer: J08nY 2025-03-19 13:56:43 +0100
commit: 57434113ec4b0400a5ac7b949536e5edfebddc00 (patch)
tree: 095eeba1bb205e21ff558f2fa564ada5eb73d693
parent: dde027debb675fa20c72eb1d9d315bdf544f3e16 (diff)
download: pyecsca-57434113ec4b0400a5ac7b949536e5edfebddc00.tar.gz
pyecsca-57434113ec4b0400a5ac7b949536e5edfebddc00.tar.zst
pyecsca-57434113ec4b0400a5ac7b949536e5edfebddc00.zip
2 files changed, 80 insertions, 21 deletions
diff --git a/pyecsca/sca/re/tree.py b/pyecsca/sca/re/tree.py
index e2174a3..39a7ca7 100644
--- a/pyecsca/sca/re/tree.py
+++ b/pyecsca/sca/re/tree.py
@@ -355,6 +355,61 @@ class Node(NodeMixin):
 
 
 @public
+class SplitCriterion:
+    """
+    A splitting criterion to be used in tree building.
+    """
+
+    def __call__(self, split: pd.Series) -> float:
+        raise NotImplementedError
+
+    def is_better(self, score: float, current_best: float) -> bool:
+        raise NotImplementedError
+
+    def is_optimal(self, score: float, n_cfgs: int, n_codomain: int) -> bool:
+        raise NotImplementedError
+
+
+class Degree(SplitCriterion):
+    def __call__(self, split: pd.Series) -> float:
+        # We want to maximize the number of groups.
+        # The score is optimal if it is equal to min(n_cfgs, len(dmap.codomain)).
+        return len(split)
+
+    def is_better(self, score: float, current_best: float) -> bool:
+        return score > current_best
+
+    def is_optimal(self, score: float, n_cfgs: int, n_codomain: int) -> bool:
+        return score == min(n_cfgs, n_codomain)
+
+
+class SizeOfLargest(SplitCriterion):
+    def __call__(self, split: pd.Series) -> float:
+        # We want to minimize size of largest group.
+        # The score is optimal if it is equal to ceil(n_cfgs / len(dmap.codomain)).
+        return max(split)
+
+    def is_better(self, score: float, current_best: float) -> bool:
+        return score < current_best
+
+    def is_optimal(self, score: float, n_cfgs: int, n_codomain: int) -> bool:
+        return score == ceil(n_cfgs / n_codomain)
+
+
+class AverageSize(SplitCriterion):
+    def __call__(self, split: pd.Series) -> float:
+        # We want to minimize the average size of the groups.
+        # The score is optimal if it is equal to ceil(n_cfgs / len(dmap.codomain)).
+        return sum(split**2) / sum(split)
+
+    def is_better(self, score: float, current_best: float) -> bool:
+        return score < current_best
+
+    def is_optimal(self, score: float, n_cfgs: int, n_codomain: int) -> bool:
+        return score == ceil(n_cfgs / n_codomain)
+
+
+@public
 class Tree:
     """A distinguishing tree."""
 
@@ -493,27 +548,16 @@ class Tree:
         return tree
 
     @classmethod
-    def build(cls, cfgs: Set[Any], *maps: Map) -> "Tree":
+    def build(cls, cfgs: Set[Any], *maps: Map, split: str | SplitCriterion = "largest") -> "Tree":
         """
         Build a tree.
 
         :param cfgs: The set of configs to build the tree for.
         :param maps: The distinguishing maps to use.
+        :param split: The split criterion to use. Can be one of "degree", "largest", "average".
         :return: The tree.
         """
-        return cls(_build_tree(cfgs, dict(enumerate(maps))), *maps)
-
-
-def _degree(split: pd.Series) -> float:
-    return len(split)
-
-
-def _size_of_largest(split: pd.Series) -> float:
-    return max(split)
-
-
-def _average_size(split: pd.Series) -> float:
-    return sum(split**2) / sum(split)
+        return cls(_build_tree(cfgs, dict(enumerate(maps)), split=split), *maps)
 
 
 def _build_tree(
@@ -521,6 +565,7 @@ def _build_tree(
     maps: Mapping[int, Map],
     response: Optional[Any] = None,
     depth: int = 0,
+    split: str | SplitCriterion = "largest",
 ) -> Node:
     pad = " " * depth
     # If there is only one remaining cfg, we do not need to continue and just return (base case 1).
@@ -533,6 +578,19 @@ def _build_tree(
         log(pad + "Trivial.")
         return Node(cfgset, response=response)
 
+    # Choose the split criterion
+    criterion: SplitCriterion
+    if split == "degree":
+        criterion = Degree()
+    elif split == "largest":
+        criterion = SizeOfLargest()
+    elif split == "average":
+        criterion = AverageSize()
+    elif isinstance(split, SplitCriterion):
+        criterion = split
+    else:
+        raise ValueError(f"Unknown split criterion: {split}")
+
     # Go over the maps and figure out which one splits the best.
     best_i = None
     best_dmap = None
@@ -544,17 +602,16 @@ def _build_tree(
         # Note we should look at the restriction of the map to the current "cfgs" and split those
         restricted = dmap.mapping.loc[dmap.cfg_map.loc[list(cfgs), "vals"]]
         for j, column in restricted.items():
-            split = column.value_counts(dropna=False)
-            # TODO: Try the other scores.
-            score = _size_of_largest(split)
-            if best_score is None or score < best_score:
+            counts = column.value_counts(dropna=False)
+            score = criterion(counts)
+            if best_score is None or criterion.is_better(score, best_score):
                 best_i = i
                 best_column = j
                 best_dmap = dmap
                 best_score = score
                 best_restricted = restricted
             # Early abort if optimal score is hit.
-            if score == ceil(n_cfgs / len(dmap.codomain)):
+            if criterion.is_optimal(score, n_cfgs, len(dmap.codomain)):
                 break
     # We found nothing distinguishing the configs, so return them all (base case 2).
     if best_column is None or best_dmap is None:
diff --git a/test/sca/test_tree.py b/test/sca/test_tree.py
index 58537f5..a3afaf9 100644
--- a/test/sca/test_tree.py
+++ b/test/sca/test_tree.py
@@ -3,6 +3,7 @@ from collections import OrderedDict
 from copy import deepcopy
 
 import pandas as pd
+import pytest
 
 from pyecsca.sca.re.tree import Tree, Map
 
@@ -81,7 +82,8 @@ def test_map_with_callable(secp128r1):
     assert dmap[cfgs[0], 1]
 
 
-def test_build_tree():
+@pytest.mark.parametrize("split", ["degree", "largest", "average"])
+def test_build_tree(split):
     cfgs = ["a", "b", "c"]
     cfg_map = pd.DataFrame([0, 1, 2], index=cfgs, columns=["vals"])
     inputs1 = [1, 2, 3, 4]
@@ -93,7 +95,7 @@ def test_build_tree():
     codomain2 = {0, 1, 2, 3}
     mapping2 = pd.DataFrame([(1, 0, 0), (2, 0, 0), (3, 0, 0)])
     dmap2 = Map(mapping2, cfg_map, inputs2, codomain2)
-    tree = Tree.build(set(cfgs), dmap1, dmap2)
+    tree = Tree.build(set(cfgs), dmap1, dmap2, split=split)
     tree.render()
     tree.render_basic()
     tree.describe()
author	J08nY	2025-03-19 13:56:43 +0100
committer	J08nY	2025-03-19 13:56:43 +0100
commit	57434113ec4b0400a5ac7b949536e5edfebddc00 (patch)
tree	095eeba1bb205e21ff558f2fa564ada5eb73d693
parent	dde027debb675fa20c72eb1d9d315bdf544f3e16 (diff)
download	pyecsca-57434113ec4b0400a5ac7b949536e5edfebddc00.tar.gz pyecsca-57434113ec4b0400a5ac7b949536e5edfebddc00.tar.zst pyecsca-57434113ec4b0400a5ac7b949536e5edfebddc00.zip