aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--epare/distinguish.ipynb96
1 files changed, 93 insertions, 3 deletions
diff --git a/epare/distinguish.ipynb b/epare/distinguish.ipynb
index 875b9ed..02ee444 100644
--- a/epare/distinguish.ipynb
+++ b/epare/distinguish.ipynb
@@ -609,12 +609,102 @@
]
},
{
+ "cell_type": "markdown",
+ "id": "f16a5868-e92c-4b84-9f19-664627d9848a",
+ "metadata": {},
+ "source": [
+ "## Simulate distinguishing using a Bayes classifier"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed81e076-9ccb-445d-ada9-384b73efb2c5",
+ "metadata": {},
+ "source": [
+ "### Feature selection using trees\n",
+ "\n",
+ "We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building chooses as the features. However, we can also use more conventional feature selection approaches."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f24b323-3604-4e34-a880-9dfd611fb245",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "good_inputs = Counter()\n",
+ "# Walk the whole decision tree; every internal node contributes the input it splits on.\n",
+ "for node in PreOrderIter(tree.root):\n",
+ "    if node.is_leaf:\n",
+ "        continue\n",
+ "    good_inputs[node.dmap_input] += 1\n",
+ "for good in sorted(good_inputs):\n",
+ "    print(good)\n",
+ "    print(bin(good))\n",
+ "    print(f\"used {good_inputs[good]} times\")\n",
+ "    print(f\"nbits {good.bit_length()}\")\n",
+ "    for div_name, div_group in divisor_map.items():\n",
+ "        if good in div_group and div_name != \"all\":\n",
+ "            print(div_name, end=\", \")\n",
+ "    print(\"\\n\")"
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": null,
- "id": "f2787faf-a487-4f28-aa3c-8fdd9562550d",
+ "id": "f1052222-ad32-4e25-97ca-851cc42bf546",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "simulations = 400\n",
+ "retries = 1000\n",
+ "\n",
+ "for nfeats in (6,): #trange(1, 7)\n",
+ "    for nattack in range(100, 200, 100):\n",
+ "        best_feats = None\n",
+ "        best_feats_mean_pos = None\n",
+ "        best_successes = None\n",
+ "        for _ in trange(retries):\n",
+ "            feats = random.sample(sorted(good_inputs), nfeats)\n",
+ "            successes = {k:0 for k in range(1, 11)}\n",
+ "            mean_pos = 0\n",
+ "            for _ in range(simulations):\n",
+ "                true_mult = random.choice(list(distributions_mults.keys()))\n",
+ "                probmap = distributions_mults[true_mult]\n",
+ "                feat_vector = []\n",
+ "                for divisor in feats:  # fix: enumerate(feats) yielded (idx, divisor) tuples, breaking the probmap lookup below\n",
+ "                    prob = probmap[divisor]\n",
+ "                    sampled = binom(nattack, prob).rvs()\n",
+ "                    feat_vector.append(sampled)\n",
+ "                scoring = []\n",
+ "                for other_mult, other_probmap in distributions_mults.items():\n",
+ "                    proba = 1\n",
+ "                    for sampled, divisor in zip(feat_vector, feats):\n",
+ "                        other_p = other_probmap[divisor]\n",
+ "                        prob = binom(nattack, other_p).pmf(sampled)\n",
+ "                        proba *= prob\n",
+ "                    scoring.append((proba, other_mult))\n",
+ "                scoring.sort(key=lambda item: item[0], reverse=True)\n",
+ "                for i, (sim, other) in enumerate(scoring):\n",
+ "                    if other == true_mult:\n",
+ "                        mean_pos += i\n",
+ "                        for k in range(10):\n",
+ "                            if i <= k:\n",
+ "                                successes[k+1] += 1\n",
+ "            for i in successes.keys():\n",
+ "                successes[i] /= simulations\n",
+ "            #print(f\"{nattack:<10}: mean position {mean_pos/simulations}\")\n",
+ "            #print(f\" top1: {successes[1]}, top5: {successes[5]}, top10: {successes[10]}\")\n",
+ "            if best_feats is None or best_feats_mean_pos > mean_pos/simulations:\n",
+ "                best_feats = feats\n",
+ "                best_feats_mean_pos = mean_pos/simulations\n",
+ "                best_successes = successes\n",
+ "        print(flush=True)\n",
+ "        print(nattack)\n",
+ "        print(f\"Features: ({nfeats}) {best_feats}\")\n",
+ "        print(f\"mean_pos: {best_feats_mean_pos}\")\n",
+ "        print(f\"top1: {best_successes[1]}, top2: {best_successes[2]}, top5: {best_successes[5]}, top10: {best_successes[10]}\")"
+ ]
}
],
"metadata": {
@@ -633,7 +723,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.3"
+ "version": "3.13.1"
}
},
"nbformat": 4,