aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--epare/distinguish.ipynb96
1 files changed, 93 insertions, 3 deletions
diff --git a/epare/distinguish.ipynb b/epare/distinguish.ipynb
index 875b9ed..02ee444 100644
--- a/epare/distinguish.ipynb
+++ b/epare/distinguish.ipynb
@@ -609,12 +609,102 @@
]
},
{
+ "cell_type": "markdown",
+ "id": "f16a5868-e92c-4b84-9f19-664627d9848a",
+ "metadata": {},
+ "source": [
+ "## Simulate distinguishing using a Bayes classifier"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed81e076-9ccb-445d-ada9-384b73efb2c5",
+ "metadata": {},
+ "source": [
+ "### Feature selection using trees\n",
+ "\n",
+ "We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building chooses as the features. However, we can also use more conventional feature selection approaches."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f24b323-3604-4e34-a880-9dfd611fb245",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "good_inputs = Counter()\n",
+ "# Walk the whole decision tree; every internal node contributes the input it splits on.\n",
+ "for node in PreOrderIter(tree.root):\n",
+ "    if node.is_leaf:\n",
+ "        continue\n",
+ "    good_inputs[node.dmap_input] += 1\n",
+ "for good in sorted(good_inputs):\n",
+ "    print(good)\n",
+ "    print(bin(good))\n",
+ "    print(f\"used {good_inputs[good]} times\")\n",
+ "    print(f\"nbits {good.bit_length()}\")\n",
+ "    for div_name, div_group in divisor_map.items():\n",
+ "        if good in div_group and div_name != \"all\":\n",
+ "            print(div_name, end=\", \")\n",
+ "    print(\"\\n\")"
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": null,
- "id": "f2787faf-a487-4f28-aa3c-8fdd9562550d",
+ "id": "f1052222-ad32-4e25-97ca-851cc42bf546",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "simulations = 400\n",
+ "retries = 1000\n",
+ "\n",
+ "for nfeats in (6,): #trange(1, 7)\n",
+ "    for nattack in range(100, 200, 100):\n",
+ "        best_feats = None\n",
+ "        best_feats_mean_pos = None\n",
+ "        best_successes = None\n",
+ "        for _ in trange(retries):\n",
+ "            feats = random.sample(sorted(good_inputs), nfeats)\n",
+ "            successes = {k:0 for k in range(1, 11)}\n",
+ "            mean_pos = 0\n",
+ "            for _ in range(simulations):\n",
+ "                true_mult = random.choice(list(distributions_mults.keys()))\n",
+ "                probmap = distributions_mults[true_mult]\n",
+ "                feat_vector = []\n",
+ "                for divisor in feats:  # fix: enumerate(feats) yielded (idx, divisor) tuples, breaking the probmap lookup below\n",
+ "                    prob = probmap[divisor]\n",
+ "                    sampled = binom(nattack, prob).rvs()\n",
+ "                    feat_vector.append(sampled)\n",
+ "                scoring = []\n",
+ "                for other_mult, other_probmap in distributions_mults.items():\n",
+ "                    proba = 1\n",
+ "                    for sampled, divisor in zip(feat_vector, feats):\n",
+ "                        other_p = other_probmap[divisor]\n",
+ "                        prob = binom(nattack, other_p).pmf(sampled)\n",
+ "                        proba *= prob\n",
+ "                    scoring.append((proba, other_mult))\n",
+ "                scoring.sort(key=lambda item: item[0], reverse=True)\n",
+ "                for i, (sim, other) in enumerate(scoring):\n",
+ "                    if other == true_mult:\n",
+ "                        mean_pos += i\n",
+ "                        for k in range(10):\n",
+ "                            if i <= k:\n",
+ "                                successes[k+1] += 1\n",
+ "            for i in successes.keys():\n",
+ "                successes[i] /= simulations\n",
+ "            #print(f\"{nattack:<10}: mean position {mean_pos/simulations}\")\n",
+ "            #print(f\" top1: {successes[1]}, top5: {successes[5]}, top10: {successes[10]}\")\n",
+ "            if best_feats is None or best_feats_mean_pos > mean_pos/simulations:\n",
+ "                best_feats = feats\n",
+ "                best_feats_mean_pos = mean_pos/simulations\n",
+ "                best_successes = successes\n",
+ "        print(flush=True)\n",
+ "        print(nattack)\n",
+ "        print(f\"Features: ({nfeats}) {best_feats}\")\n",
+ "        print(f\"mean_pos: {best_feats_mean_pos}\")\n",
+ "        print(f\"top1: {best_successes[1]}, top2: {best_successes[2]}, top5: {best_successes[5]}, top10: {best_successes[10]}\")"
+ ]
}
],
"metadata": {
@@ -633,7 +723,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.3"
+ "version": "3.13.1"
}
},
"nbformat": 4,