author    J08nY 2025-03-24 17:33:01 +0100
committer J08nY 2025-04-16 12:25:06 +0200
commit    bd8dbf4c97b915a9432d457c390a07ab5fe466e3 (patch)
tree      590cc22b4919f7fe0ff2896bfe524458e7dc3c6d
parent    95566a5b75b58a6a3aa2d3be9527e41585e3572a (diff)
-rw-r--r--  epare/distinguish.ipynb  666
1 file changed, 597 insertions, 69 deletions
diff --git a/epare/distinguish.ipynb b/epare/distinguish.ipynb
index 02ee444..6f5fe5c 100644
--- a/epare/distinguish.ipynb
+++ b/epare/distinguish.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "bc1528b8-61cd-4219-993f-e3f1ac79e801",
"metadata": {},
"outputs": [],
@@ -25,7 +25,7 @@
"\n",
"import numpy as np\n",
"import pandas as pd\n",
- "from scipy.stats import binom\n",
+ "from scipy.stats import binom, entropy\n",
"from scipy.spatial import distance\n",
"from tqdm.auto import tqdm, trange\n",
"from statsmodels.stats.proportion import proportion_confint\n",
@@ -104,7 +104,7 @@
"metadata": {},
"outputs": [],
"source": [
- "selected_mults = all_mults\n",
+ "selected_mults = all_mults # distributions_mults.keys()\n",
"divisor_name = \"all\"\n",
"kind = \"precomp+necessary\"\n",
"selected_divisors = divisor_map[divisor_name]"
@@ -431,23 +431,6 @@
]
},
{
- "cell_type": "markdown",
- "id": "9b94be24-d0ee-4597-ad99-4c55558a38c9",
- "metadata": {},
- "source": [
- "### Feature selection using variance"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "11ee99e6-a90e-4304-bd7d-6491ce936b57",
- "metadata": {},
- "source": [
- "### Feature selection using synthetic data\n",
- "The below contains some experiments that mostly do not work. Ignore."
- ]
- },
- {
"cell_type": "code",
"execution_count": null,
"id": "1e8440f3-f856-41e0-8d37-56b750e1309d",
@@ -630,7 +613,9 @@
"cell_type": "code",
"execution_count": null,
"id": "1f24b323-3604-4e34-a880-9dfd611fb245",
- "metadata": {},
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [],
"source": [
"good_inputs = Counter()\n",
@@ -652,59 +637,602 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "fa49b67c-0a52-443e-904c-98f0a3d7febf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def bayes(nattack: int, feat_vector: list[int], feats, probmap):\n",
+ " bayes.reverse = True\n",
+ " log_likelihood = 0.0\n",
+ " for sampled, divisor in zip(feat_vector, feats):\n",
+ " other_p = probmap[divisor]\n",
+ " log_prob = binom(nattack, other_p).logpmf(sampled)\n",
+ " log_likelihood += log_prob\n",
+ " return log_likelihood\n",
+ "\n",
+ "def euclid(nattack: int, feat_vector: list[int], feats, probmap):\n",
+ " euclid.reverse = False\n",
+ " other_vector = np.zeros(nfeats)\n",
+ " for i, divisor in enumerate(feats):\n",
+ " other_vector[i] = probmap[divisor]\n",
+ " return distance.euclidean(feat_vector, other_vector)\n",
+ "\n",
+ "# TODO: Adjust scorers to penalize/reject when sampled prob of a feature is != 1.0 but the mult has that feature at 1.0 proba.\n",
+ "\n",
+ "def one_simulation(nattack, mults, feats, scorer):\n",
+ " true_mult = random.choice(list(mults))\n",
+ " probmap = mults[true_mult]\n",
+ " feat_vector = []\n",
+ " for divisor in feats:\n",
+ " prob = probmap[divisor]\n",
+ " sampled = binom(nattack, prob).rvs()\n",
+ " feat_vector.append(sampled)\n",
+ " scoring = []\n",
+ " for other_mult, other_probmap in mults.items():\n",
+ " score = scorer(nattack, feat_vector, feats, other_probmap)\n",
+ " scoring.append((score, other_mult))\n",
+ " scoring.sort(key=lambda item: item[0], reverse=scorer.reverse)\n",
+ " for i, (sim, other) in enumerate(scoring):\n",
+ " if other == true_mult:\n",
+ " return i\n",
+ "\n",
+ "def many_simulations(nattack, mults, feats, scorer, simulations):\n",
+ " successes = {k:0 for k in range(1, 11)}\n",
+ " mean_pos = 0\n",
+ " for _ in range(simulations):\n",
+ " pos = one_simulation(nattack, mults, feats, scorer)\n",
+ " mean_pos += pos\n",
+ " for k in range(1, 11):\n",
+ " if pos + 1 <= k:\n",
+ " successes[k] += 1\n",
+ " mean_pos /= simulations\n",
+ " for i in successes.keys():\n",
+ " successes[i] /= simulations\n",
+ " return mean_pos, successes\n",
+ "\n",
+ "def find_features_random(nfeat_range, nattack_range, num_workers, feat_retries, simulations, scorer):\n",
+ " for nfeats in nfeat_range:\n",
+ " for nattack in nattack_range:\n",
+ " best_feats = None\n",
+ " best_feats_mean_pos = None\n",
+ " best_successes = None\n",
+ " with TaskExecutor(max_workers=num_workers) as pool:\n",
+ " for retry in range(feat_retries):\n",
+ " feats = random.sample(sorted(good_inputs), nfeats)\n",
+ " pool.submit_task(retry,\n",
+ " many_simulations,\n",
+ " nattack, distributions_mults, feats, scorer, simulations)\n",
+ " for i, future in tqdm(pool.as_completed(), leave=False, desc=\"Retries\", total=feat_retries, smoothing=0):\n",
+ " mean_pos, successes = future.result()\n",
+ " #print(f\"{nfeats} {nattack}({i}): mean pos {mean_pos:.2f} top1: {successes[1]:.2f}, top5: {successes[5]:.2f}, top10: {successes[10]:.2f}\")\n",
+ " if best_feats is None or best_feats_mean_pos > mean_pos:\n",
+ " best_feats = feats\n",
+ " best_feats_mean_pos = mean_pos\n",
+ " best_successes = successes\n",
+ " \n",
+ " print(f\"Best results for {nfeats} feats at {nattack} samples out of {retries} random feat subsets.\")\n",
+ " print(f\"Features: {best_feats}\")\n",
+ " print(f\"mean_pos: {best_feats_mean_pos:.2f}\")\n",
+ " print(f\"top1: {best_successes[1]:.2f}, top2: {best_successes[2]:.2f}, top5: {best_successes[5]:.2f}, top10: {best_successes[10]:.2f}\")"
+ ]
+ },
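+ {
+ "cell_type": "markdown",
+ "id": "0f3b9a1e-1111-4aaa-8bbb-000000000001",
+ "metadata": {},
+ "source": [
+ "A quick sanity check of the scorers above (a sketch: the multipliers, divisor names and probabilities below are made up, not taken from `distributions_mults`). With two well-separated toy multipliers, `one_simulation` should rank the true one first (position 0) in nearly every run."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f3b9a1e-1111-4aaa-8bbb-000000000002",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "toy_mults = {\n",
+ "    \"mult_a\": {\"d1\": 0.9, \"d2\": 0.1},\n",
+ "    \"mult_b\": {\"d1\": 0.1, \"d2\": 0.9},\n",
+ "}\n",
+ "toy_feats = [\"d1\", \"d2\"]\n",
+ "positions = [one_simulation(100, toy_mults, toy_feats, bayes) for _ in range(50)]\n",
+ "print(f\"true multiplier ranked first in {positions.count(0)}/50 runs\")"
+ ]
+ },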
+ {
+ "cell_type": "code",
+ "execution_count": null,
"id": "f1052222-ad32-4e25-97ca-851cc42bf546",
"metadata": {},
"outputs": [],
"source": [
- "simulations = 400\n",
- "retries = 1000\n",
+ "simulations = 500\n",
+ "retries = 200\n",
+ "nfeats = trange(1, 11, leave=False, desc=\"nfeats\")\n",
+ "nattack = trange(50, 350, 50, leave=False, desc=\"nattack\")\n",
+ "num_workers = 30\n",
"\n",
- "for nfeats in (6,): #trange(1, 7)\n",
- " for nattack in range(100, 200, 100):\n",
- " best_feats = None\n",
- " best_feats_mean_pos = None\n",
- " best_successes = None\n",
- " for _ in trange(retries):\n",
- " feats = random.sample(sorted(good_inputs), nfeats)\n",
- " successes = {k:0 for k in range(1, 11)}\n",
- " mean_pos = 0\n",
- " for _ in range(simulations):\n",
- " true_mult = random.choice(list(distributions_mults.keys()))\n",
- " probmap = distributions_mults[true_mult]\n",
- " feat_vector = []\n",
- " for divisor in enumerate(feats):\n",
- " prob = probmap[divisor]\n",
- " sampled = binom(nattack, prob).rvs()\n",
- " feat_vector.append(sampled)\n",
- " scoring = []\n",
- " for other_mult, other_probmap in distributions_mults.items():\n",
- " proba = 1\n",
- " for sampled, divisor in zip(feat_vector, feats):\n",
- " other_p = other_probmap[divisor]\n",
- " prob = binom(nattack, other_p).pmf(sampled)\n",
- " proba *= prob\n",
- " scoring.append((proba, other_mult))\n",
- " scoring.sort(key=lambda item: item[0], reverse=True)\n",
- " for i, (sim, other) in enumerate(scoring):\n",
- " if other == true_mult:\n",
- " mean_pos += i\n",
- " for k in range(10):\n",
- " if i <= k:\n",
- " successes[k+1] +=1\n",
- " for i in successes.keys():\n",
- " successes[i] /= simulations\n",
- " #print(f\"{nattack:<10}: mean position {mean_pos/simulations}\")\n",
- " #print(f\" top1: {successes[1]}, top5: {successes[5]}, top10: {successes[10]}\")\n",
- " if best_feats is None or best_feats_mean_pos > mean_pos/simulations:\n",
- " best_feats = feats\n",
- " best_feats_mean_pos = mean_pos/simulations\n",
- " best_successes = successes\n",
- " print(flush=True)\n",
- " print(nattack)\n",
- " print(f\"Features: ({nfeats}) {best_feats}\")\n",
- " print(f\"mean_pos: {best_feats_mean_pos}\")\n",
- " print(f\"top1: {best_successes[1]}, top2: {best_successes[2]}, top5: {best_successes[5]}, top10: {best_successes[10]}\")"
+ "find_features_random(nfeats, nattack, num_workers, retries, simulations, bayes)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6df03e89-d517-4243-bbfb-a5f52de24bb1",
+ "metadata": {},
+ "source": [
+ "### Feature selection via greedy classification\n",
+ "We can also use the classifier itself for feature selection. We iterate over all the divisors to pick the first feature with the best classifier results in simulation. Then we iteratively add features to it."
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "93c778a4-0855-4248-91a9-750fdd76ffa6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def find_features_iteratively(nfeats, nattack, num_workers, simulations, scorer, start_features=None):\n",
+ " available_feats = selected_divisors\n",
+ " feats = []\n",
+ " if start_features is not None:\n",
+ " if nfeats <= len(start_features):\n",
+ " raise ValueError(\"Features already picked.\")\n",
+ " feats.extend(start_features)\n",
+ " for feat in start_features:\n",
+ " available_feats.remove(feat)\n",
+ "\n",
+ " with TaskExecutor(max_workers=num_workers) as pool:\n",
+ " while len(feats) < nfeats:\n",
+ " for feat in available_feats:\n",
+ " pool.submit_task(feat,\n",
+ " many_simulations,\n",
+ " nattack, distributions_mults, feats + [feat], scorer, simulations)\n",
+ " best_feat = None\n",
+ " best_feat_mean_pos = None\n",
+ " best_successes = Noned\n",
+ " for feat, future in tqdm(pool.as_completed(), total=len(available_feats), desc=f\"Picking feature {len(feats)}\", smoothing=0):\n",
+ " mean_pos, successes = future.result()\n",
+ " if best_feat is None or best_feat_mean_pos > mean_pos:\n",
+ " best_feat = feat\n",
+ " best_feat_mean_pos = mean_pos\n",
+ " best_successes = successes\n",
+ " print(f\"Picked {best_feat} with mean pos: {mean_pos:.2f}\")\n",
+ " print(f\"top1: {best_successes[1]:.2f}, top2: {best_successes[2]:.2f}, top5: {best_successes[5]:.2f}, top10: {best_successes[10]:.2f}\")\n",
+ " feats.append(best_feat)\n",
+ " available_feats.remove(best_feat)\n",
+ " return feats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6e4c2313-83b0-43f8-80d6-14c39be0d9ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nfeats = 5\n",
+ "nattack = 100\n",
+ "num_workers = 30\n",
+ "simulations = 200\n",
+ "scorer = bayes\n",
+ "\n",
+ "feats = find_features_iteratively(nfeats, nattack, num_workers, simulations, scorer)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c030a1c-c13d-401a-bcdb-212c064681e4",
+ "metadata": {},
+ "source": [
+ "### Feature selection via mRMR using mutual information"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd769175-c188-411c-af36-2973e7a0ffd1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def mutual_information(class_priors, p_ci_list, n):\n",
+ " \"\"\"\n",
+ " Compute mutual information I(X; Y) for a binomial feature with given class parameters.\n",
+ " \n",
+ " Args:\n",
+ " class_priors (np.array): P(Y=c), shape (num_classes,)\n",
+ " p_ci_list (np.array): Binomial parameters [p_{c,i}] for each class c, shape (num_classes,)\n",
+ " n (int): Number of trials in binomial distribution\n",
+ " \n",
+ " Returns:\n",
+ " float: Mutual information I(X; Y)\n",
+ " \"\"\"\n",
+ " num_classes = len(class_priors)\n",
+ " \n",
+ " # Precompute all PMFs across x and classes\n",
+ " x_values = np.arange(0, n + 1)[:, None] # (n+1, 1)\n",
+ " pmfs = binom.pmf(x_values, n, p_ci_list[None, :]) # Shape: (n+1, num_classes)\n",
+ " \n",
+ " # Compute joint probabilities P(Y=c) * P(X=x | Y=c)\n",
+ " # Multiply class_priors (shape C) with each row of pmfs (each x has shape (C,))\n",
+ " # class_priors[None, :] becomes (1, C), so broadcasting works.\n",
+ " joint_probs = pmfs * class_priors[None, :]\n",
+ " \n",
+ " # Compute P(X=x) for all x\n",
+ " px = np.sum(joint_probs, axis=1)\n",
+ "\n",
+ " # Compute H(Y|X):\n",
+ " h_ygx = 0.0\n",
+ "\n",
+ " for x_idx in range(n + 1):\n",
+ " current_px = px[x_idx]\n",
+ " \n",
+ " if current_px < 1e-9: # Skip negligible probabilities\n",
+ " continue\n",
+ " \n",
+ " cond_probs = joint_probs[x_idx] / current_px # P(Y=c | X=x)\n",
+ " \n",
+ " # Compute entropy H(Y|X=x) using scipy's entropy function\n",
+ " h_x = entropy(cond_probs, base=2)\n",
+ " \n",
+ " h_ygx += current_px * h_x\n",
+ " \n",
+ " # Prior entropy H(Y)\n",
+ " h_y = entropy(class_priors, base=2)\n",
+ "\n",
+ " return h_y - h_ygx\n",
+ "\n",
+ "\n",
+ "def mutual_information_between_features(class_priors, p_ci_i, p_ci_j, n):\n",
+ " \"\"\"\n",
+ " Compute mutual information between two features X_i and X_j.\n",
+ " \n",
+ " Parameters:\n",
+ " class_priors (array): Prior probabilities of each class. Shape: (num_classes,)\n",
+ " p_ci_i (array): Binomial parameters for feature i across classes. Shape: (num_classes,)\n",
+ " p_ci_j (array): Binomial parameters for feature j across classes. Shape: (num_classes,)\n",
+ " n (int): Number of trials for the binomial distribution.\n",
+ " \n",
+ " Returns:\n",
+ " float: Mutual information I(X_i; X_j)\n",
+ " \"\"\"\n",
+ " num_classes = len(class_priors)\n",
+ " x_vals = np.arange(0, n + 1) # Possible values of features\n",
+ " \n",
+ " ### Compute marginal distributions P(Xi=x), P(Xj=y) ###\n",
+ " # PMF for feature i across all classes\n",
+ " pmf_i_per_class = binom.pmf(x_vals[:, None], n, p_ci_i[None, :])\n",
+ " px_i = np.sum(pmf_i_per_class * class_priors[None, :], axis=1)\n",
+ " entropy_xi = entropy(px_i, base=2) if not np.allclose(px_i, 0.0) else 0.0\n",
+ " \n",
+ " # PMF for feature j across all classes\n",
+ " pmf_j_per_class = binom.pmf(x_vals[:, None], n, p_ci_j[None, :])\n",
+ " px_j = np.sum(pmf_j_per_class * class_priors[None, :], axis=1)\n",
+ " entropy_xj = entropy(px_j, base=2) if not np.allclose(px_j, 0.0) else 0.0\n",
+ " \n",
+ " ### Compute joint distribution P(Xi=x, Xj=y) ###\n",
+ " joint_xy = np.zeros((n + 1, n + 1))\n",
+ " \n",
+ " for c in range(num_classes):\n",
+ " pmf_i_c = binom.pmf(x_vals, n, p_ci_i[c])\n",
+ " pmf_j_c = binom.pmf(x_vals, n, p_ci_j[c])\n",
+ " \n",
+ " # Outer product gives joint PMF for class c\n",
+ " outer = np.outer(pmf_i_c, pmf_j_c)\n",
+ " joint_xy += class_priors[c] * outer\n",
+ " \n",
+ " # Compute entropy of the joint distribution\n",
+ " epsilon = 1e-10 # To avoid log(0) issues\n",
+ " non_zero = (joint_xy > epsilon)\n",
+ " entropy_joint = -np.sum(joint_xy[non_zero] * np.log2(joint_xy[non_zero]))\n",
+ " \n",
+ " ### Mutual Information ###\n",
+ " mi = entropy_xi + entropy_xj - entropy_joint\n",
+ " \n",
+ " return mi\n",
+ "\n",
+ "\n",
+ "def conditional_mutual_info(class_priors, XJ_params, XK_params, n):\n",
+ " \"\"\"\n",
+ " Compute I(XK; Y | XJ) using vectorization with broadcasting.\n",
+ " \n",
+ " Args:\n",
+ " XJ_params (array): p_{c,J} for all classes c.\n",
+ " XK_params (array): p_{c,K} for all classes c.\n",
+ " class_priors (array): P(Y=c) for all classes c.\n",
+ " n (int): Number of trials in the binomial distribution.\n",
+ "\n",
+ " Returns:\n",
+ " float: Conditional mutual information I(XK; Y | XJ).\n",
+ " \"\"\"\n",
+ " K = len(class_priors)\n",
+ " x_values = np.arange(n + 1)\n",
+ "\n",
+ " # Precompute PMFs for each class\n",
+ " P_XJ_giv_Y = binom.pmf(x_values[:, None], n, XJ_params) \n",
+ " P_XK_giv_Y = binom.pmf(x_values[:, None], n, XK_params) \n",
+ "\n",
+ " P_XJ_T = P_XJ_giv_Y.T # Shape: (K, n+1)\n",
+ " P_XK_T = P_XK_giv_Y.T\n",
+ "\n",
+ " ######################################################################\n",
+ " ### Compute H(Y | XJ) ###############################################\n",
+ " ######################################################################\n",
+ "\n",
+ " # Calculate P(XJ=xj) for all xj\n",
+ " P_XJ_total = np.dot(class_priors, P_XJ_T)\n",
+ "\n",
+ " # Numerators of posterior probabilities P(Y=c | XJ=xj)\n",
+ " numerators_YgXJ = class_priors[:, None] * P_XJ_T \n",
+ "\n",
+ " valid_mask = P_XJ_total > 1e-9\n",
+ " posterior_YgXJ = np.zeros_like(numerators_YgXJ, dtype=float)\n",
+ " posterior_YgXJ[:, valid_mask] = (\n",
+ " numerators_YgXJ[:, valid_mask] / \n",
+ " P_XJ_total[valid_mask]\n",
+ " )\n",
+ "\n",
+ " log_p = np.log2(posterior_YgXJ + 1e-9) \n",
+ " entropy_terms_HYgXJ = -np.sum(\n",
+ " posterior_YgXJ * log_p, \n",
+ " axis=0,\n",
+ " where=(posterior_YgXJ > 1e-9)\n",
+ " )\n",
+ " \n",
+ " H_Y_given_XJ = np.dot(P_XJ_total, entropy_terms_HYgXJ)\n",
+ "\n",
+ " ######################################################################\n",
+ " ### Compute H(Y | XJ, XK) ###########################################\n",
+ " ######################################################################\n",
+ "\n",
+ " # Broadcast to compute joint PMF P(XJ=xj, XK=xk | Y=c)\n",
+ " P_XJ_giv_Y_T = P_XJ_T[..., None] # Shape: (K, n+1, 1)\n",
+ " P_XK_giv_Y_T = P_XK_T[:, None, :] # Shape: (K, 1, n+1)\n",
+ "\n",
+ " joint_pmf_conditional = (\n",
+ " P_XJ_giv_Y_T * \n",
+ " P_XK_giv_Y_T\n",
+ " ) # Shape: (K, n+1, n+1)\n",
+ "\n",
+ " numerators = class_priors[:, None, None] * joint_pmf_conditional \n",
+ "\n",
+ " denominators = np.sum(numerators, axis=0) # Shape: (n+1, n+1)\n",
+ "\n",
+ " valid_mask_3d = (denominators > 1e-9)[None, ...] # Expand for class dimension\n",
+ "\n",
+ " # Compute posterior probabilities using broadcasting and where\n",
+ " posterior_YgXJXK = numerators / denominators[None, ...]\n",
+ " posterior_YgXJXK = np.where(valid_mask_3d, posterior_YgXJXK, 0.0)\n",
+ "\n",
+ " log_p_joint = np.log2(posterior_YgXJXK + 1e-9) \n",
+ " entropy_terms_HYgXJXK = -np.sum(\n",
+ " posterior_YgXJXK * log_p_joint,\n",
+ " axis=0, # Sum over classes (axis 0 is K)\n",
+ " where=(posterior_YgXJXK > 1e-9),\n",
+ " )\n",
+ "\n",
+ " H_Y_given_XJXK = np.sum(denominators * entropy_terms_HYgXJXK)\n",
+ "\n",
+ " ######################################################################\n",
+ " ### Compute CMI #####################################################\n",
+ " ######################################################################\n",
+ "\n",
+ " cmi = H_Y_given_XJ - H_Y_given_XJXK\n",
+ "\n",
+ " return max(cmi, 0.0) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "febfb392-017f-442f-8aaa-cb48bcdb9a6b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nmults = len(distributions_mults.keys())\n",
+ "nallfeats = len(selected_divisors)\n",
+ "priors = np.full(nmults, 1/nmults, dtype=np.float64)\n",
+ "probs = np.zeros((nallfeats, nmults), dtype=np.float64)\n",
+ "for i, divisor in enumerate(selected_divisors):\n",
+ " for j, (mult, probmap) in enumerate(distributions_mults.items()):\n",
+ " probs[i, j] = probmap[divisor]\n",
+ "\n",
+ "nattack = 100\n",
+ "mis = []\n",
+ "relevance = np.zeros(nallfeats, dtype=np.float64)\n",
+ "for i, divisor in enumerate(selected_divisors):\n",
+ " mi = mutual_information(priors, probs[i, ], nattack)\n",
+ " relevance[i] = mi\n",
+ " mis.append((mi, divisor))\n",
+ "mis.sort(key=lambda item: item[0], reverse=True)\n",
+ "\n",
+ "print(\"Top 10 feats\")\n",
+ "for mi, divisor in mis[:10]:\n",
+ " print(f\"{divisor} {mi:.3f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8361d1a3-87d1-4d35-9a8a-1a9a4a6eb638",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "num_workers = 30\n",
+ "\n",
+ "redundancy = np.zeros((nallfeats, nallfeats), dtype=np.float64)\n",
+ "with TaskExecutor(max_workers=num_workers) as pool:\n",
+ " for i in trange(nallfeats):\n",
+ " for j in range(nallfeats):\n",
+ " if i < j:\n",
+ " continue\n",
+ " pool.submit_task((i, j),\n",
+ " mutual_information_between_features,\n",
+ " priors, probs[i, ], probs[j, ], nattack)\n",
+ " for (i, j), future in pool.as_completed():\n",
+ " mi = future.result()\n",
+ " redundancy[i][j] = mi\n",
+ " redundancy[j][i] = mi\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d64cf0a1-a83f-4837-8113-5de536fb0f09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"relevance.pickle\", \"wb\") as f:\n",
+ " pickle.dump(relevance, f)\n",
+ "with open(\"redundancy.pickle\", \"wb\") as f:\n",
+ " pickle.dump(redundancy, f)"
+ ]
+ },
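+ {
+ "cell_type": "markdown",
+ "id": "0f3b9a1e-1111-4aaa-8bbb-000000000005",
+ "metadata": {},
+ "source": [
+ "The pairwise redundancy matrix is expensive to compute, so later runs can load the pickles back instead (assuming the files written above exist):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f3b9a1e-1111-4aaa-8bbb-000000000006",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"relevance.pickle\", \"rb\") as f:\n",
+ "    relevance = pickle.load(f)\n",
+ "with open(\"redundancy.pickle\", \"rb\") as f:\n",
+ "    redundancy = pickle.load(f)"
+ ]
+ },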
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3223edc-f3b2-4137-bc1f-75e311ff075e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def mrmr_selection(relevance, redundancy, nfeatsall, nfeats):\n",
+ " \"\"\"\n",
+ " Select top features using mRMR.\n",
+ " \n",
+ " Parameters:\n",
+ " features: List of feature parameter arrays (each is a binomial parameter array for all classes)\n",
+ " class_priors: Prior probabilities of each class\n",
+ " n: Number of trials in the binomial distribution\n",
+ " num_features_to_select: Desired number of features\n",
+ " \n",
+ " Returns:\n",
+ " indices of selected features.\n",
+ " \"\"\"\n",
+ " selected_indices = []\n",
+ " remaining_indices = list(range(nfeatsall))\n",
+ " \n",
+ " # Initialize by selecting the most relevant feature\n",
+ " first_feature_idx = np.argmax(relevance)\n",
+ " selected_indices.append(first_feature_idx)\n",
+ " remaining_indices.remove(first_feature_idx)\n",
+ " \n",
+ " while len(selected_indices) < nfeats:\n",
+ " candidates_scores = []\n",
+ " \n",
+ " for candidate in remaining_indices:\n",
+ " # Compute mRMR score: relevance - average redundancy with selected features\n",
+ " current_relevance = relevance[candidate]\n",
+ " \n",
+ " avg_red = 0.0\n",
+ " if len(selected_indices) > 0:\n",
+ " sum_red = np.sum(redundancy[candidate][selected_indices])\n",
+ " avg_red = sum_red / len(selected_indices)\n",
+ " \n",
+ " score = current_relevance - avg_red\n",
+ " candidates_scores.append(score)\n",
+ " \n",
+ " # Select the candidate with highest score\n",
+ " best_candidate_idx = remaining_indices[np.argmax(candidates_scores)]\n",
+ " selected_indices.append(best_candidate_idx)\n",
+ " remaining_indices.remove(best_candidate_idx)\n",
+ " \n",
+ " return selected_indices"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5604d599-ef63-49fc-a7c8-65fa90c15620",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nfeatsall = len(selected_divisors)\n",
+ "selected_mrmr = mrmr_selection(relevance, redundancy, nfeatsall, nfeats=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c33498a-99b0-41bb-bc3d-7bbe8906fcc4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for i in selected_mrmr:\n",
+ " print(selected_divisors[i])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a12b75cd-3c62-4b87-a7df-f0c5f7748386",
+ "metadata": {},
+ "source": [
+ "## Feature selection via JMI"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8b3f827-baef-49c2-af60-0be74ff0efa2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def jmi_selection(features_params_list, class_priors, n_trials, relevance, nfeatsall, nfeats):\n",
+ " \"\"\"\n",
+ " Select top features using JMI.\n",
+ " \n",
+ " Parameters:\n",
+ " features: List of feature parameter arrays (each is a binomial parameter array for all classes)\n",
+ " class_priors: Prior probabilities of each class\n",
+ " n: Number of trials in the binomial distribution\n",
+ " num_features_to_select: Desired number of features\n",
+ " \n",
+ " Returns:\n",
+ " indices of selected features.\n",
+ " \"\"\"\n",
+ " selected_indices = []\n",
+ " remaining_indices = list(range(nfeatsall))\n",
+ " \n",
+ " # Initialize by selecting the most relevant feature\n",
+ " first_feature_idx = np.argmax(relevance)\n",
+ " selected_indices.append(first_feature_idx)\n",
+ " remaining_indices.remove(first_feature_idx)\n",
+ " \n",
+ " while len(selected_indices) < nfeats:\n",
+ " candidates_scores = []\n",
+ " \n",
+ " for candidate in tqdm(remaining_indices):\n",
+ " # Compute mRMR score: relevance - average redundancy with selected features\n",
+ " current_relevance = relevance[candidate]\n",
+ " \n",
+ " sum_cmi = 0.0\n",
+ " for selected in selected_indices:\n",
+ " XJ_params = features_params_list[selected]\n",
+ " XK_params = features_params_list[candidate]\n",
+ " \n",
+ " cmi_val = conditional_mutual_info(\n",
+ " class_priors=class_priors,\n",
+ " XJ_params=XJ_params,\n",
+ " XK_params=XK_params,\n",
+ " n=n_trials\n",
+ " )\n",
+ " sum_cmi += cmi_val\n",
+ " avg_cmi = sum_cmi / len(selected_indices)\n",
+ " score = current_relevance + avg_cmi\n",
+ " candidates_scores.append(score)\n",
+ " \n",
+ " # Select the candidate with highest score\n",
+ " best_candidate_idx = remaining_indices[np.argmax(candidates_scores)]\n",
+ " selected_indices.append(best_candidate_idx)\n",
+ " remaining_indices.remove(best_candidate_idx)\n",
+ " \n",
+ " return selected_indices"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6739192e-879a-4862-b862-6a8fc3939b73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nfeatsall = len(selected_divisors)\n",
+ "selected_jmi = jmi_selection(probs, priors, nattack, relevance, nfeatsall, nfeats=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "27dfa144-309a-4624-a709-e0a3f0704d6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for i in selected_jmi:\n",
+ " print(selected_divisors[i])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da7cdd96-0891-4bd9-85e9-07a701240749",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -723,7 +1251,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.13.1"
+ "version": "3.12.3"
}
},
"nbformat": 4,