aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJ08nY2025-03-24 18:29:35 +0100
committerJ08nY2025-04-16 12:25:06 +0200
commit28453a025209a201df20bdbb1c6e4dd39a313fc3 (patch)
tree5a48f1cf401255a56a48b2b156b7092485e12b95
parentb51d7a41141a47be39886eb4af61ad39714eaeef (diff)
downloadECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.gz
ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.zst
ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.zip
-rw-r--r--epare/distinguish.ipynb173
1 files changed, 7 insertions, 166 deletions
diff --git a/epare/distinguish.ipynb b/epare/distinguish.ipynb
index 6f5fe5c..0ca42c7 100644
--- a/epare/distinguish.ipynb
+++ b/epare/distinguish.ipynb
@@ -106,7 +106,7 @@
"source": [
"selected_mults = all_mults # distributions_mults.keys()\n",
"divisor_name = \"all\"\n",
- "kind = \"precomp+necessary\"\n",
+ "kind = \"all\"\n",
"selected_divisors = divisor_map[divisor_name]"
]
},
@@ -261,7 +261,9 @@
"cell_type": "code",
"execution_count": null,
"id": "de577429-d87c-4967-be17-75cbb378860c",
- "metadata": {},
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [],
"source": [
"print(tree.render_basic())"
@@ -344,7 +346,7 @@
"id": "62d2f2a2-495e-459d-b0e2-89c9a5973b1e",
"metadata": {},
"source": [
- "### Feature selection using trees\n",
+ "### Feature selection using trees + classification error\n",
"\n",
"We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building chooses as the features. However, we can also use more conventional feature selection approaches."
]
@@ -431,167 +433,6 @@
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "1e8440f3-f856-41e0-8d37-56b750e1309d",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Lets pick n as if we were doing the reversing\n",
- "n = 100\n",
- "# Lets pick m as the number of repeats\n",
- "m = 100\n",
- "# then for each mult and each divisor (thus each point) do binom(n, p) m times, save this synthetic data\n",
- "nmults = len(distributions_mults)\n",
- "ndivs = len(selected_divisors)\n",
- "base_X = np.zeros((nmults, ndivs))\n",
- "base_y = np.zeros(nmults)\n",
- "synthetic_X = np.zeros((nmults * m, ndivs))\n",
- "synthetic_y = np.zeros(nmults * m)\n",
- "for i, (mult, probmap) in enumerate(distributions_mults.items()):\n",
- " for j, divisor in enumerate(selected_divisors):\n",
- " p = probmap[divisor]\n",
- " r = binom.rvs(n, p, size=m) / n\n",
- " synthetic_X[i*m:(i+1)*m, j] = r\n",
- " base_X[i, j] = p\n",
- " synthetic_y[i*m:(i+1)*m] = i\n",
- " base_y[i] = i\n",
- "print(synthetic_X)\n",
- "# so we have !mults! classes and !mults! * m samples\n",
- "# on this synthetic data we can run whatever"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6396296e-9352-4599-8ee9-45f9b4f4ce70",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.feature_selection import SelectKBest, SelectFdr, SelectFpr, SelectFwe, SequentialFeatureSelector\n",
- "from sklearn.feature_selection import f_classif, mutual_info_classif, chi2\n",
- "from sklearn.neighbors import KNeighborsClassifier\n",
- "\n",
- "from sklearn.datasets import load_iris"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6d7050a1-b1ef-4eed-a885-cc11d8703b24",
- "metadata": {},
- "outputs": [],
- "source": [
- "selection = SelectKBest(f_classif, k=10).fit(synthetic_X, synthetic_y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "393d9d99-67e1-4d0a-b4ad-a0adcd6491d8",
- "metadata": {},
- "outputs": [],
- "source": [
- "len(selection.get_feature_names_out())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0f324988-04bd-4c87-9af0-45abe1ebb6e9",
- "metadata": {},
- "outputs": [],
- "source": [
- "for divisor, present in zip(selected_divisors, selection.get_support()):\n",
- " if present:\n",
- " print(divisor)\n",
- " print(bin(divisor))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f0a40bd9-753e-4bc4-9bc7-f0eb2f96ce7b",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_new = selection.transform(synthetic_X)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "88809007-7b21-4985-83f9-f4cd9247fccf",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_new.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cddc8885-37ad-4225-b83f-4798018f80f3",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn import tree"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e11a8fc4-0df9-4cdc-a6d3-2d6297b8e085",
- "metadata": {},
- "outputs": [],
- "source": [
- "clf = tree.DecisionTreeClassifier()\n",
- "clf = clf.fit(synthetic_X, synthetic_y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8e9a91df-845c-4eaa-944e-62d07d7cb1c6",
- "metadata": {},
- "outputs": [],
- "source": [
- "clf"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "21516983-06be-4ad9-91f4-7454eacbf121",
- "metadata": {},
- "outputs": [],
- "source": [
- "from mrmr import mrmr_classif"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7c2215a5-c073-4118-a21e-e78afb724eda",
- "metadata": {},
- "outputs": [],
- "source": [
- "selected_features = mrmr_classif(X=pd.DataFrame(synthetic_X), y=pd.Series(synthetic_y), K=35)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c05c7b33-6a75-4477-97a0-6b70808d0e1e",
- "metadata": {},
- "outputs": [],
- "source": [
- "for selected in selected_features:\n",
- " divisor = selected_divisors[selected]\n",
- " print(divisor, bin(divisor))"
- ]
- },
- {
"cell_type": "markdown",
"id": "f16a5868-e92c-4b84-9f19-664627d9848a",
"metadata": {},
@@ -604,7 +445,7 @@
"id": "ed81e076-9ccb-445d-ada9-384b73efb2c5",
"metadata": {},
"source": [
- "### Feature selection using trees\n",
+ "### Feature selection using trees + classification error\n",
"\n",
"We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building chooses as the features. However, we can also use more conventional feature selection approaches."
]
@@ -1142,7 +983,7 @@
"id": "a12b75cd-3c62-4b87-a7df-f0c5f7748386",
"metadata": {},
"source": [
- "## Feature selection via JMI"
+ "### Feature selection via JMI"
]
},
{