author: J08nY 2025-03-24 18:29:35 +0100
committer: J08nY 2025-04-16 12:25:06 +0200
commit: 28453a025209a201df20bdbb1c6e4dd39a313fc3 (patch)
tree: 5a48f1cf401255a56a48b2b156b7092485e12b95
parent: b51d7a41141a47be39886eb4af61ad39714eaeef (diff)
download: ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.gz
ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.zst
ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.zip
1 files changed, 7 insertions, 166 deletions
diff --git a/epare/distinguish.ipynb b/epare/distinguish.ipynb
index 6f5fe5c..0ca42c7 100644
--- a/epare/distinguish.ipynb
+++ b/epare/distinguish.ipynb
@@ -106,7 +106,7 @@
    "source": [
     "selected_mults = all_mults # distributions_mults.keys()\n",
     "divisor_name = \"all\"\n",
-    "kind = \"precomp+necessary\"\n",
+    "kind = \"all\"\n",
     "selected_divisors = divisor_map[divisor_name]"
    ]
   },
@@ -261,7 +261,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "de577429-d87c-4967-be17-75cbb378860c",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "print(tree.render_basic())"
@@ -344,7 +346,7 @@
    "id": "62d2f2a2-495e-459d-b0e2-89c9a5973b1e",
    "metadata": {},
    "source": [
-    "### Feature selection using trees\n",
+    "### Feature selection using trees + classification error\n",
     "\n",
     "We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building chooses as the features. However, we can also use more conventional feature selection approaches."
    ]
@@ -431,167 +433,6 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1e8440f3-f856-41e0-8d37-56b750e1309d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Lets pick n as if we were doing the reversing\n",
-    "n = 100\n",
-    "# Lets pick m as the number of repeats\n",
-    "m = 100\n",
-    "# then for each mult and each divisor (thus each point) do binom(n, p) m times, save this synthetic data\n",
-    "nmults = len(distributions_mults)\n",
-    "ndivs = len(selected_divisors)\n",
-    "base_X = np.zeros((nmults, ndivs))\n",
-    "base_y = np.zeros(nmults)\n",
-    "synthetic_X = np.zeros((nmults * m, ndivs))\n",
-    "synthetic_y = np.zeros(nmults * m)\n",
-    "for i, (mult, probmap) in enumerate(distributions_mults.items()):\n",
-    "    for j, divisor in enumerate(selected_divisors):\n",
-    "        p = probmap[divisor]\n",
-    "        r = binom.rvs(n, p, size=m) / n\n",
-    "        synthetic_X[i*m:(i+1)*m, j] = r\n",
-    "        base_X[i, j] = p\n",
-    "    synthetic_y[i*m:(i+1)*m] = i\n",
-    "    base_y[i] = i\n",
-    "print(synthetic_X)\n",
-    "# so we have !mults! classes and !mults! * m samples\n",
-    "# on this synthetic data we can run whatever"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6396296e-9352-4599-8ee9-45f9b4f4ce70",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.feature_selection import SelectKBest, SelectFdr, SelectFpr, SelectFwe, SequentialFeatureSelector\n",
-    "from sklearn.feature_selection import f_classif, mutual_info_classif, chi2\n",
-    "from sklearn.neighbors import KNeighborsClassifier\n",
-    "\n",
-    "from sklearn.datasets import load_iris"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6d7050a1-b1ef-4eed-a885-cc11d8703b24",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "selection = SelectKBest(f_classif, k=10).fit(synthetic_X, synthetic_y)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "393d9d99-67e1-4d0a-b4ad-a0adcd6491d8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(selection.get_feature_names_out())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0f324988-04bd-4c87-9af0-45abe1ebb6e9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for divisor, present in zip(selected_divisors, selection.get_support()):\n",
-    "    if present:\n",
-    "        print(divisor)\n",
-    "        print(bin(divisor))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f0a40bd9-753e-4bc4-9bc7-f0eb2f96ce7b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X_new = selection.transform(synthetic_X)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "88809007-7b21-4985-83f9-f4cd9247fccf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X_new.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cddc8885-37ad-4225-b83f-4798018f80f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn import tree"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e11a8fc4-0df9-4cdc-a6d3-2d6297b8e085",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "clf = tree.DecisionTreeClassifier()\n",
-    "clf = clf.fit(synthetic_X, synthetic_y)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8e9a91df-845c-4eaa-944e-62d07d7cb1c6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "clf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "21516983-06be-4ad9-91f4-7454eacbf121",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from mrmr import mrmr_classif"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7c2215a5-c073-4118-a21e-e78afb724eda",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "selected_features = mrmr_classif(X=pd.DataFrame(synthetic_X), y=pd.Series(synthetic_y), K=35)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c05c7b33-6a75-4477-97a0-6b70808d0e1e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for selected in selected_features:\n",
-    "    divisor = selected_divisors[selected]\n",
-    "    print(divisor, bin(divisor))"
-   ]
-  },
-  {
    "cell_type": "markdown",
    "id": "f16a5868-e92c-4b84-9f19-664627d9848a",
    "metadata": {},
@@ -604,7 +445,7 @@
    "id": "ed81e076-9ccb-445d-ada9-384b73efb2c5",
    "metadata": {},
    "source": [
-    "### Feature selection using trees\n",
+    "### Feature selection using trees + classification error\n",
     "\n",
     "We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building chooses as the features. However, we can also use more conventional feature selection approaches."
    ]
@@ -1142,7 +983,7 @@
    "id": "a12b75cd-3c62-4b87-a7df-f0c5f7748386",
    "metadata": {},
    "source": [
-    "## Feature selection via JMI"
+    "### Feature selection via JMI"
    ]
   },
   {
author	J08nY	2025-03-24 18:29:35 +0100
committer	J08nY	2025-04-16 12:25:06 +0200
commit	28453a025209a201df20bdbb1c6e4dd39a313fc3 (patch)
tree	5a48f1cf401255a56a48b2b156b7092485e12b95
parent	b51d7a41141a47be39886eb4af61ad39714eaeef (diff)
download	ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.gz ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.zst ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.zip