diff options
| author | J08nY | 2025-03-24 18:29:35 +0100 |
|---|---|---|
| committer | J08nY | 2025-04-16 12:25:06 +0200 |
| commit | 28453a025209a201df20bdbb1c6e4dd39a313fc3 (patch) | |
| tree | 5a48f1cf401255a56a48b2b156b7092485e12b95 | |
| parent | b51d7a41141a47be39886eb4af61ad39714eaeef (diff) | |
| download | ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.gz ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.tar.zst ECTester-28453a025209a201df20bdbb1c6e4dd39a313fc3.zip | |
| -rw-r--r-- | epare/distinguish.ipynb | 173 |
1 files changed, 7 insertions, 166 deletions
diff --git a/epare/distinguish.ipynb b/epare/distinguish.ipynb index 6f5fe5c..0ca42c7 100644 --- a/epare/distinguish.ipynb +++ b/epare/distinguish.ipynb @@ -106,7 +106,7 @@ "source": [ "selected_mults = all_mults # distributions_mults.keys()\n", "divisor_name = \"all\"\n", - "kind = \"precomp+necessary\"\n", + "kind = \"all\"\n", "selected_divisors = divisor_map[divisor_name]" ] }, @@ -261,7 +261,9 @@ "cell_type": "code", "execution_count": null, "id": "de577429-d87c-4967-be17-75cbb378860c", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "print(tree.render_basic())" @@ -344,7 +346,7 @@ "id": "62d2f2a2-495e-459d-b0e2-89c9a5973b1e", "metadata": {}, "source": [ - "### Feature selection using trees\n", + "### Feature selection using trees + classification error\n", "\n", "We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building choses as the features. However, we can also use more conventional feature selection approaches." ] @@ -431,167 +433,6 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "1e8440f3-f856-41e0-8d37-56b750e1309d", - "metadata": {}, - "outputs": [], - "source": [ - "# Lets pick n as if we were doing the reversing\n", - "n = 100\n", - "# Lets pick m as the number of repeats\n", - "m = 100\n", - "# then for each mult and each divisor (thus each point) do binom(n, p) m times, save this synthetic data\n", - "nmults = len(distributions_mults)\n", - "ndivs = len(selected_divisors)\n", - "base_X = np.zeros((nmults, ndivs))\n", - "base_y = np.zeros(nmults)\n", - "synthetic_X = np.zeros((nmults * m, ndivs))\n", - "synthetic_y = np.zeros(nmults * m)\n", - "for i, (mult, probmap) in enumerate(distributions_mults.items()):\n", - " for j, divisor in enumerate(selected_divisors):\n", - " p = probmap[divisor]\n", - " r = binom.rvs(n, p, size=m) / n\n", - " synthetic_X[i*m:(i+1)*m, j] = r\n", - " base_X[i, j] = p\n", - " synthetic_y[i*m:(i+1)*m] = i\n", - " base_y[i] = i\n", - "print(synthetic_X)\n", - "# so we have !mults! classes and !mults! * m samples\n", - "# on this synthetic data we can run whatever" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6396296e-9352-4599-8ee9-45f9b4f4ce70", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.feature_selection import SelectKBest, SelectFdr, SelectFpr, SelectFwe, SequentialFeatureSelector\n", - "from sklearn.feature_selection import f_classif, mutual_info_classif, chi2\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "\n", - "from sklearn.datasets import load_iris" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d7050a1-b1ef-4eed-a885-cc11d8703b24", - "metadata": {}, - "outputs": [], - "source": [ - "selection = SelectKBest(f_classif, k=10).fit(synthetic_X, synthetic_y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "393d9d99-67e1-4d0a-b4ad-a0adcd6491d8", - "metadata": {}, - "outputs": [], - "source": [ - "len(selection.get_feature_names_out())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f324988-04bd-4c87-9af0-45abe1ebb6e9", - "metadata": {}, - "outputs": [], - "source": [ - "for divisor, present in zip(selected_divisors, selection.get_support()):\n", - " if present:\n", - " print(divisor)\n", - " print(bin(divisor))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0a40bd9-753e-4bc4-9bc7-f0eb2f96ce7b", - "metadata": {}, - "outputs": [], - "source": [ - "X_new = selection.transform(synthetic_X)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88809007-7b21-4985-83f9-f4cd9247fccf", - "metadata": {}, - "outputs": [], - "source": [ - "X_new.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cddc8885-37ad-4225-b83f-4798018f80f3", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import tree" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e11a8fc4-0df9-4cdc-a6d3-2d6297b8e085", - "metadata": {}, - "outputs": [], - "source": [ - "clf = tree.DecisionTreeClassifier()\n", - "clf = clf.fit(synthetic_X, synthetic_y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e9a91df-845c-4eaa-944e-62d07d7cb1c6", - "metadata": {}, - "outputs": [], - "source": [ - "clf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21516983-06be-4ad9-91f4-7454eacbf121", - "metadata": {}, - "outputs": [], - "source": [ - "from mrmr import mrmr_classif" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c2215a5-c073-4118-a21e-e78afb724eda", - "metadata": {}, - "outputs": [], - "source": [ - "selected_features = mrmr_classif(X=pd.DataFrame(synthetic_X), y=pd.Series(synthetic_y), K=35)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c05c7b33-6a75-4477-97a0-6b70808d0e1e", - "metadata": {}, - "outputs": [], - "source": [ - "for selected in selected_features:\n", - " divisor = selected_divisors[selected]\n", - " print(divisor, bin(divisor))" - ] - }, - { "cell_type": "markdown", "id": "f16a5868-e92c-4b84-9f19-664627d9848a", "metadata": {}, @@ -604,7 +445,7 @@ "id": "ed81e076-9ccb-445d-ada9-384b73efb2c5", "metadata": {}, "source": [ - "### Feature selection using trees\n", + "### Feature selection using trees + classification error\n", "\n", "We can reuse the clustering + tree building approach above and just take the inputs that the greedy tree building choses as the features. However, we can also use more conventional feature selection approaches." ] @@ -1142,7 +983,7 @@ "id": "a12b75cd-3c62-4b87-a7df-f0c5f7748386", "metadata": {}, "source": [ - "## Feature selection via JMI" + "### Feature selection via JMI" ] }, { |
