Commit 36a9c99 (parent c1abc20): 10 changed files with 368 additions and 198 deletions.
@@ -19,6 +19,7 @@ API Reference

   FastCan
   ssc
   ols

Useful Links
------------
This file was deleted.
@@ -8,7 +8,6 @@ User Guide

   :numbered:
   :maxdepth: 1

   intuitive.rst
   unsupervised.rst
   multioutput.rst
   redundancy.rst
@@ -1,6 +1,6 @@
 """
 =================
-Affine Invariance
+Affine invariance
 =================
 .. currentmodule:: fastcan
@@ -0,0 +1,190 @@
""" | ||
======================= | ||
Intuitively explanation | ||
======================= | ||
.. currentmodule:: fastcan | ||
Let's intuitively understand the two methods, h-correlation and eta-cosine, | ||
in :class:`FastCan`. | ||
""" | ||

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Select the first feature
# ------------------------
# For feature selection, it is normally easy to define a criterion to evaluate a
# feature's usefulness, but it is hard to compute the amount of redundancy between
# a new feature and many selected features. Here we use the ``diabetes`` dataset,
# which has 10 features, as an example. If we use the R-squared between a feature
# (transformed to the predicted target by a linear regression model) and the
# target to describe the feature's usefulness, the results are shown in the
# following figure. It can be seen that Feature 2 is the most useful and Feature 8
# is the second most useful. However, does that mean that the total usefulness of
# Feature 2 + Feature 8 is the sum of their R-squared scores? Probably not,
# because there may be redundancy between Feature 2 and Feature 8. Actually, what
# we want is a kind of usefulness score which has the **superposition** property,
# so that the usefulness of each feature can be added together without redundancy.

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

from fastcan import FastCan

plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

def get_r2(feats, target, feats_selected=None):
    """Get R-squared between [feats_selected, feat_i] and target."""
    n_samples, n_features = feats.shape
    if feats_selected is None:
        feats_selected = np.zeros((n_samples, 0))

    lr = LinearRegression()
    r2 = np.zeros(n_features)
    for i in range(n_features):
        feats_i = np.column_stack((feats_selected, feats[:, i]))
        r2[i] = lr.fit(feats_i, target).score(feats_i, target)
    return r2

def plot_bars(ids, r2_left, r2_selected):
    """Plot the relative R-squared with a bar plot."""
    legend_selected = Patch(color='tab:green', label='X_selected')
    legend_cand = Patch(color='tab:blue', label='x_i: candidates')
    legend_best = Patch(color='tab:orange', label='Best candidate')
    n_features = len(ids)
    n_selected = len(r2_selected)

    # Selected features are stacked cumulatively from zero; all candidate bars
    # start at the total score of the selected features (the dashed line).
    left = np.zeros(n_features) + sum(r2_selected)
    left_selected = np.cumsum(r2_selected)
    left_selected = np.r_[0, left_selected]
    left_selected = left_selected[:-1]
    left[:n_selected] = left_selected

    label = [""] * n_features
    label[np.argmax(r2_left) + n_selected] = f"{max(r2_left):.5f}"

    colors = ["tab:blue"] * (n_features - n_selected)
    colors[np.argmax(r2_left)] = "tab:orange"
    colors = ["tab:green"] * n_selected + colors

    # Use the function argument rather than the global ``score_selected``.
    hbars = plt.barh(ids, width=np.r_[r2_selected, r2_left], color=colors, left=left)
    plt.axvline(x=sum(r2_selected), color='tab:orange', linestyle="--")
    plt.bar_label(hbars, label)
    plt.yticks(np.arange(n_features))
    plt.xlabel("R-squared between [X_selected, x_i] and y")
    plt.ylabel("Feature index")
    plt.legend(handles=[legend_selected, legend_cand, legend_best])
    plt.show()

X, y = load_diabetes(return_X_y=True)

id_left = np.arange(X.shape[1])
id_selected = []
score_selected = []

score_0 = get_r2(X, y)

plot_bars(id_left, score_0, score_selected)

# %%
# Select the second feature
# -------------------------
# Let's compute the R-squared between Feature 2 + Feature i and the target, which
# is shown in the figure below. The bars at the right-hand side (RHS) of the
# dashed line are the additional R-squared scores on top of Feature 2's score,
# which we call the **relative** usefulness to Feature 2. It can also be seen
# that the bar of Feature 8 in this figure is much shorter than in the previous
# figure, because the redundancy between Feature 2 and Feature 8 has been
# removed. Therefore, these bars at the RHS can serve as the desired usefulness
# score with the **superposition** property.

index = np.argmax(score_0)
id_selected += [id_left[index]]
score_selected += [score_0[index]]
id_left = np.delete(id_left, index)
score_1 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_1, score_selected)

# %%
# Select the third feature
# ------------------------
# Again, let's compute the R-squared between Feature 2 + Feature 8 + Feature i
# and the target; the additional R-squared contributed by the rest of the
# features is shown in the following figure. It can be seen that after selecting
# Features 2 and 8, the rest of the features can only provide a very limited
# contribution.

index = np.argmax(score_1)
id_selected += [id_left[index]]
score_selected += [score_1[index]]
id_left = np.delete(id_left, index)
score_2 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_2, score_selected)

# %%
# h-correlation and eta-cosine
# ----------------------------
# ``h-correlation`` is a fast way to compute the value of the bars at the RHS of
# the dashed lines. The fast computational speed is achieved by
# orthogonalization, which removes the redundancy between the features. We first
# use orthogonalization to make the rest of the features orthogonal to the
# selected features, and then compute their additional R-squared values.
# ``eta-cosine`` uses a similar idea, but has an additional preprocessing step to
# compress the features :math:`X \in \mathbb{R}^{N\times n}` and the target
# :math:`Y \in \mathbb{R}^{N\times m}` to :math:`X_c \in \mathbb{R}^{(m+n)\times n}`
# and :math:`Y_c \in \mathbb{R}^{(m+n)\times m}`.

scores = FastCan(3, verbose=0).fit(X, y).scores_

print(f"First selected feature's score: {scores[0]:.5f}")
print(f"Second selected feature's score: {scores[1]:.5f}")
print(f"Third selected feature's score: {scores[2]:.5f}")

# %%
# Relative usefulness
# -------------------
# The idea of relative usefulness can be very helpful when we want to evaluate
# features based on some prior knowledge. For example, suppose we have some
# magnetic impedance spectroscopy (MIS) features of cervix tissue in pregnant
# women and we want to evaluate the usefulness of these features for predicting
# spontaneous preterm births (sPTB). The prior knowledge is that cervical length
# (CL) and quantitative fetal fibronectin (fFN) are effective risk factors for
# sPTB, so the redundancy between CL+fFN and the MIS features should be avoided.
# Therefore, the relative usefulness of the MIS features to CL and fFN should be
# computed. We can use the argument ``indices_include`` to compute the relative
# usefulness. Using the ``diabetes`` dataset as an example, and assuming the
# prior knowledge is that Feature 3 is very important, the relative usefulness
# of the remaining features to Feature 3 is given in the figure below, which is
# the same as the result from :class:`FastCan`.

index = 3
id_selected = [index]
score_selected = [score_0[index]]
id_left = np.arange(X.shape[1])
id_left = np.delete(id_left, index)
score_1_7 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_1_7, score_selected)

scores = FastCan(2, indices_include=[3], verbose=0).fit(X, y).scores_

print(f"First selected feature's score: {scores[0]:.5f}")
print(f"Second selected feature's score: {scores[1]:.5f}")