DOC add plot_intuitive

MatthewSZhang committed Oct 14, 2024
1 parent c1abc20 commit 36a9c99
Showing 10 changed files with 368 additions and 198 deletions.
8 changes: 5 additions & 3 deletions README.rst
@@ -33,11 +33,13 @@ FastCan is a feature selection method, which has following advantages:

#. Extremely **fast**. See :ref:`sphx_glr_auto_examples_plot_speed.py`.

#. Support unsupervised feature selection.
#. Support unsupervised feature selection. See :ref:`Unsupervised feature selection <unsupervised>`.

#. Support multioutput feature selection.
#. Support multioutput feature selection. See :ref:`Multioutput feature selection <multioutput>`.

#. Skip redundant features.
#. Skip redundant features. See :ref:`Feature redundancy <redundancy>`.

#. Evaluate relative usefulness of features. See :ref:`sphx_glr_auto_examples_plot_intuitive.py`.


Installation
1 change: 0 additions & 1 deletion doc/conf.py
@@ -42,7 +42,6 @@
"sphinx.ext.intersphinx",
"sphinx_gallery.gen_gallery",
"sphinx_design",
"matplotlib.sphinxext.plot_directive",
]

# List of patterns, relative to source directory, that match files and
1 change: 1 addition & 0 deletions doc/index.rst
@@ -19,6 +19,7 @@ API Reference

FastCan
ssc
ols

Useful Links
------------
16 changes: 0 additions & 16 deletions doc/intuitive.rst

This file was deleted.

1 change: 0 additions & 1 deletion doc/user_guide.rst
@@ -8,7 +8,6 @@ User Guide
:numbered:
:maxdepth: 1

intuitive.rst
unsupervised.rst
multioutput.rst
redundancy.rst
2 changes: 1 addition & 1 deletion examples/plot_affinity.py
@@ -1,6 +1,6 @@
"""
=================
Affine Invariance
Affine invariance
=================
.. currentmodule:: fastcan
3 changes: 1 addition & 2 deletions examples/plot_fisher.py
@@ -23,7 +23,6 @@
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder


X, y = datasets.load_iris(return_X_y=True)
# drop="first" is necessary, otherwise, the transformed target is not full column rank
y_enc = OneHotEncoder(
@@ -40,8 +39,8 @@

import numpy as np
from scipy import linalg
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.covariance import empirical_covariance
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis(solver="eigen").fit(X, y)
Sw = clf.covariance_ # within scatter
190 changes: 190 additions & 0 deletions examples/plot_intuitive.py
@@ -0,0 +1,190 @@
"""
=======================
Intuitive explanation
=======================
.. currentmodule:: fastcan
Let's intuitively understand the two methods, h-correlation and eta-cosine,
in :class:`FastCan`.
"""

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Select the first feature
# ------------------------
# For feature selection, it is normally easy to define a criterion to evaluate a
# feature's usefulness, but it is hard to compute the amount of redundancy between
# a new feature and many selected features. Here we use the ``diabetes`` dataset,
# which has 10 features, as an example. If the R-squared between a feature
# (transformed to the predicted target by a linear regression model) and the target
# is used to describe its usefulness, the results are shown in the following figure.
# It can be seen that Feature 2 is the most useful and Feature 8 is the second most
# useful. However, does that mean that the total usefulness of Feature 2 + Feature 8
# is the sum of their R-squared scores? Probably not, because there may be redundancy
# between Feature 2 and Feature 8. What we actually want is a kind of usefulness score
# with the **superposition** property, so that the usefulness of each feature can be
# added together without redundancy.
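#
# For reference, the R-squared used throughout this example is the ordinary
# coefficient of determination computed by ``LinearRegression.score``:
#
# .. math:: R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},
#
# where :math:`\hat{y}_i` are the fitted values and :math:`\bar{y}` is the mean of
# the target.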


import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

from fastcan import FastCan

plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

def get_r2(feats, target, feats_selected=None):
    """Get R-squared between [feats_selected, feat_i] and target."""
    n_samples, n_features = feats.shape
    if feats_selected is None:
        feats_selected = np.zeros((n_samples, 0))

    lr = LinearRegression()
    r2 = np.zeros(n_features)
    for i in range(n_features):
        feats_i = np.column_stack((feats_selected, feats[:, i]))
        r2[i] = lr.fit(feats_i, target).score(feats_i, target)
    return r2

def plot_bars(ids, r2_left, r2_selected):
    """Plot the relative R-squared with a bar plot."""
    legend_selected = Patch(color="tab:green", label="X_selected")
    legend_cand = Patch(color="tab:blue", label="x_i: candidates")
    legend_best = Patch(color="tab:orange", label="Best candidate")
    n_features = len(ids)
    n_selected = len(r2_selected)

    left = np.zeros(n_features) + sum(r2_selected)
    left_selected = np.cumsum(r2_selected)
    left_selected = np.r_[0, left_selected]
    left_selected = left_selected[:-1]
    left[:n_selected] = left_selected

    label = [""] * n_features
    label[np.argmax(r2_left) + n_selected] = f"{max(r2_left):.5f}"

    colors = ["tab:blue"] * (n_features - n_selected)
    colors[np.argmax(r2_left)] = "tab:orange"
    colors = ["tab:green"] * n_selected + colors

    hbars = plt.barh(ids, width=np.r_[r2_selected, r2_left], color=colors, left=left)
    plt.axvline(x=sum(r2_selected), color="tab:orange", linestyle="--")
    plt.bar_label(hbars, label)
    plt.yticks(np.arange(n_features))
    plt.xlabel("R-squared between [X_selected, x_i] and y")
    plt.ylabel("Feature index")
    plt.legend(handles=[legend_selected, legend_cand, legend_best])
    plt.show()

X, y = load_diabetes(return_X_y=True)


id_left = np.arange(X.shape[1])
id_selected = []
score_selected = []



score_0 = get_r2(X, y)

plot_bars(id_left, score_0, score_selected)


# %%
# Select the second feature
# -------------------------
# Let's compute the R-squared between Feature 2 + Feature i and the target, which is
# shown in the figure below. The bars at the right-hand side (RHS) of the dashed line
# are the additional R-squared scores on top of the score of Feature 2, which we call
# the **relative** usefulness to Feature 2. It can also be seen that the bar of
# Feature 8 in this figure is much shorter than in the previous figure, because the
# redundancy between Feature 2 and Feature 8 has been removed. Therefore, these bars
# at the RHS can serve as the desired usefulness score with the **superposition**
# property.

index = np.argmax(score_0)
id_selected += [id_left[index]]
score_selected += [score_0[index]]
id_left = np.delete(id_left, index)
score_1 = get_r2(X[:, id_left], y, X[:, id_selected])-sum(score_selected)


plot_bars(np.r_[id_selected, id_left], score_1, score_selected)



# %%
# Select the third feature
# ------------------------
# Again, let's compute the R-squared between Feature 2 + Feature 8 + Feature i and
# the target; the additional R-squared contributed by each of the remaining features
# is shown in the following figure. It can be seen that after selecting Features 2
# and 8, the remaining features provide only a very limited contribution.

index = np.argmax(score_1)
id_selected += [id_left[index]]
score_selected += [score_1[index]]
id_left = np.delete(id_left, index)
score_2 = get_r2(X[:, id_left], y, X[:, id_selected])-sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_2, score_selected)



# %%
# h-correlation and eta-cosine
# ----------------------------
# ``h-correlation`` is a fast way to compute the values of the bars at the RHS of
# the dashed lines. The fast computational speed is achieved by orthogonalization,
# which removes the redundancy between the features. We first use orthogonalization
# to make the remaining features orthogonal to the selected features and then compute
# their additional R-squared values. ``eta-cosine`` uses a similar idea, but has an
# additional preprocessing step which compresses the features
# :math:`X \in \mathbb{R}^{N\times n}` and the target
# :math:`Y \in \mathbb{R}^{N\times m}` to :math:`X_c \in \mathbb{R}^{(m+n)\times n}`
# and :math:`Y_c \in \mathbb{R}^{(m+n)\times m}`.

scores = FastCan(3, verbose=0).fit(X, y).scores_

print(f"First selected feature's score: {scores[0]:.5f}")
print(f"Second selected feature's score: {scores[1]:.5f}")
print(f"Third selected feature's score: {scores[2]:.5f}")

# %%
# Relative usefulness
# -------------------
# The idea of relative usefulness can be very helpful when we want to evaluate
# features based on some prior knowledge. For example, suppose we have some magnetic
# impedance spectroscopy (MIS) features of cervix tissue in pregnant women and we
# want to evaluate the usefulness of these features for predicting spontaneous
# preterm births (sPTB). The prior knowledge is that cervical length (CL) and
# quantitative fetal fibronectin (fFN) are effective risk factors for sPTB, so the
# redundancy between CL + fFN and the MIS features should be avoided. Therefore, the
# relative usefulness of the MIS features to CL and fFN should be computed. We can
# use the argument ``indices_include`` to compute the relative usefulness. Using the
# ``diabetes`` dataset as an example and assuming the prior knowledge is that
# Feature 3 is very important, the relative usefulness of the remaining features to
# Feature 3 is given in the figure below, which is the same as the result from
# :class:`FastCan`.

index = 3
id_selected = [index]
score_selected = [score_0[index]]
id_left = np.arange(X.shape[1])
id_left = np.delete(id_left, index)
score_1_7 = get_r2(X[:, id_left], y, X[:, id_selected])-sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_1_7, score_selected)

scores = FastCan(2, indices_include=[3], verbose=0).fit(X, y).scores_

print(f"First selected feature's score: {scores[0]:.5f}")
print(f"Second selected feature's score: {scores[1]:.5f}")