From 6b789d39b1f9859a7efac9bf7e6b539bb584acbd Mon Sep 17 00:00:00 2001
From: RobinVogel
Date: Thu, 14 Nov 2019 17:58:23 +0100
Subject: [PATCH 1/8] maj

---
 test_components.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 test_components.py

diff --git a/test_components.py b/test_components.py
new file mode 100644
index 00000000..6edf73db
--- /dev/null
+++ b/test_components.py
@@ -0,0 +1,21 @@
+import numpy as np
+import pytest
+from numpy.linalg import LinAlgError
+from scipy.stats import ortho_group
+
+rng = np.random.RandomState(42)
+
+# an orthonormal matrix useful for creating matrices with given
+# eigenvalues:
+P = ortho_group.rvs(7, random_state=rng)
+
+# matrix with a determinant still high but which should be considered as a
+# non-definite matrix (to check we don't test the definiteness with the
+# determinant which is a bad strategy)
+M = np.diag([1e5, 1e5, 1e5, 1e5, 1e5, 1e5, 1e-20])
+M = P.dot(M).dot(P.T)
+assert np.abs(np.linalg.det(M)) > 10
+assert np.linalg.slogdet(M)[1] > 1  # (just to show that the computed
+# determinant is far from null)
+with pytest.raises(LinAlgError) as err_msg:
+  np.linalg.cholesky(M)

From 275c69a8493dfba872aaf2db6dbaad1acbd7c4e0 Mon Sep 17 00:00:00 2001
From: RobinVogel
Date: Thu, 28 Nov 2019 17:01:07 +0100
Subject: [PATCH 2/8] added fit checks

---
 metric_learn/base_metric.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index 6feccc72..f238ccd2 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -215,6 +215,7 @@ def score_pairs(self, pairs):
     :ref:`mahalanobis_distances` : The section of the project documentation
       that describes Mahalanobis Distances.
     """
+    check_is_fitted(self, 'components_')
     pairs = check_input(pairs, type_of_inputs='tuples',
                         preprocessor=self.preprocessor_,
                         estimator=self, tuple_size=2)
@@ -240,12 +241,15 @@ def transform(self, X):
     X_embedded : `numpy.ndarray`, shape=(n_samples, n_components)
       The embedded data points.
     """
+    check_is_fitted(self, 'components_')
+
     X_checked = check_input(X, type_of_inputs='classic', estimator=self,
                             preprocessor=self.preprocessor_,
                             accept_sparse=True)
     return X_checked.dot(self.components_.T)

   def get_metric(self):
+    check_is_fitted(self, 'components_')
     components_T = self.components_.T.copy()

     def metric_fun(u, v, squared=False):
@@ -285,6 +289,7 @@ def metric(self):
     """Deprecated. Will be removed in v0.6.0. Use `get_mahalanobis_matrix`
     instead"""
     # TODO: remove this method in version 0.6.0
+    check_is_fitted(self, 'components_')
     warnings.warn(("`metric` is deprecated since version 0.5.0 and will be "
                    "removed in 0.6.0. Use `get_mahalanobis_matrix` instead."),
                   DeprecationWarning)
@@ -298,6 +303,7 @@ def get_mahalanobis_matrix(self):
     M : `numpy.ndarray`, shape=(n_features, n_features)
       The copy of the learned Mahalanobis matrix.
     """
+    check_is_fitted(self, 'components_')
     return self.components_.T.dot(self.components_)


@@ -357,6 +363,7 @@ def decision_function(self, pairs):
     y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,)
       The predicted decision function value for each pair.
     """
+    check_is_fitted(self, 'components_')
     pairs = check_input(pairs, type_of_inputs='tuples',
                         preprocessor=self.preprocessor_,
                         estimator=self, tuple_size=self._tuple_size)
@@ -628,6 +635,7 @@ def decision_function(self, quadruplets):
     decision_function : `numpy.ndarray` of floats, shape=(n_constraints,)
       Metric differences.
""" + check_is_fitted(self, 'components_') quadruplets = check_input(quadruplets, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) From 1c28b5663dc5f6e9d2f002bda93c4dc9072471cd Mon Sep 17 00:00:00 2001 From: RobinVogel Date: Thu, 28 Nov 2019 17:02:31 +0100 Subject: [PATCH 3/8] maj --- test_components.py | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 test_components.py diff --git a/test_components.py b/test_components.py deleted file mode 100644 index 6edf73db..00000000 --- a/test_components.py +++ /dev/null @@ -1,21 +0,0 @@ -import numpy as np -import pytest -from numpy.linalg import LinAlgError -from scipy.stats import ortho_group - -rng = np.random.RandomState(42) - -# an orthonormal matrix useful for creating matrices with given -# eigenvalues: -P = ortho_group.rvs(7, random_state=rng) - -# matrix with a determinant still high but which should be considered as a -# non-definite matrix (to check we don't test the definiteness with the -# determinant which is a bad strategy) -M = np.diag([1e5, 1e5, 1e5, 1e5, 1e5, 1e5, 1e-20]) -M = P.dot(M).dot(P.T) -assert np.abs(np.linalg.det(M)) > 10 -assert np.linalg.slogdet(M)[1] > 1 # (just to show that the computed -# determinant is far from null) -with pytest.raises(LinAlgError) as err_msg: - np.linalg.cholesky(M) From 76ffccb9e578cf8220177de801a24451810a1b8d Mon Sep 17 00:00:00 2001 From: RobinVogel Date: Thu, 28 Nov 2019 17:09:15 +0100 Subject: [PATCH 4/8] Added checks that the function was fitted. --- metric_learn/base_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index f238ccd2..707b9d8b 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -242,7 +242,6 @@ def transform(self, X): The embedded data points. """ check_is_fitted(self, 'components_') - X_checked = check_input(X, type_of_inputs='classic', estimator=self, preprocessor=self.preprocessor_, accept_sparse=True) From 340ac69aecfb8aace5376794edaa41f074deed1d Mon Sep 17 00:00:00 2001 From: RobinVogel Date: Thu, 28 Nov 2019 17:58:42 +0100 Subject: [PATCH 5/8] Wrote a semi-supervised-rca. --- metric_learn/rca.py | 92 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 2a9ab1e8..839cebcb 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -62,7 +62,7 @@ class RCA(MahalanobisMixin, TransformerMixin): Examples -------- - >>> from metric_learn import RCA_Supervised + >>> from metric_learn import RCA_SemiSupervised >>> from sklearn.datasets import load_iris >>> iris_data = load_iris() >>> X = iris_data['data'] @@ -108,7 +108,7 @@ def fit(self, X, chunks): Parameters ---------- - data : (n x d) data matrix + X : (n x d) data matrix Each row corresponds to a single instance chunks : (n,) array of ints When ``chunks[i] == -1``, point i doesn't belong to any chunklet. @@ -242,3 +242,91 @@ def fit(self, X, y, random_state='deprecated'): chunk_size=self.chunk_size, random_state=self.random_state) return RCA.fit(self, X, chunks) + + +class RCA_SemiSupervised(RCA): + """Semi-Supervised version of Relevant Components Analysis (RCA) + + `RCA_SemiSupervised` combines data in the form of chunks with + data in the form of labeled points that goes through the same + process as in `RCA_SemiSupervised`. 
+
+  Parameters
+  ----------
+  n_components : int or None, optional (default=None)
+    Dimensionality of reduced space (if None, defaults to dimension of X).
+
+  num_dims : Not used
+
+    .. deprecated:: 0.5.0
+      `num_dims` was deprecated in version 0.5.0 and will
+      be removed in 0.6.0. Use `n_components` instead.
+
+  num_chunks: int, optional
+
+  chunk_size: int, optional
+
+  preprocessor : array-like, shape=(n_samples, n_features) or callable
+    The preprocessor to call to get tuples from indices. If array-like,
+    tuples will be formed like this: X[indices].
+
+  random_state : int or numpy.RandomState or None, optional (default=None)
+    A pseudo random number generator object or a seed for it if int.
+    It is used to randomly sample constraints from labels.
+
+  Attributes
+  ----------
+  components_ : `numpy.ndarray`, shape=(n_components, n_features)
+    The learned linear transformation ``L``.
+  """
+
+  def __init__(self, num_dims='deprecated', n_components=None,
+               pca_comps='deprecated', num_chunks=100, chunk_size=2,
+               preprocessor=None, random_state=None):
+    """Initialize the semi-supervised version of `RCA`."""
+    RCA.__init__(self, num_dims=num_dims, n_components=n_components,
+                 pca_comps=pca_comps, preprocessor=preprocessor)
+    self.num_chunks = num_chunks
+    self.chunk_size = chunk_size
+    self.random_state = random_state
+
+  def fit(self, X, y, X_u, chunks,
+          random_state='deprecated'):
+    """Create constraints from labels and learn the RCA model.
+    Needs num_chunks and chunk_size specified in constructor.
+
+    Parameters
+    ----------
+    X : (n x d) labeled data matrix
+      each row corresponds to a single instance
+    y : (n) data labels
+    X_u : (n x d) unlabeled data matrix
+    chunks : (n,) array of ints
+      When ``chunks[i] == -1``, point i doesn't belong to any chunklet.
+      When ``chunks[i] == j``, point i belongs to chunklet j.
+    random_state : Not used
+      .. deprecated:: 0.5.0
+        `random_state` in the `fit` function was deprecated in version 0.5.0
+        and will be removed in 0.6.0. Set `random_state` at initialization
+        instead (when instantiating a new `RCA_SemiSupervised` object).
+    """
+    if random_state != 'deprecated':
+      warnings.warn('"random_state" parameter in the `fit` function is '
+                    'deprecated. Set `random_state` at initialization '
+                    'instead (when instantiating a new `RCA_SemiSupervised` '
+                    'object).', DeprecationWarning)
+    else:
+      warnings.warn('As of v0.5.0, `RCA_SemiSupervised` now uses the '
+                    '`random_state` given at initialization to sample '
+                    'constraints, not the default `np.random` from the `fit` '
+                    'method, since this argument is now deprecated. '
+                    'This warning will disappear in v0.6.0.',
+                    ChangedBehaviorWarning)
+    X, y = self._prepare_inputs(X, y, ensure_min_samples=2)
+    sup_chunks = Constraints(y).chunks(num_chunks=self.num_chunks,
+                                       chunk_size=self.chunk_size,
+                                       random_state=self.random_state)
+    X_tot = np.concatenate([X, X_u])
+    chunks_tot = np.concatenate([sup_chunks, chunks])
+
+    return RCA.fit(self, X_tot, chunks_tot)

From 36694f67c629022cbdc3b6c8c713c5ce3b5e44b6 Mon Sep 17 00:00:00 2001
From: RobinVogel
Date: Thu, 28 Nov 2019 18:50:07 +0100
Subject: [PATCH 6/8] added a very simple test

---
 metric_learn/__init__.py  |  5 +++--
 test/metric_learn_test.py | 13 +++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py
index b036ccfa..9a0268fa 100644
--- a/metric_learn/__init__.py
+++ b/metric_learn/__init__.py
@@ -8,7 +8,7 @@
 from .sdml import SDML, SDML_Supervised
 from .nca import NCA
 from .lfda import LFDA
-from .rca import RCA, RCA_Supervised
+from .rca import RCA, RCA_Supervised, RCA_SemiSupervised
 from .mlkr import MLKR
 from .mmc import MMC, MMC_Supervised

@@ -17,4 +17,5 @@
 __all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised',
            'LMNN', 'LSML', 'LSML_Supervised', 'SDML',
            'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised',
-           'MLKR', 'MMC', 'MMC_Supervised', '__version__']
+           'RCA_SemiSupervised', 'MLKR', 'MMC', 'MMC_Supervised',
+           '__version__']

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index f713a059..6a34402b 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -23,9 +23,9 @@
 from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC,
                           LSML_Supervised, ITML_Supervised, SDML_Supervised,
                           RCA_Supervised, MMC_Supervised, SDML, RCA, ITML,
-                          LSML)
+                          LSML, RCA_SemiSupervised)
 # Import this specially for testing.
-from metric_learn.constraints import wrap_pairs
+from metric_learn.constraints import wrap_pairs, Constraints
 from metric_learn.lmnn import _sum_outer_products


@@ -1136,6 +1136,15 @@ def test_changed_behaviour_warning_random_state(self):
       rca_supervised.fit(X, y)
     assert any(msg == str(wrn.message) for wrn in raised_warning)

+  def test_semi_supervised(self):
+    n = 100
+    X, y = make_classification(random_state=42, n_samples=2 * n)
+    rca_semisupervised = RCA_SemiSupervised(num_chunks=20)
+    cons = Constraints(y[n:])
+    chunks = cons.chunks(num_chunks=20)
+    rca_semisupervised.fit(X[:n], y[:n],
+                           X[n:], chunks)
+

 @pytest.mark.parametrize('num_dims', [None, 2])
 def test_deprecation_num_dims_rca(num_dims):

From 77fb53a3429bd956527bb5a3ef567ec0d780c474 Mon Sep 17 00:00:00 2001
From: RobinVogel
Date: Thu, 28 Nov 2019 18:55:28 +0100
Subject: [PATCH 7/8] typos

---
 metric_learn/base_metric.py | 7 -------
 metric_learn/rca.py         | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index 707b9d8b..6feccc72 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -215,7 +215,6 @@ def score_pairs(self, pairs):
     :ref:`mahalanobis_distances` : The section of the project documentation
       that describes Mahalanobis Distances.
     """
-    check_is_fitted(self, 'components_')
     pairs = check_input(pairs, type_of_inputs='tuples',
                         preprocessor=self.preprocessor_,
                         estimator=self, tuple_size=2)
@@ -241,14 +240,12 @@ def transform(self, X):
     X_embedded : `numpy.ndarray`, shape=(n_samples, n_components)
       The embedded data points.
""" - check_is_fitted(self, 'components_') X_checked = check_input(X, type_of_inputs='classic', estimator=self, preprocessor=self.preprocessor_, accept_sparse=True) return X_checked.dot(self.components_.T) def get_metric(self): - check_is_fitted(self, 'components_') components_T = self.components_.T.copy() def metric_fun(u, v, squared=False): @@ -288,7 +285,6 @@ def metric(self): """Deprecated. Will be removed in v0.6.0. Use `get_mahalanobis_matrix` instead""" # TODO: remove this method in version 0.6.0 - check_is_fitted(self, 'components_') warnings.warn(("`metric` is deprecated since version 0.5.0 and will be " "removed in 0.6.0. Use `get_mahalanobis_matrix` instead."), DeprecationWarning) @@ -302,7 +298,6 @@ def get_mahalanobis_matrix(self): M : `numpy.ndarray`, shape=(n_features, n_features) The copy of the learned Mahalanobis matrix. """ - check_is_fitted(self, 'components_') return self.components_.T.dot(self.components_) @@ -362,7 +357,6 @@ def decision_function(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted decision function value for each pair. """ - check_is_fitted(self, 'components_') pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) @@ -634,7 +628,6 @@ def decision_function(self, quadruplets): decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ - check_is_fitted(self, 'components_') quadruplets = check_input(quadruplets, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 839cebcb..93292723 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -62,7 +62,7 @@ class RCA(MahalanobisMixin, TransformerMixin): Examples -------- - >>> from metric_learn import RCA_SemiSupervised + >>> from metric_learn import RCA_Supervised >>> from sklearn.datasets import load_iris >>> iris_data = load_iris() >>> X = iris_data['data'] From b3445c50a4242ef511ab4ee516017bcdeb7aac67 Mon Sep 17 00:00:00 2001 From: RobinVogel Date: Fri, 29 Nov 2019 11:48:11 +0100 Subject: [PATCH 8/8] test cov correction --- test/metric_learn_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 6a34402b..20c94f46 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -1144,6 +1144,8 @@ def test_semi_supervised(self): chunks = cons.chunks(num_chunks=20) rca_semisupervised.fit(X[:n], y[:n], X[n:], chunks) + rca_semisupervised.fit(X[:n], y[:n], + X[n:], chunks, random_state=42) @pytest.mark.parametrize('num_dims', [None, 2])