From 4a8fccbcedf71a7d886fdffab361ece0311079bb Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 27 May 2024 18:20:00 +0200 Subject: [PATCH 1/5] censored quadratic df --- skglm/datafits/__init__.py | 4 +- skglm/datafits/single_task.py | 96 ++++++++++++++++++++++++++++++++++- skglm/demo_censored.py | 19 +++++++ 3 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 skglm/demo_censored.py diff --git a/skglm/datafits/__init__.py b/skglm/datafits/__init__.py index 856b65247..c18a652bf 100644 --- a/skglm/datafits/__init__.py +++ b/skglm/datafits/__init__.py @@ -1,5 +1,7 @@ from .base import BaseDatafit, BaseMultitaskDatafit -from .single_task import Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, Cox +from .single_task import ( + Quadratic, QuadraticSVC, CensoredQuadratic, Logistic, Huber, Poisson, Gamma, Cox, +) from .multi_task import QuadraticMultiTask from .group import QuadraticGroup, LogisticGroup diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 7e8888591..138f07b1d 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -75,7 +75,8 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y): self.Xty[j] = xty def get_global_lipschitz(self, X, y): - return norm(X, ord=2) ** 2 / len(y) + n_samples = X.shape[0] + return norm(X, ord=2) ** 2 / n_samples def get_global_lipschitz_sparse(self, X_data, X_indptr, X_indices, y): return spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 / len(y) @@ -107,6 +108,99 @@ def full_grad_sparse( def intercept_update_step(self, y, Xw): return np.mean(Xw - y) +class CensoredQuadratic(Quadratic): + """Quadratic datafit with access to Xty but not to y. + + The datafit reads: + + .. math:: 1 / (2 xx n_"samples") ||Xw||_2 ^ 2 - 1 / (n_"samples") w^\top (X^\top y) + + Attributes + ---------- + Xty : array, shape (n_features,) + Pre-computed quantity used during the gradient evaluation. + Equal to ``X.T @ y``. + + Note + ---- + The class is jit compiled at fit time using Numba compiler. + This allows for faster computations. + """ + + def __init__(self, Xty): + self.Xty = Xty + + # def get_spec(self): + # spec = ( + # ('Xty', float64[:]), + # ) + # return spec + + # def params_to_dict(self): + # return dict() + + # def get_lipschitz(self, X, y): + + + # lipschitz = np.zeros(n_features, dtype=X.dtype) + # for j in range(n_features): + # lipschitz[j] = (X[:, j] ** 2).sum() / len(y) + + # return lipschitz + + # XXX TODO check without y? or pass y = np.zeros(n_samples) + # def get_lipschitz_sparse(self, X_data, X_indptr, X_indices, y): + # n_features = len(X_indptr) - 1 + # lipschitz = np.zeros(n_features, dtype=X_data.dtype) + + # for j in range(n_features): + # nrm2 = 0. + # for idx in range(X_indptr[j], X_indptr[j + 1]): + # nrm2 += X_data[idx] ** 2 + + # lipschitz[j] = nrm2 / len(y) + + # return lipschitz + + def initialize(self, X, y): + pass + + def initialize_sparse(self, X_data, X_indptr, X_indices, y): + pass + + # def get_global_lipschitz(self, X, y): + # return norm(X, ord=2) ** 2 / len(y) + + # def get_global_lipschitz_sparse(self, X_data, X_indptr, X_indices, y): + # return spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 / len(y) + + def value(self, y, w, Xw): + return np.sum((Xw) ** 2) / (2 * len(Xw)) - w @ self.Xty / len(Xw) + + # def gradient_scalar(self, X, y, w, Xw, j): + # return (X[:, j] @ Xw - self.Xty[j]) / len(Xw) + + # def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j): + # XjTXw = 0. + # for i in range(X_indptr[j], X_indptr[j+1]): + # XjTXw += X_data[i] * Xw[X_indices[i]] + # return (XjTXw - self.Xty[j]) / len(Xw) + + # def full_grad_sparse( + # self, X_data, X_indptr, X_indices, y, Xw): + # n_features = X_indptr.shape[0] - 1 + # n_samples = y.shape[0] + # grad = np.zeros(n_features, dtype=Xw.dtype) + # for j in range(n_features): + # XjTXw = 0. + # for i in range(X_indptr[j], X_indptr[j + 1]): + # XjTXw += X_data[i] * Xw[X_indices[i]] + # grad[j] = (XjTXw - self.Xty[j]) / n_samples + # return grad + + def intercept_update_step(self, y, Xw): + return np.mean(Xw) # MM TODO check + @njit def sigmoid(x): diff --git a/skglm/demo_censored.py b/skglm/demo_censored.py new file mode 100644 index 000000000..f0089b671 --- /dev/null +++ b/skglm/demo_censored.py @@ -0,0 +1,19 @@ +from skglm.datafits import Quadratic, CensoredQuadratic +from skglm.penalties import L1 +from skglm.solvers import AndersonCD +from skglm.utils.jit_compilation import compiled_clone +from skglm.utils.data import make_correlated_data + +X, y, _ = make_correlated_data(100, 150) + +pen = compiled_clone(L1(alpha=0)) + +solver = AndersonCD(verbose=3) +df = Quadratic() +df = compiled_clone(df) + +solver.solve(X, y, df, pen) + + + + From 5c60dcc62ce9db4a8ee319df0de677fb6909e66e Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 27 May 2024 18:23:42 +0200 Subject: [PATCH 2/5] it compiles but it fails --- skglm/datafits/single_task.py | 4 ++-- skglm/demo_censored.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 138f07b1d..9c27a36b1 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -136,8 +136,8 @@ def __init__(self, Xty): # ) # return spec - # def params_to_dict(self): - # return dict() + def params_to_dict(self): + return dict(Xty=self.Xty) # def get_lipschitz(self, X, y): diff --git a/skglm/demo_censored.py b/skglm/demo_censored.py index f0089b671..574df9699 100644 --- a/skglm/demo_censored.py +++ b/skglm/demo_censored.py @@ -1,3 +1,6 @@ +import numpy as np + + from skglm.datafits import Quadratic, CensoredQuadratic from skglm.penalties import L1 from skglm.solvers import AndersonCD @@ -12,8 +15,10 @@ df = Quadratic() df = compiled_clone(df) -solver.solve(X, y, df, pen) - +w = solver.solve(X, y, df, pen)[0] +df2 = CensoredQuadratic(X.T @ y) +df2 = compiled_clone(df2) +w2 = solver.solve(X, np.zeros(X.shape[0]), df2, pen)[0] From 923a8dc77026443652d143a4fb2990c9266601d0 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 27 May 2024 18:24:54 +0200 Subject: [PATCH 3/5] without fit_intercept it works --- skglm/demo_censored.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skglm/demo_censored.py b/skglm/demo_censored.py index 574df9699..c49b2d7fc 100644 --- a/skglm/demo_censored.py +++ b/skglm/demo_censored.py @@ -11,7 +11,7 @@ pen = compiled_clone(L1(alpha=0)) -solver = AndersonCD(verbose=3) +solver = AndersonCD(verbose=3, fit_intercept=False) df = Quadratic() df = compiled_clone(df) @@ -21,4 +21,4 @@ df2 = compiled_clone(df2) w2 = solver.solve(X, np.zeros(X.shape[0]), df2, pen)[0] - +np.testing.assert_allclose(w2, w) \ No newline at end of file From a1f4accaf1bc6f88316f5b33ae67afd3abe769a0 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 27 May 2024 19:16:16 +0200 Subject: [PATCH 4/5] make it work with intercept if one passes y_mean --- skglm/datafits/single_task.py | 18 ++++++++++-------- skglm/demo_censored.py | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 9c27a36b1..eddc1cc08 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -127,17 +127,19 @@ class CensoredQuadratic(Quadratic): This allows for faster computations. """ - def __init__(self, Xty): + def __init__(self, Xty, y_mean): self.Xty = Xty + self.y_mean = y_mean - # def get_spec(self): - # spec = ( - # ('Xty', float64[:]), - # ) - # return spec + def get_spec(self): + spec = ( + ('Xty', float64[:]), + ('y_mean', float64), + ) + return spec def params_to_dict(self): - return dict(Xty=self.Xty) + return dict(Xty=self.Xty, y_mean=self.y_mean) # def get_lipschitz(self, X, y): @@ -199,7 +201,7 @@ def value(self, y, w, Xw): # return grad def intercept_update_step(self, y, Xw): - return np.mean(Xw) # MM TODO check + return np.mean(Xw) - self.y_mean # MM TODO check @njit diff --git a/skglm/demo_censored.py b/skglm/demo_censored.py index c49b2d7fc..2c01194d5 100644 --- a/skglm/demo_censored.py +++ b/skglm/demo_censored.py @@ -11,13 +11,13 @@ pen = compiled_clone(L1(alpha=0)) -solver = AndersonCD(verbose=3, fit_intercept=False) +solver = AndersonCD(verbose=3, fit_intercept=True) df = Quadratic() df = compiled_clone(df) w = solver.solve(X, y, df, pen)[0] -df2 = CensoredQuadratic(X.T @ y) +df2 = CensoredQuadratic(X.T @ y, y.mean()) df2 = compiled_clone(df2) w2 = solver.solve(X, np.zeros(X.shape[0]), df2, pen)[0] From a35453556c0c6cc6c6bd7f8dd45e11b73a2ab768 Mon Sep 17 00:00:00 2001 From: QB3 Date: Thu, 27 Jun 2024 06:40:19 -0400 Subject: [PATCH 5/5] [ci skip] loaded X_csc magenpy data --- skglm/demo_censored.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/skglm/demo_censored.py b/skglm/demo_censored.py index 2c01194d5..e93a3deac 100644 --- a/skglm/demo_censored.py +++ b/skglm/demo_censored.py @@ -21,4 +21,16 @@ df2 = compiled_clone(df2) w2 = solver.solve(X, np.zeros(X.shape[0]), df2, pen)[0] -np.testing.assert_allclose(w2, w) \ No newline at end of file +np.testing.assert_allclose(w2, w) + + +########################################### +# Load the design matrix +bed_file = "../magenpy/magenpy/data/1000G_eur_chr22.bed" +import os.path +import scipy +from pandas_plink import read_plink1_bin + +assert os.path.isfile(bed_file) +X_dask = read_plink1_bin(bed_file, ref="a0", verbose=False) +X_csc = scipy.sparse.csc_matrix(X_dask)