Example of optimizing lowess fit and max and redundant set selection

import os,errno
import dRFEtools
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

def mkdir_p(directory):
    """Create ``directory`` and any missing parents, like ``mkdir -p``.

    Calling this on a directory that already exists is a no-op. A path
    that exists but is *not* a directory raises instead of being silently
    accepted, so the problem surfaces here rather than on a later write.

    Parameters
    ----------
    directory : str or path-like
        Directory path to create.

    Raises
    ------
    FileExistsError
        If ``directory`` exists and is not a directory.
    OSError
        For any other failure reported by ``os.makedirs``.
    """
    os.makedirs(directory, exist_ok=True)

Optimize classification

# Results for the binary-classification example are written here
outdir = 'classification/'
mkdir_p(outdir)

# Simulate a binary dataset: 500 samples x 20,000 features, of which
# 100 are informative and 300 are redundant (see the arguments below)
X, y = make_classification(
    n_samples=500, n_features=20000, n_informative=100, n_redundant=300,
    n_repeated=0, n_classes=2, n_clusters_per_class=1, random_state=13
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
# Alternative estimator:
#cla = dRFEtools.LogisticRegression(max_iter=1000, n_jobs=-1)
cla = dRFEtools.RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                       oob_score=True, random_state=13)

# Step through every split; once the loop finishes, `fold` and the
# train/test arrays hold the values of the LAST split, which is the
# single fold examined in the rest of this example.
for fold, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Synthetic feature names, one per column of the training matrix
features = ["feature_%d" % x for x in range(X_train.shape[1])]
# Run the random-forest feature elimination on the selected fold
d, pfirst = dRFEtools.rf_rfe(cla, X_train, y_train, np.array(features),
                             fold, outdir, elimination_rate=0.1, RANK=False)

Optimize lowess fit: fraction of data to use when estimating y-values

## Single Fold examination
# One optimize_lowess_plot call per candidate lowess fraction; the step
# size is held fixed at 0.02 and each plot is saved (save_plot=True).
for frac in (0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=frac, step_size=0.02, classify=True, save_plot=True,
    )

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

Optimize redundant selection: examine rate change

# Hold the lowess fraction at 0.3 and vary only the step size,
# saving one plot per candidate value.
for step_size in (0.01, 0.02, 0.03, 0.04, 0.05, 0.10):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.3, step_size=step_size, classify=True, save_plot=True,
    )

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

## Highest value (without smoothing)
# Key of `d` whose entry has the largest value at index 1
max(d, key=lambda key: d[key][1])

# Lowess-based maximum at the fraction chosen from the plots above
print(dRFEtools.extract_max_lowess(d, frac=0.30))
## Using a conservative step size
dRFEtools.extract_redundant_lowess(
    d, frac=0.30, step_size=0.02,
)

(89, 4.4942386252808095)





(291, 5.675040005790547)

Optimize regression

# Results for the regression example are written here
outdir = 'regression/'
mkdir_p(outdir)

# Simulate a regression dataset: 500 samples x 20,000 features, of which
# 400 are informative (see the arguments below)
X, y = make_regression(
    n_samples=500, n_features=20000, n_informative=400, bias=0.02,
    n_targets=1, noise=5, random_state=13
)
cv = KFold(n_splits=5, shuffle=True, random_state=13)
regr = dRFEtools.RandomForestRegressor(n_estimators=100, oob_score=True,
                                       n_jobs=-1, random_state=13)

# Step through every split; once the loop finishes, `fold` and the
# train/test arrays hold the values of the LAST split, which is the
# single fold examined in the rest of this example.
for fold, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Synthetic feature names, one per column of the training matrix
features = ["feature_%d" % x for x in range(X_train.shape[1])]
# Run the random-forest feature elimination on the selected fold
d, pfirst = dRFEtools.rf_rfe(regr, X_train, y_train, np.array(features),
                             fold, outdir, elimination_rate=0.1, RANK=False)

Optimize lowess fit: fraction of data to use when estimating y-values

## Single Fold examination
# One optimize_lowess_plot call per candidate lowess fraction; the step
# size is held fixed at 0.02 and each plot is saved (save_plot=True).
for frac in (0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=frac, step_size=0.02, classify=False, save_plot=True,
    )

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

Optimize redundant selection: examine rate change

# Hold the lowess fraction at 0.25 and vary only the step size,
# saving one plot per candidate value.
for step_size in (0.01, 0.02, 0.03, 0.04, 0.05, 0.1):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.25, step_size=step_size, classify=False, save_plot=True,
    )

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

## Highest values without smoothing
# Key of `d` whose entry has the largest value at index 1
max(d, key=lambda key: d[key][1])

# Lowess-based maximum at the fraction chosen from the plots above
print(dRFEtools.extract_max_lowess(d, frac=0.25))
# Redundant-set extraction with the same fraction and a 0.02 step size
dRFEtools.extract_redundant_lowess(
    d, frac=0.25, step_size=0.02,
)

(40, 3.7013019741124933)





(110, 4.705015520957808)

Optimize classification: multi-class

# Results for the multi-class example are written here
outdir = 'multiclass/'
mkdir_p(outdir)

# Simulate a 4-class dataset: 500 samples x 20,000 features, of which
# 100 are informative and 300 are redundant (see the arguments below)
X, y = make_classification(
    n_samples=500, n_features=20000, n_informative=100, n_redundant=300,
    n_repeated=0, n_classes=4, n_clusters_per_class=1, random_state=13
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
cla = dRFEtools.RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                       oob_score=True, random_state=13)

# Step through every split; once the loop finishes, `fold` and the
# train/test arrays hold the values of the LAST split, which is the
# single fold examined in the rest of this example.
for fold, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Synthetic feature names, one per column of the training matrix
features = ["feature_%d" % x for x in range(X_train.shape[1])]
# Run the random-forest feature elimination on the selected fold
d, pfirst = dRFEtools.rf_rfe(cla, X_train, y_train, np.array(features),
                             fold, outdir, elimination_rate=0.1, RANK=False)

Optimize lowess fit: fraction of data to use when estimating y-values

## Single Fold examination
# One optimize_lowess_plot call per candidate lowess fraction; the step
# size is held fixed at 0.02, multi=True is passed for the multi-class
# data, and each plot is saved (save_plot=True).
for frac in (0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=frac, step_size=0.02,
        classify=True, multi=True, save_plot=True,
    )

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

Optimize redundant selection: examine rate change

# Hold the lowess fraction at 0.25 and vary only the step size,
# saving one plot per candidate value.
for step_size in (0.01, 0.02, 0.03, 0.04, 0.05, 0.1):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.25, step_size=step_size,
        classify=True, multi=True, save_plot=True,
    )

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

## Highest value (without smoothing)
# Key of `d` whose entry has the largest value at index 1
max(d, key=lambda key: d[key][1])

# Lowess-based maximum at the fraction chosen from the plots above
print(dRFEtools.extract_max_lowess(d, frac=0.25))
# Redundant-set extraction with the same fraction and a 0.015 step size
dRFEtools.extract_redundant_lowess(
    d, frac=0.25, step_size=0.015,
)

(32, 3.481240089335692)





(72, 4.283586561860629)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

optimization.md

optimization.md

Example of optimizing lowess fit and max and redundant set selection

Optimize classification

Optimize lowess fit: fraction of data to use when estimating y-values

Optimize redundant selection: examine rate change

Optimize regression

Optimize lowess fit: fraction of data to use when estimating y-values

Optimize redundant selection: examine rate change

Optimize classification: multi-class

Optimize lowess fit: fraction of data to use when estimating y-values

Optimize redundant selection: examine rate change

Files

optimization.md

Latest commit

History

optimization.md

File metadata and controls

Example of optimizing lowess fit and max and redundant set selection

Optimize classification

Optimize lowess fit: fraction of data to use when estimating y-values

Optimize redundant selection: examine rate change

Optimize regression

Optimize lowess fit: fraction of data to use when estimating y-values

Optimize redundant selection: examine rate change

Optimize classification: multi-class

Optimize lowess fit: fraction of data to use when estimating y-values

Optimize redundant selection: examine rate change