Getting Started¶
Automatic data analysis pipeline¶
The package defines pipelines for automatic data analysis of
tabular data. The path to the data and the name of the outcome are the
only required arguments of the pipelines. In addition, many optional
arguments are available to tweak the behaviour of the automatic data
analysis pipelines. The following code section shows a complete data
analysis of the titanic data using the defined pipeline. The generated
report, preprocessed data, data preprocessing steps, selected models
and evaluation results are saved in the results folder of the
working directory. The meaning of the arguments can be found in the
Module Index section of this document or by calling help(automate_analysis).
# %% load required libs
import os
from dummyML.automate_analysis import automate_analysis
# %% arguments to control the data loading behavior
## path2data and outcome
# relative paths below are resolved against the current working directory
print(f"current working directory: {os.getcwd()}")
# regression problem
# path2data = "./data/titanic.csv"
# outcome = "Age"
# binary classification problem
path2data = "./data/titanic.csv"
outcome = "Survived"
# multi-classification problem
# path2data = "./data/Iris.csv"
# outcome = "Species"
# imbalanced classification
# path2data = "./data/credit.csv"
# outcome = "Credit Default"
## the column of the ID in the data
index_col = 0
# method for coding categorical variables
categorical_coding = "dummy"  # "dummy" or "ordinal"
# translate the coding method into the boolean `dummy_coding` flag that is
# passed to the pipeline; any other value is rejected right away
if categorical_coding == "dummy":
    dummy_coding = True
elif categorical_coding == "ordinal":
    dummy_coding = False
else:
    raise ValueError("categorical_coding should be either 'dummy' or 'ordinal'")
# %% arguments to control the preprocessing steps
# (see help(automate_analysis) for the meaning of each argument)
cat_levels_threshold = 15
missing_threshold = 0.5
for_future_test = False
max_sample = 1000
# %% log information & save the model
verbose = 0  # show log
save_results = True
# %% arguments to control the training and test sets splitting
test_size = 0.2  # split ratio for test set
random_state = None  # seed for splitting the data
# %% arguments to control the selection and fitting of ML models
imbalance = True  # allow imbalance classification when necessary
imbalance_force = True  # force to use imbalance classification
scaler = "standard"  # standard scaling for columns
decomposer = None
n_components = None
models = ["linear", "lasso", "ridge", "elasticNet", "svm", "gb", "rf"]
n_trials = 30  # number of Bayesian optimization steps
n_jobs = 10  # number of cores used for data analysis
max_iter = 100  # max number of iterations
# %% arguments to control the model evaluations
cv = 10  # K-Fold cross validation
cv_force = False  # force the model selection to use K-Fold CV
eval_cv = False  # whether to evaluate the selected models using K-Fold CV
# %%
# run the automatic data analysis; the results are saved in the results
# folder of the current working directory
automate_analysis(
    path2data,
    outcome,
    # data loading
    index_col=index_col,
    dummy_coding=dummy_coding,
    # preprocessing
    cat_levels_threshold=cat_levels_threshold,
    missing_threshold=missing_threshold,
    for_future_test=for_future_test,
    max_sample=max_sample,
    # logging and saving
    verbose=verbose,
    save_results=save_results,
    # train/test split
    test_size=test_size,
    random_state=random_state,
    # model selection and fitting
    imbalance=imbalance,
    imbalance_force=imbalance_force,
    scaler=scaler,
    decomposer=decomposer,
    n_components=n_components,
    models=models,
    n_trials=n_trials,
    n_jobs=n_jobs,
    max_iter=max_iter,
    # model evaluation
    cv=cv,
    cv_force=cv_force,
    eval_cv=eval_cv,
)
Semi-automatic data analysis¶
Some data analysts prefer a semi-automatic data analysis pipeline so that they have more control over the whole data analysis process, or they just want to use part of the pipeline to save time. We defined a template pipeline for semi-automatic data analysis, which is shown below.
# %% load required libs
import os
import joblib
import pandas as pd
from dummyML.preprocessing import data_preprocessing
from dummyML.automate_modeling_evaluation import automate_modeling
from dummyML.automate_modeling_evaluation import automate_evaluation
from dummyML.automate_modeling_evaluation import fair_evaluation_imb_clf_models
import dummyML.utilities as utilities
import dummyML
print(dummyML.__version__)
# %% load and summarize the data
## choose the data set (path2data) and the outcome column
# regression problem
# path2data = "./data/titanic.csv"
# outcome = "Age"
# binary classification problem
path2data = "./data/titanic.csv"
outcome = "Survived"
# multi-classification problem
# path2data = './data/Iris.csv'; outcome = 'Species'
# read sas data sets
# path2data = "./data/PaySimSample.sas7bdat"
# outcome = "isFlaggedFraud"
# method for coding categorical variables
categorical_coding = "dummy"  # "dummy" or "ordinal"
## experiment name used to save results: <data name>_<coding>_<outcome>
name = "_".join(
    [os.path.basename(path2data).split(".")[0], categorical_coding, outcome]
)
## read the data set (column 0 is passed as index_col)
data = utilities.read_data(path2data, index_col=0)
# data = data.sample(1000)
print(f"Data size is {data.shape}")
## write a summary of the data set under the experiment name
utilities.summarize_data(data, name, max_sample=1000, minimal=True)
# %% preprocess the data set
# hyperparameters in this step
# help(data_preprocessing) to see the meaning
cat_levels_threshold = 15
missing_threshold = 0.5
for_future_test = False
save_results = True
verbose = 0
# translate the coding method into the boolean `dummy_coding` flag;
# any other value is rejected right away
if categorical_coding == "dummy":
    dummy_coding = True
elif categorical_coding == "ordinal":
    dummy_coding = False
else:
    raise ValueError("categorical_coding should be either 'dummy' or 'ordinal'")
# preprocess the data; the third and fourth return values are not needed here
X, y, _, _, saved_preprocess_steps = data_preprocessing(
    data,
    outcome,
    name,
    dummy_coding=dummy_coding,
    cat_levels_threshold=cat_levels_threshold,
    missing_threshold=missing_threshold,
    for_future_test=for_future_test,
    verbose=verbose,
    save_results=save_results,
)
print(X.shape, y.shape)
print(saved_preprocess_steps)
# help(saved_preprocess_steps.transform)
# sanity check of the saved steps: transforming the raw data again should
# give the same number of columns as X (prints True when it does)
X_tmp = saved_preprocess_steps.transform(data)
print(X_tmp.shape[1] == X.shape[1])
# %% if some models have already been selected and trained, append rather than
# replace the previous models
# models used in this experiment
models = ["linear", "lasso", "ridge", "elasticNet", "svm", "gb", "rf"]
target_path = os.path.join("./results", name)
path2result = os.path.join(target_path, "saved_selected_models.joblib")
if os.path.isfile(path2result):
    previous_results = joblib.load(path2result)
    # keep only the part after the last "_" of each saved key
    # (e.g. "clf_lasso" -> "lasso") so it can be compared with `models`
    previous_models = [key.split("_")[-1] for key in previous_results]
    trained_models = utilities.intersection(previous_models, models)
    if len(trained_models) > 0:
        print(f"These models {trained_models} have already existed.")
        # only the models that are not saved yet still need to be trained
        models = utilities.setdiff(models, previous_models)
    if len(models) == 0:
        print("All the specified models have already trained and saved.")
# %% prepare the training and test set
# split ratio for test set
test_size = 0.2
# seed for splitting the data
random_state = None
# split train and test set
X_train, X_test, y_train, y_test = utilities.split_data(
    X, y, test_size=test_size, random_state=random_state
)
# %% select and fit the models
# hyperparameters for tuning this section
# standard scaling for columns
scaler = "standard"
decomposer = None
n_components = None
# allow imbalance classification when necessary
imbalance = True
# force to use imbalance classification
imbalance_force = False
# K-Fold cross validation
cv = 10
# force the model selection to use K-Fold CV to evaluate the model
cv_force = False
# number of Bayesian optimization steps
n_trials = 30
# number of cores used for data analysis
n_jobs = 10
# max number of iterations
max_iter = 100
# show log
verbose = 0
# only the training set is handed to the model selection step
results = automate_modeling(
    X_train,
    y_train,
    scaler=scaler,
    decomposer=decomposer,
    n_components=n_components,
    models=models,
    imbalance=imbalance,
    imbalance_force=imbalance_force,
    cv=cv,
    cv_force=cv_force,
    n_trials=n_trials,
    n_jobs=n_jobs,
    max_iter=max_iter,
    verbose=verbose,
)
print(results)
# %% evaluate the selected models
eval_cv = False  # whether to evaluate the selected models using K-Fold CV
# test evaluation or cv evaluation or all of them:
# the test set is only used when a non-trivial share of the data was held out
eval_test = test_size > 0.01
# make sure at least one evaluation mode is enabled: without test-set
# evaluation, fall back to K-Fold CV (this also covers test_size == 0.01,
# where neither mode would otherwise be selected)
if not (eval_test or eval_cv):
    eval_cv = True
# evaluate the models' performance on the test sets or K-Fold CV
models_metrics = automate_evaluation(
    results,
    X_test,
    y_test,
    X,
    y,
    eval_test=eval_test,
    eval_cv=eval_cv,
    cv=cv,
    n_jobs=n_jobs,
)
print("the performance of the selected models are: \n")
print(models_metrics)
# when the standard models are applied to imbalanced classification problems, cutoff 0.5 is not appropriate
imb_models_metrics = fair_evaluation_imb_clf_models(
    results, X_test, y_test, X_train, y_train, eval_test, eval_cv, cv=cv, n_jobs=n_jobs
)
print("the fair evalution of the selected models on imabalanced data sets are: \n")
print(imb_models_metrics)
# %% save the selected models and evaluation metrics
# make sure the results folder exists before anything is written into it
os.makedirs(target_path, exist_ok=True)
# save the selected models
# combine current results and previous results
if os.path.isfile(path2result):
    results.update(previous_results)
joblib.dump(results, filename=path2result)
# save metrics on the selected models
path2metrics = os.path.join(target_path, "metrics.csv")
if os.path.isfile(path2metrics):
    models_metrics_old = pd.read_csv(path2metrics, index_col=0)
    # join the current metrics and previous one
    models_metrics = models_metrics_old.join(models_metrics, how="outer")
models_metrics.to_csv(path2metrics)
# save the preprocessing pipelines
path2preprocess = os.path.join(target_path, "saved_preprocess_steps.joblib")
joblib.dump(saved_preprocess_steps, filename=path2preprocess)
# save fair evaluation of the standard models being applied to imbalanced classification problems
# (best effort: a failure here is reported but does not stop the script)
try:
    imb_models_metrics.to_csv(os.path.join(target_path, "imb_models_metrics.csv"))
except Exception as err:
    print(f"imb_models_metrics were not saved: {err}")
Explore the saved results¶
All the results are saved in the results folder of the working directory. Suppose the experiment name, which is a combination of the data name and the outcome name, is “name”. Usually, the following results are saved.
name_profile_report.html contains a report to summarize the data set.
name.npz contains the design matrix X, the outcome y, the sample index sample_index and the variable names feature_names.
name_saved_selected_models.joblib saves all the results. Loading this file into Python with joblib.load returns a dictionary, referred to as results below. Each key of results is a model_name and each value is the saved model. The saved model is an object of a self-defined class. Run print(results['model_name']) to see how to access the selected hyperparameters, the search range of the hyperparameters, the selected model, the feature importance, and the time used to train and select the model.
name_saved_preprocess_steps.joblib contains the saved preprocessing steps, so that the same data preprocessing can be replicated on a future test set.
The saved results can be loaded and explored with the following script.
# %% load required packages
import numpy as np
import pandas as pd
import joblib
# make sure you have the correct working directory
import os
print(f"current working directory is: {os.getcwd()}")
# os.chdir('working directory') # set the working directory for running the code
# %% extract all the saved results
# what is the name of the experiment
# NOTE: this must match the experiment folder under ./results; the
# semi-automatic template builds its name as <data>_<coding>_<outcome>
# (e.g. "titanic_dummy_Survived"), so adjust the value accordingly
name = "titanic_Survived"
# load into saved models
target_path = os.path.join("./results", name)
path2models = os.path.join(target_path, "saved_selected_models.joblib")
saved_models = joblib.load(path2models)
print("saved models:\n", saved_models.keys())
# load into saved metrics; the first column of the file is the index that was
# written by to_csv, so read it back as the index
path2results = os.path.join(target_path, "metrics.csv")
saved_metrics = pd.read_csv(path2results, index_col=0)
print("performance of saved models:\n", saved_metrics.head(3))
# load into saved preprocessing steps
path2preprocess = os.path.join(target_path, "saved_preprocess_steps.joblib")
saved_preprocess_steps = joblib.load(path2preprocess)
print(saved_preprocess_steps)
# %% extract the details of a saved model
# key of the model to inspect (one of saved_models.keys())
model_name = "clf_lasso"
# pick that model out of the loaded dictionary
saved_model = saved_models[model_name]
print(saved_model)
# the selected model, its feature importance, the selected
# hyperparameters and their searching ranges are all stored
# as attributes of the saved model
feature_importance = saved_model.feature_importances_
selected_hyperparameter = saved_model.selected_hyperparameters_
searching_range = saved_model.searching_ranges_
selected_model = saved_model.selected_model_
# %% find the top features
# read the feature names from the saved data file
# (allow_pickle=True: only load files that you created yourself)
path2data = os.path.join(target_path, "data.npz")
data_set = np.load(path2data, allow_pickle=True)
feature_names = data_set["feature_names"]
print("feature names:\n", feature_names[:10])
# label the importances with the feature names
print("feature importance shape: \n", feature_importance.shape)
feature_importance = pd.Series(feature_importance, index=feature_names)
# rank the features by absolute importance, largest first
order = feature_importance.abs().sort_values(ascending=False)
feature_importance = feature_importance.loc[order.index]
print(feature_importance.head(10))