Getting Started =============== Automatic data analysis pipeline -------------------------------- The package defines pipelines to do automatic data analysis of tabular data. Only the path to the data and the name of the outcome are required arguments for the pipelines. In addition, there are many optional arguments available to tweak the behaviour of the automatic data analysis pipelines. The following code section contains a full data analysis workflow using the defined pipeline on the Titanic data. The generated report, preprocessed data, data preprocessing steps, selected models and evaluation results are saved in the results folder of the working directory. The meaning of the arguments can be found in the Module Index section of this document or via ``help(automate_analysis)``. .. code:: # %% load required libs import os from dummyML.automate_analysis import automate_analysis # %% arguments to control the data loading behavior ## path2data and outcome print("current working directory: {}".format(os.getcwd())) # regression problem # path2data = "./data/titanic.csv" # outcome = "Age" # binary classification problem path2data = "./data/titanic.csv" outcome = "Survived" # multi-classification problem # path2data = "./data/Iris.csv" # outcome = "Species" # imbalanced classification # path2data = "./data/credit.csv" # outcome = "Credit Default" ## the column of the ID in the data index_col = 0 # method for coding categorical variables categorical_coding = "dummy" # "dummy" or "ordinal" # get the dummy_coding method if categorical_coding == "dummy": dummy_coding = True elif categorical_coding == "ordinal": dummy_coding = False else: raise ValueError("categorical_coding should be either 'dummy' or 'ordinal'") # %% arguments to control the preprocessing steps cat_levels_threshold = 15 missing_threshold = 0.5 for_future_test = False max_sample = 1000 # %% log information & save the model verbose = 0 save_results = True # %% arguments to control the training and test sets 
splitting test_size = 0.2 random_state = None # %% arguments to control the selection and fitting of ML models imbalance = True imbalance_force = True scaler = "standard" decomposer = None n_components = None models = ["linear", "lasso", "ridge", "elasticNet", "svm", "gb", "rf"] n_trials = 30 n_jobs = 10 max_iter = 100 # %% arguments to control the model evaluations cv = 10 cv_force = False eval_cv = False # %% # automate data analysis. results are saved in the results folder of # the current working directory automate_analysis( path2data, outcome, imbalance=imbalance, imbalance_force=imbalance_force, index_col=index_col, dummy_coding=dummy_coding, test_size=test_size, eval_cv=eval_cv, scaler=scaler, decomposer=decomposer, n_components=n_components, models=models, cv=cv, cv_force=cv_force, n_trials=n_trials, n_jobs=n_jobs, max_iter=max_iter, max_sample=max_sample, verbose=verbose, cat_levels_threshold=cat_levels_threshold, missing_threshold=missing_threshold, for_future_test=for_future_test, save_results=save_results, random_state=random_state, ) Semi-automatic data analysis ---------------------------- Some data analysts prefer a semi-automatic data analysis pipeline, so that they have more control over the whole data analysis process. Others just want to use part of the pipeline to save time. We define a template pipeline for semi-automatic data analysis, which is shown below. .. 
code:: # %% load required libs import os import joblib import pandas as pd from dummyML.preprocessing import data_preprocessing from dummyML.automate_modeling_evaluation import automate_modeling from dummyML.automate_modeling_evaluation import automate_evaluation from dummyML.automate_modeling_evaluation import fair_evaluation_imb_clf_models import dummyML.utilities as utilities import dummyML print(dummyML.__version__) # %% load and summarize the data ## specify the path2data and outcome # regression problem # path2data = "./data/titanic.csv" # outcome = "Age" # binary classification problem path2data = "./data/titanic.csv" outcome = "Survived" # multi-classification problem # path2data = './data/Iris.csv'; outcome = 'Species' # read sas data sets # path2data = "./data/PaySimSample.sas7bdat" # outcome = "isFlaggedFraud" # method for coding categorical variables categorical_coding = "dummy" # "dummy" or "ordinal" ## using the combination of data name and outcome as experiment name to save results name = os.path.basename(path2data).split(".")[0] + "_" + categorical_coding + "_" + outcome ## load into the data set data = utilities.read_data(path2data, index_col=0) # data = data.sample(1000) print("Data size is {}".format(data.shape)) ## summarize the data set utilities.summarize_data(data, name, max_sample=1000, minimal=True) # %% preprocess the data set # hyperparameters in this step # help(data_preprocessing) to see the meaning cat_levels_threshold = 15 missing_threshold = 0.5 for_future_test = False save_results = True verbose = 0 # get the dummy_coding method if categorical_coding == "dummy": dummy_coding = True elif categorical_coding == "ordinal": dummy_coding = False else: raise ValueError("categorical_coding should be either 'dummy' or 'ordinal'") # preprocess the data X, y, _, _, saved_preprocess_steps = data_preprocessing( data, outcome, name, dummy_coding=dummy_coding, cat_levels_threshold=cat_levels_threshold, missing_threshold=missing_threshold, 
for_future_test=for_future_test, verbose=verbose, save_results=save_results, ) print(X.shape, y.shape) print(saved_preprocess_steps) # help(saved_preprocess_steps.transform) # test the saved_preprocess_steps X_tmp = saved_preprocess_steps.transform(data) print(X_tmp.shape[1] == X.shape[1]) # %% if some models have already been selected and trained, append rather than # replace the previous models # models used in this experiment models = ["linear", "lasso", "ridge", "elasticNet", "svm", "gb", "rf"] target_path = os.path.join("./results", name) path2result = os.path.join(target_path, "saved_selected_models.joblib") if os.path.isfile(path2result): previous_results = joblib.load(path2result) # remove the already saved models previous_models = previous_results.keys() previous_models = [ele.split("_")[-1] for ele in previous_models] trained_models = utilities.intersection(previous_models, models) if len(trained_models) > 0: print(f"These models {trained_models} have already existed.") models = utilities.setdiff(models, previous_models) if len(models) == 0: print("All the specified models have already trained and saved.") # %% prepare the training and test set test_size = 0.2 # split ratio for test set random_state = None # seed for splitting the data # split train and test set X_train, X_test, y_train, y_test = utilities.split_data( X, y, test_size=test_size, random_state=random_state ) # %% selected and fit the model # hyperparameters for tuning the this section scaler = "standard" # standard scaling for columns decomposer = None n_components = None imbalance = True # allow imbalance classification when necessary imbalance_force = False # force to use imbalance classification cv = 10 # K-Fold cross validation cv_force = False # force the model selection to use K-Fold CV to evaluate the model n_trials = 30 # number of Bayesian optimization steps n_jobs = 10 # number of cores used for data analysis max_iter = 100 # max number of iterations. 
verbose = 0 # show log results = automate_modeling( X_train, y_train, scaler=scaler, decomposer=decomposer, n_components=n_components, models=models, imbalance=imbalance, imbalance_force=imbalance_force, cv=cv, cv_force=cv_force, n_trials=n_trials, n_jobs=n_jobs, max_iter=max_iter, verbose=verbose, ) print(results) # %% evaluate the selected models eval_cv = False # whether to evaluate the selected models using K-Fold CV # test evaluation or cv evaluation or all of them eval_test = True if test_size > 0.01 else False if (not eval_cv) and (test_size < 0.01): eval_cv = True # evaluate the models' performance on the test sets or K-Fold CV models_metrics = automate_evaluation( results, X_test, y_test, X, y, eval_test=eval_test, eval_cv=eval_cv, cv=cv, n_jobs=n_jobs, ) print("the performance of the selected models are: \n") print(models_metrics) # when the standard models are applied to imbalanced classification problems, cutoff 0.5 is not appropriate imb_models_metrics = fair_evaluation_imb_clf_models( results, X_test, y_test, X_train, y_train, eval_test, eval_cv, cv=cv, n_jobs=n_jobs ) print("the fair evalution of the selected models on imabalanced data sets are: \n") print(imb_models_metrics) # %% save the selected models and evaluation metrics # save the selected models # combine current results and previous results if os.path.isfile(path2result): results.update(previous_results) joblib.dump(results, filename=path2result) # save metrics on the selected models path2metrics = os.path.join(target_path, "metrics.csv") if os.path.isfile(path2metrics): models_metrics_old = pd.read_csv(path2metrics, index_col=0) # join the current metrics and previous one models_metrics = models_metrics_old.join(models_metrics, how="outer") models_metrics.to_csv(path2metrics) # save the preprocessing pipelines path2preprocess = os.path.join(target_path, "saved_preprocess_steps.joblib") joblib.dump(saved_preprocess_steps, filename=path2preprocess) # save fair evaluation of the standard 
models being applied to imbalanced classification problems try: imb_models_metrics.to_csv(os.path.join(target_path, "imb_models_metrics.csv")) except: pass Explore the saved results ------------------------- All the results are saved in the results folder of the working directory. Suppose the experiment name, which is a combination of the data name and the outcome name, is "name". Usually, the following results are saved. * name_profile_report.html contains a report summarizing the data set. * name.npz contains the design matrix X, the outcome y, the sample index sample_index and the variable names feature_names. * name_saved_selected_models.joblib saves all the results. Loading this file into Python with joblib.load yields a dictionary, referred to here as results. Each key of results is a model name, and the corresponding value is the saved model. The saved model is an object of a self-defined class. Use print(results['model_name']) to see how to access the selected hyperparameters, the search range of the hyperparameters, the selected models, the feature importance, and the time used to train and select the model. * name_saved_preprocess_steps.joblib contains the saved preprocessing steps, which can be used to replicate the data preprocessing on a future test set. A script to load and explore the saved results is shown below. .. 
code:: # %% load required packages import numpy as np import pandas as pd import joblib # make sure you have the correct working directory import os print("current working directory is: {}".format(os.getcwd())) # os.chdir('working directory') # set the working directory for running the code # %% extract all the saved results # what is the name of the experiment name = "titanic_Survived" # load into saved models target_path = os.path.join("./results", name) path2models = os.path.join(target_path, "saved_selected_models.joblib") saved_models = joblib.load(path2models) print("saved models:\n", saved_models.keys()) # load into saved metrics path2results = os.path.join(target_path, "metrics.csv") saved_metrics = pd.read_csv(path2results) print("performance of saved models:\n", saved_metrics.head(3)) # load into saved preprocessing steps path2preprocess = os.path.join(target_path, "saved_preprocess_steps.joblib") saved_preprocess_steps = joblib.load(path2preprocess) print(saved_preprocess_steps) # %% extract the details of a saved model # what is the model you want to check model_name = "clf_lasso" # extract the specific model saved_model = saved_models[model_name] print(saved_model) # the saved model, feature importance, and # selected hyperparameters are in # the attributes of saved model feature_importance = saved_model.feature_importances_ selected_hyperparameter = saved_model.selected_hyperparameters_ searching_range = saved_model.searching_ranges_ selected_model = saved_model.selected_model_ # %% find the top features # load into the feature names path2data = os.path.join(target_path, "data.npz") data_set = np.load(path2data, allow_pickle=True) feature_names = data_set["feature_names"] print("feature names:\n", feature_names[0:10]) # show feature importances print("feature importance shape: \n", feature_importance.shape) feature_importance = pd.Series(feature_importance, index=feature_names) order = feature_importance.abs().sort_values(ascending=False) 
feature_importance = feature_importance[order.index] print(feature_importance.head(10))