Getting Started
===============

Automatic data analysis pipeline
--------------------------------

The package defines pipelines that perform automatic data analysis of
tabular data. Only the path to the data and the name of the outcome are
required arguments for the pipelines. In addition, many optional
arguments are available to tweak the behaviour of the automatic data
analysis pipelines. The following code section contains a full data
analysis workflow using the predefined pipeline on the Titanic data. The
generated report, preprocessed data, data preprocessing steps, selected
models and evaluation results are saved in the results folder of the
working directory. The meaning of the arguments can be found in the
Module Index section of this document or by running ``help(automate_analysis)``.

.. code:: python
  
  # %% load required libs
  import os
  from dummyML.automate_analysis import automate_analysis

  # %% data location and outcome column
  # relative paths below are resolved against this directory
  print(f"current working directory: {os.getcwd()}")

  # Pick exactly one (path2data, outcome) pair.
  # regression problem:
  #   path2data, outcome = "./data/titanic.csv", "Age"
  # multi-classification problem:
  #   path2data, outcome = "./data/Iris.csv", "Species"
  # imbalanced classification:
  #   path2data, outcome = "./data/credit.csv", "Credit Default"
  # binary classification problem (active):
  path2data, outcome = "./data/titanic.csv", "Survived"

  # %% how categorical variables are coded: "dummy" or "ordinal"
  categorical_coding = "dummy"
  # translate the coding name into the boolean flag the pipeline expects
  coding_to_flag = {"dummy": True, "ordinal": False}
  if categorical_coding not in coding_to_flag:
      raise ValueError("categorical_coding should be either 'dummy' or 'ordinal'")
  dummy_coding = coding_to_flag[categorical_coding]

  # %% options grouped by pipeline stage; see help(automate_analysis)
  # data loading
  loading_options = dict(
      index_col=0,  # the column of the ID in the data
      dummy_coding=dummy_coding,
  )

  # preprocessing steps
  preprocessing_options = dict(
      cat_levels_threshold=15,
      missing_threshold=0.5,
      for_future_test=False,
      max_sample=1000,
  )

  # log information & saving of the results
  output_options = dict(
      verbose=0,
      save_results=True,
  )

  # training and test set splitting
  splitting_options = dict(
      test_size=0.2,
      random_state=None,
  )

  # selection and fitting of the ML models
  modeling_options = dict(
      imbalance=True,
      imbalance_force=True,
      scaler="standard",
      decomposer=None,
      n_components=None,
      models=["linear", "lasso", "ridge", "elasticNet", "svm", "gb", "rf"],
      n_trials=30,
      n_jobs=10,
      max_iter=100,
  )

  # model evaluation
  evaluation_options = dict(
      cv=10,
      cv_force=False,
      eval_cv=False,
  )

  # %% run the automatic data analysis; results are saved in the results
  # folder of the current working directory
  automate_analysis(
      path2data,
      outcome,
      **loading_options,
      **preprocessing_options,
      **output_options,
      **splitting_options,
      **modeling_options,
      **evaluation_options,
  )

Semi-automatic data analysis
----------------------------

Some data analysts prefer a semi-automatic data analysis pipeline, either
because they want more control over the whole data analysis process or
because they only want to use part of the pipeline to save time. We defined
a template pipeline for semi-automatic data analysis, which is shown below.

.. code:: python

  # %% load required libs
  import os
  import joblib
  import pandas as pd
  from dummyML.preprocessing import data_preprocessing
  from dummyML.automate_modeling_evaluation import automate_modeling
  from dummyML.automate_modeling_evaluation import automate_evaluation
  from dummyML.automate_modeling_evaluation import fair_evaluation_imb_clf_models
  import dummyML.utilities as utilities
  import dummyML
  print(dummyML.__version__)

  # %% load and summarize the data
  ## specify the path2data and outcome
  # regression problem
  # path2data = "./data/titanic.csv"
  # outcome = "Age"

  # binary classification problem
  path2data = "./data/titanic.csv"
  outcome = "Survived"

  # multi-classification problem
  # path2data = "./data/Iris.csv"
  # outcome = "Species"

  # read sas data sets
  # path2data = "./data/PaySimSample.sas7bdat"
  # outcome = "isFlaggedFraud"

  # method for coding categorical variables
  categorical_coding = "dummy"  # "dummy" or "ordinal"

  ## experiment name used to save results:
  ## <data file name>_<categorical coding>_<outcome>, e.g. "titanic_dummy_Survived"
  name = os.path.basename(path2data).split(".")[0] + "_" + categorical_coding + "_" + outcome

  ## load the data set
  data = utilities.read_data(path2data, index_col=0)
  # data = data.sample(1000)
  print("Data size is {}".format(data.shape))

  ## summarize the data set
  utilities.summarize_data(data, name, max_sample=1000, minimal=True)

  # %% preprocess the data set
  # hyperparameters in this step
  # help(data_preprocessing) to see the meaning
  cat_levels_threshold = 15
  missing_threshold = 0.5
  for_future_test = False
  save_results = True
  verbose = 0

  # translate the coding name into the boolean flag used by data_preprocessing
  if categorical_coding == "dummy":
      dummy_coding = True
  elif categorical_coding == "ordinal":
      dummy_coding = False
  else:
      raise ValueError("categorical_coding should be either 'dummy' or 'ordinal'")

  # preprocess the data
  X, y, _, _, saved_preprocess_steps = data_preprocessing(
      data,
      outcome,
      name,
      dummy_coding=dummy_coding,
      cat_levels_threshold=cat_levels_threshold,
      missing_threshold=missing_threshold,
      for_future_test=for_future_test,
      verbose=verbose,
      save_results=save_results,
  )

  print(X.shape, y.shape)
  print(saved_preprocess_steps)
  # help(saved_preprocess_steps.transform)

  # sanity check: re-applying the saved steps to the raw data must give
  # the same number of columns as the preprocessed design matrix
  X_tmp = saved_preprocess_steps.transform(data)
  print(X_tmp.shape[1] == X.shape[1])

  # %% if some models have already been selected and trained, append rather than
  # replace the previous models

  # models used in this experiment
  models = ["linear", "lasso", "ridge", "elasticNet", "svm", "gb", "rf"]

  target_path = os.path.join("./results", name)
  path2result = os.path.join(target_path, "saved_selected_models.joblib")

  # stays empty when nothing has been saved for this experiment yet, so the
  # merge in the saving step never depends on a second file-system check
  previous_results = {}
  if os.path.isfile(path2result):
      previous_results = joblib.load(path2result)

      # drop the models that were already trained and saved;
      # the model name is the part of the key after the last underscore
      previous_models = [ele.split("_")[-1] for ele in previous_results.keys()]
      trained_models = utilities.intersection(previous_models, models)
      if len(trained_models) > 0:
          print(f"These models {trained_models} have already existed.")
          models = utilities.setdiff(models, previous_models)
          if len(models) == 0:
              # NOTE(review): the modeling step below is still called with an
              # empty model list - confirm automate_modeling handles that
              print("All the specified models have already trained and saved.")

  # %% prepare the training and test set
  test_size = 0.2  # split ratio for test set
  random_state = None  # seed for splitting the data

  # split train and test set
  X_train, X_test, y_train, y_test = utilities.split_data(
      X, y, test_size=test_size, random_state=random_state
  )

  # %% select and fit the models
  # hyperparameters for tuning this section
  scaler = "standard"  # standard scaling for columns
  decomposer = None
  n_components = None
  imbalance = True  # allow imbalance classification when necessary
  imbalance_force = False  # force to use imbalance classification
  cv = 10  # K-Fold cross validation
  cv_force = False  # force the model selection to use K-Fold CV to evaluate the model
  n_trials = 30  # number of Bayesian optimization steps
  n_jobs = 10  # number of cores used for data analysis
  max_iter = 100  # max number of iterations.
  verbose = 0  # show log

  results = automate_modeling(
      X_train,
      y_train,
      scaler=scaler,
      decomposer=decomposer,
      n_components=n_components,
      models=models,
      imbalance=imbalance,
      imbalance_force=imbalance_force,
      cv=cv,
      cv_force=cv_force,
      n_trials=n_trials,
      n_jobs=n_jobs,
      max_iter=max_iter,
      verbose=verbose,
  )
  print(results)

  # %% evaluate the selected models
  eval_cv = False  # whether to evaluate the selected models using K-Fold CV

  # evaluate on the test set only when one was actually held out;
  # without a usable test set, fall back to K-Fold CV evaluation
  eval_test = test_size > 0.01
  if (not eval_cv) and (test_size < 0.01):
      eval_cv = True

  # evaluate the models' performance on the test sets or K-Fold CV
  models_metrics = automate_evaluation(
      results,
      X_test,
      y_test,
      X,
      y,
      eval_test=eval_test,
      eval_cv=eval_cv,
      cv=cv,
      n_jobs=n_jobs,
  )
  print("the performance of the selected models are: \n")
  print(models_metrics)

  # when the standard models are applied to imbalanced classification problems, cutoff 0.5 is not appropriate
  imb_models_metrics = fair_evaluation_imb_clf_models(
      results, X_test, y_test, X_train, y_train, eval_test, eval_cv, cv=cv, n_jobs=n_jobs
  )
  print("the fair evaluation of the selected models on imbalanced data sets are: \n")
  print(imb_models_metrics)

  # %% save the selected models and evaluation metrics
  # make sure the experiment folder exists before anything is written to it
  os.makedirs(target_path, exist_ok=True)

  # save the selected models, combining the current and the previous results
  results.update(previous_results)
  joblib.dump(results, filename=path2result)

  # save metrics on the selected models
  path2metrics = os.path.join(target_path, "metrics.csv")
  if os.path.isfile(path2metrics):
      models_metrics_old = pd.read_csv(path2metrics, index_col=0)

      # join the current metrics and previous one
      models_metrics = models_metrics_old.join(models_metrics, how="outer")
  models_metrics.to_csv(path2metrics)

  # save the preprocessing pipelines
  path2preprocess = os.path.join(target_path, "saved_preprocess_steps.joblib")
  joblib.dump(saved_preprocess_steps, filename=path2preprocess)

  # save fair evaluation of the standard models being applied to imbalanced classification problems
  # this step is best-effort: a failure is reported instead of being silently ignored
  # NOTE(review): presumably there is nothing to save for some problem types - confirm
  try:
      imb_models_metrics.to_csv(os.path.join(target_path, "imb_models_metrics.csv"))
  except Exception as err:
      print(f"could not save imb_models_metrics.csv: {err}")

Explore the saved results
-------------------------

All the results are saved in the results folder of the working
directory. Suppose the experiment name, which is a combination of the
data name and the outcome name, is "name". Usually, the following results
are saved.

* ``name_profile_report.html`` contains a report that summarizes the data set.
* ``name.npz`` contains the design matrix ``X``, the outcome ``y``, the sample
  index ``sample_index`` and the variable names ``feature_names``.
* ``name_saved_selected_models.joblib`` saves all the results. Loading this
  file into Python with ``joblib.load`` returns a dictionary, referred to
  here as ``results``. Each key of ``results`` is a model name and each
  value is the saved model. The saved model is an object of a self-defined
  class. Run ``print(results['model_name'])`` to see how to access the
  selected hyperparameters, the search range of the hyperparameters, the
  selected models, the feature importance, and the time used to train and
  select the model.
* ``name_saved_preprocess_steps.joblib`` contains the saved preprocessing
  steps, which replicate the data preprocessing on a future test set.

The following script loads and explores the saved results.

.. code:: python

  # %% load required packages
  import os
  import numpy as np
  import pandas as pd
  import joblib

  # make sure you have the correct working directory:
  # all paths below are relative to it
  print("current working directory is: {}".format(os.getcwd()))
  # os.chdir('working directory') # set the working directory for running the code

  # %% extract all the saved results
  # name of the experiment, i.e. the sub-folder of ./results to explore
  # NOTE: the semi-automatic template in this guide builds its experiment name as
  # <data>_<categorical_coding>_<outcome> (e.g. "titanic_dummy_Survived");
  # set this to the folder that actually exists under ./results
  name = "titanic_Survived"

  # load the saved models
  # joblib files are pickle-based: only load files you created yourself or trust
  target_path = os.path.join("./results", name)
  path2models = os.path.join(target_path, "saved_selected_models.joblib")
  saved_models = joblib.load(path2models)
  print("saved models:\n", saved_models.keys())

  # load the saved metrics
  # index_col=0 restores the index written with the metrics instead of
  # reading it back as an extra unnamed column
  path2results = os.path.join(target_path, "metrics.csv")
  saved_metrics = pd.read_csv(path2results, index_col=0)
  print("performance of saved models:\n", saved_metrics.head(3))

  # load the saved preprocessing steps
  path2preprocess = os.path.join(target_path, "saved_preprocess_steps.joblib")
  saved_preprocess_steps = joblib.load(path2preprocess)
  print(saved_preprocess_steps)

  # %% extract the details of a saved model
  # the model you want to check; must be one of saved_models.keys()
  model_name = "clf_lasso"

  # extract the specific model
  saved_model = saved_models[model_name]
  print(saved_model)

  # the fitted model, feature importance, selected hyperparameters and
  # their searching ranges are attributes of the saved model
  feature_importance = saved_model.feature_importances_
  selected_hyperparameter = saved_model.selected_hyperparameters_
  searching_range = saved_model.searching_ranges_
  selected_model = saved_model.selected_model_

  # %% find the top features
  # load the feature names
  # allow_pickle=True unpickles stored objects: only use it on files you trust
  path2data = os.path.join(target_path, "data.npz")
  data_set = np.load(path2data, allow_pickle=True)
  feature_names = data_set["feature_names"]
  print("feature names:\n", feature_names[0:10])

  # show the ten features with the largest absolute importance,
  # keeping the sign of each importance value
  print("feature importance shape: \n", feature_importance.shape)
  feature_importance = pd.Series(feature_importance, index=feature_names)
  order = feature_importance.abs().sort_values(ascending=False)
  feature_importance = feature_importance[order.index]
  print(feature_importance.head(10))