library(pipeML)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

This tutorial demonstrates how to use the pipeML package for training and testing machine learning models. It introduces the main functions of the pipeline and guides you through additional functions for visualization.

Load data

raw.counts = pipeML::raw.counts
traitData = pipeML::traitData
deconvolution = pipeML::deconvolution

Feature selection

Apply repeated feature selection using the Boruta algorithm:

deconvolution$target = as.factor(traitData$Best.Confirmed.Overall.Response)
res_boruta <- feature.selection.boruta(
  data = deconvolution,
  iterations = 5,
  fix = FALSE,
  doParallel = FALSE,
  threshold = 0.8,
  file_name = "Test",
  return = FALSE
)
#> Running 5 iterations of the Boruta algorithm
#> 
#> Keeping only features confirmed in more than 0.8 of the times for training...............................................................
#> If you want to consider also tentative features, please specify tentative = TRUE in the parameters.

Inspect the results:

head(res_boruta$Matrix_Importance)
#> NULL

cat("Confirmed features:\n", res_boruta$Confirmed)
#> Confirmed features:

cat("Tentative features:\n", res_boruta$Tentative)
#> Tentative features:

To further assess tentative features, set fix = TRUE to rerun the selection and confirm or reject them:

res_boruta <- feature.selection.boruta(
  data = deconvolution,
  iterations = 5,
  fix = TRUE,
  doParallel = FALSE,
  threshold = 0.8,
  file_name = "Test",
  return = FALSE
)

For faster execution, enable parallelization by setting doParallel = TRUE and specifying the number of workers:

res_boruta <- feature.selection.boruta(
  data = deconvolution,
  iterations = 10,
  fix = FALSE,
  doParallel = TRUE,
  workers = 2,
  threshold = 0.8,
  file_name = "Test",
  return = FALSE
)

Train Machine Learning models

Train and tune models using repeated stratified k-fold cross-validation:

deconvolution = pipeML::deconvolution
traitData = pipeML::traitData
res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = FALSE,
                                    k_folds = 2,
                                    n_rep = 2,
                                    feature.selection = FALSE,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 2,
                                    return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Best ML model found:  RF 
#> Returning model trained

Access the best-trained model:

res[[1]]$Model

View all trained and tuned machine learning models:

res[[1]]$ML_Models
knitr::include_graphics("figures/Training.png")
Figure 1. Models training performance.
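
If the individual tuned models are stored as a named list of caret train objects (an assumption about pipeML's return structure, not shown in the output above), their cross-validated performance can also be compared directly with caret:

## Sketch assuming res[[1]]$ML_Models is a named list of caret `train` objects
resamps <- caret::resamples(res[[1]]$ML_Models)
summary(resamps)          # per-model resampling statistics
lattice::bwplot(resamps)  # box-and-whisker comparison across models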

To apply model stacking, set stack = TRUE:

res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = TRUE,
                                    k_folds = 2,
                                    n_rep = 2,
                                    feature.selection = FALSE,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 2,
                                    return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
#> : There were missing values in resampled performance measures.
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLMNET, RF, SVM_radial 
#> 
#> Meta-learners ML model based on GLM

Inspect the base models used in stacking:

res$Model$Base_models
#> NULL

Access the meta-learner:

res$Model$Meta_learner
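
Since the output above reports a GLM-based meta-learner, its coefficients (the weights given to each base model's out-of-fold predictions) can typically be inspected as well. This is only a sketch, assuming the meta-learner is stored as a caret train object whose finalModel is a glm:

## Assumption: Meta_learner is a caret `train` object with a glm finalModel
meta <- res$Model$Meta_learner
coef(meta$finalModel)  # one coefficient per base model (GLMNET, RF, SVM_radial)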

Model prediction

After training, you can predict on new data using the compute_prediction() function. You can specify which metric to maximize when determining the optimal classification threshold. Supported values for maximize include: "Accuracy", "Precision", "Recall", "Specificity", "Sensitivity", "F1", and "MCC".
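
In the call below, testing_set stands for a feature matrix of new samples with the same feature columns as the training data; it is not defined earlier in this tutorial. Purely for illustration, such a set could be built from the bundled data as follows (in that case, target_test rather than the full response vector would be supplied to target_var):

## Illustrative only: pretend a subset of samples is an external test set
test_idx    <- 101:nrow(deconvolution)
testing_set <- deconvolution[test_idx, ]
target_test <- traitData$Best.Confirmed.Overall.Response[test_idx]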

pred = pipeML::compute_prediction(model = res, 
                                  test_data = testing_set, 
                                  target_var = traitData$Best.Confirmed.Overall.Response, 
                                  trait.positive = "CR", 
                                  file.name = "Test", 
                                  maximize = "Accuracy", 
                                  return = F)

If return = TRUE, compute_prediction() will save the ROC and PR curves in the Results/ directory. The curves can also be generated with the following function:

get_curves(pred$Metrics, spec = "Specificity", 
           sens = "Sensitivity", reca = "Recall", 
           prec = "Precision", color = "Cohort", 
           auc_roc = pred$AUC$AUROC, 
           auc_prc = pred$AUC$AUPRC, 
           file.name = "Test")

To enable feature selection during model training, set feature.selection = TRUE. You must also specify the number of Boruta iterations via the n_boruta parameter and whether to resolve tentative features using the boruta_fix parameter (refer to feature.selection.boruta() for more details).

res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = TRUE,
                                    k_folds = 2,
                                    n_rep = 2,
                                    feature.selection = TRUE,
                                    n_boruta = 2, 
                                    boruta_fix = TRUE, 
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 2,
                                    return = FALSE)
#> Feature selection using Boruta...............................................................
#> Running 2 iterations of the Boruta algorithm
#> 
#> Keeping only features confirmed in more than 0.8 of the times for training...............................................................
#> If you want to consider also tentative features, please specify tentative = TRUE in the parameters.
#> 
#> Training machine learning model...............................................................
#> 
#> maximum number of iterations reached 0.0008488881 0.0008455173Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLMNET, SVM_radial, XGboost 
#> 
#> Meta-learners ML model based on GLM

When feature.selection = TRUE, the function will also return the features selected by Boruta. If set to FALSE, it returns all features by default.

res$Features
#> NULL

To enable parallelization, specify the number of cores via the ncores parameter (default is detectCores() - 1).

res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = TRUE,
                                    k_folds = 3,
                                    n_rep = 3,
                                    feature.selection = FALSE,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 3,
                                    return = FALSE)

Train and predict using the machine learning models

If you already have a separate testing dataset, you can directly train the model and perform predictions using the following function. You can also choose to optimize a specific metric to select a decision threshold and return detailed prediction performance metrics such as Accuracy, Sensitivity, Specificity, F1 score, MCC, Recall, and Precision.

deconvolution = pipeML::deconvolution
data_train = deconvolution[1:100,]
data_test = deconvolution[!rownames(deconvolution) %in% rownames(data_train),]
res <- compute_features.ML(features_train = data_train, 
                           features_test = data_test, 
                           clinical = traitData,
                           trait = "Best.Confirmed.Overall.Response",
                           trait.positive = "CR",
                           metric = "AUROC",
                           stack = FALSE,
                           k_folds = 3,
                           n_rep = 3,
                           feature.selection = FALSE,
                           LODO = FALSE,
                           batch_id = NULL,
                           file_name = "Test",
                           maximize = "F1",
                           return = FALSE)

Access the prediction metrics:

head(res$Prediction_metrics)

To compute variable importance:

importance = compute_variable.importance(res$Model, 
                                         stacking = FALSE, 
                                         n_cores = 2)

And to visualize SHAP values:

plot_shap_values(importance, "glm", "shap_plot")
knitr::include_graphics("figures/var_importance.png")
Figure 2. Variable importance.

Leaving one dataset out (LODO) analysis

If your data includes features from multiple cohorts, pipeML provides a flexible approach to perform Leave-One-Dataset-Out (LODO) analysis. This is achieved by applying k-fold stratified sampling across batches, ensuring that each fold preserves the batch structure while maintaining class balance. To do this, set LODO = TRUE and provide the vector containing the batch information via the batch_id argument (it must be in the same order as your samples).

Below, we demonstrate how to perform a LODO analysis using simulated datasets:

Simulate datasets from different batches

set.seed(123)

# Simulate traitData with 3 cohorts
traitData <- data.frame(
  Sample = paste0("Sample", 1:90),
  Response = sample(c("R", "NR"), 90, replace = TRUE),
  Cohort = rep(paste0("Cohort", 1:3), each = 30),
  stringsAsFactors = FALSE
)
rownames(traitData) <- traitData$Sample

# Simulate counts matrix 
counts_all <- matrix(rnorm(1000 * 90), nrow = 1000, ncol = 90)
rownames(counts_all) <- paste0("Gene", 1:1000)
colnames(counts_all) <- traitData$Sample

# Simulate some example features 
features_all <- matrix(runif(90 * 15), nrow = 90, ncol = 15)
rownames(features_all) <- traitData$Sample
colnames(features_all) <- paste0("Feature", 1:15)

Perform LODO analysis

prediction = list()
i = 1
for (cohort in unique(traitData$Cohort)) {
    
  # Test cohort
  traitData_test = traitData %>%
    filter(Cohort == cohort)
  counts_test = counts_all[,colnames(counts_all)%in%rownames(traitData_test)]
  features_test = features_all[rownames(features_all)%in%rownames(traitData_test),]

  # Train cohort
  traitData_train = traitData %>%
    filter(Cohort != cohort)
  counts_train = counts_all[,colnames(counts_all)%in%rownames(traitData_train)]
  features_train = features_all[rownames(features_all)%in%rownames(traitData_train),]

  #### ML Training
  res = pipeML::compute_features.training.ML(features_train, 
                                             traitData_train$Response, 
                                             trait.positive = "R", 
                                             metric = "AUROC", 
                                             stack = T, 
                                             k_folds = 2, 
                                             n_rep = 3, 
                                             feature.selection = F, 
                                             LODO = TRUE,
                                             batch_id = traitData_train$Cohort, 
                                             ncores = 3, 
                                             return = F)

  ## Feature importance
  importance = pipeML::compute_variable.importance(res, stacking = T, n_cores = 2)

  #### ML predicting
  
  #### Testing
  pred = pipeML::compute_prediction(model = res, 
                                    test_data = features_test, 
                                    target_var = traitData_test$Response, 
                                    trait.positive = "R", 
                                    stack = TRUE, 
                                    return = F)

  pred[["Importance"]] = importance
  
  #### Save results
  prediction[[i]] = pred
  names(prediction)[i] = cohort
  i = i + 1
}
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLM, KNN, RF 
#> 
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in compute_shap_values(ml_models[[base_models[i]]], train_data, :
#> Trivial predictions were found in some resamples. SHAP values cannot be
#> calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold:  0.7421809 
#> Accuracy:  70 
#> Sensitivity:  52.941 
#> Specificity:  92.308 
#> F1 score:  66.667 
#> MCC score:  47.565 
#> Recall:  52.941 
#> Precision:  90 
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLM, SVM_linear, XGboost 
#> 
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold:  0.2342613 
#> Accuracy:  66.667 
#> Sensitivity:  78.947 
#> Specificity:  45.455 
#> F1 score:  75 
#> MCC score:  25.661 
#> Recall:  78.947 
#> Precision:  71.429 
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLMNET, KNN, XGboost 
#> 
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): Trivial
#> predictions were found in some resamples. SHAP values cannot be calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold:  0.4913142 
#> Accuracy:  60 
#> Sensitivity:  81.25 
#> Specificity:  35.714 
#> F1 score:  68.421 
#> MCC score:  19.138 
#> Recall:  81.25 
#> Precision:  59.091

For plotting, we make use of the get_curves() function, adapted here to the case of multiple cohorts.

metrics = list()
auroc = c() 
auprc = c()
for (i in 1:length(prediction)) {
  metrics[[i]] = prediction[[i]]$Metrics %>%
    dplyr::mutate(Cohort = names(prediction)[i], 
                  AUROC = rep(prediction[[i]]$AUC$AUROC, nrow(prediction[[i]]$Metrics)),
                  AUPRC = rep(prediction[[i]]$AUC$AUPRC, nrow(prediction[[i]]$Metrics))) %>%
    tibble::rownames_to_column("Samples") 
}
metrics = do.call(rbind, metrics)
metrics$Cohort = as.factor(metrics$Cohort)

get_curves(metrics, "Specificity", "Sensitivity", 
           "Recall", "Precision", "Cohort", 
           metrics$AUROC, metrics$AUPRC, "Test")
#> agg_png 
#>       2
knitr::include_graphics("figures/ROC.png")
Figure 3. ROC curves.

knitr::include_graphics("figures/PR.png")
Figure 4. PR curves.

Custom k-fold cross validation

In certain use cases it is essential to carefully design how the training and test splits are created to prevent data leakage. pipeML supports the use of custom fold construction functions via the fold_construction_fun argument in the compute_features.training.ML() function. This allows you to preprocess your features within each fold, ensuring that information from the test set is never used during training.

Why use custom folds?

By default, cross-validation splits are performed directly on the feature matrix. However, if your features are generated through a process that depends on the full set of samples (e.g., clustering, dimensionality reduction, aggregation, or gene set enrichment), applying that process to the complete dataset before splitting can leak test-set information into the training set and bias the models.
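
As a small, self-contained illustration of the difference (using prcomp() as a stand-in for any sample-dependent transformation; none of these objects come from pipeML):

set.seed(1)
X <- matrix(rnorm(60 * 20), nrow = 60, ncol = 20)  # 60 samples x 20 features
colnames(X) <- paste0("Feature", 1:20)
train_idx <- 1:40
test_idx  <- 41:60

## Leaky: the projection is learned from ALL samples, including the future test set
pca_all      <- prcomp(X, scale. = TRUE)
scores_leaky <- pca_all$x[test_idx, 1:5]

## Safe: learn the projection on the training samples only, then project the test samples
pca_train    <- prcomp(X[train_idx, ], scale. = TRUE)
scores_train <- pca_train$x[, 1:5]
scores_test  <- predict(pca_train, newdata = X[test_idx, ])[, 1:5]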

Using a custom fold construction function ensures that:

- Test samples are projected onto the learned space without influencing it.
- Fold-level processing mimics how the model would behave on truly unseen data.

Below is an example of a custom fold function that computes TF activity per sample using the viper algorithm. This approach is based on a TF-target set enrichment analysis similar to GSEA, so the scores are sample-dependent. It requires normalized counts as input.

prepare_TFs_folds <- function(data, folds) {
  
  processed_folds <- list()
  
  for (i in seq_along(folds)) {
    
    train_idx <- folds[[i]]
    test_idx <- setdiff(seq_len(nrow(data)), train_idx)
    
    ## Subset data
    train_data <- data[train_idx, , drop = FALSE]
    obs_train <- train_data$target
    train_data$target <- NULL
    
    ## Run TFs once
    train_processed <- compute_TF_activity(RNA_tpm = t(train_data)) ## <-- User can change this part
    colnames(train_processed) <- gsub("[- +]", ".", colnames(train_processed)) ## Just to change special characters (ignore in other cases)
    
    train_cell_data <- train_processed %>%
      dplyr::mutate(target = obs_train)
    
    ## Prepare test data using trained info
    obs_test <- data$target[test_idx]
    test_data <- data[test_idx, , drop = FALSE]
    test_data$target <- NULL
    test_data <- compute_TF_activity(RNA_tpm = t(test_data)) ## <-- User can change this part
    colnames(test_data) <- gsub("[- +]", ".", colnames(test_data)) ## Just to change special characters (ignore in other cases)
    
    processed_folds[[i]] <- list(
      train_data = train_cell_data,
      test_data = test_data,
      obs_test = obs_test,
      rowIndex = test_idx,
      fold_name = names(folds)[i]
    )
  }
  
  # Run TFs on the full training set
  obs_train = data$target
  data$target = NULL
  
  train_processed_final <- compute_TF_activity(RNA_tpm = t(data)) ## <-- User can change this part
  colnames(train_processed_final) <- gsub("[- +]", ".", colnames(train_processed_final)) ## Just to change special characters (ignore in other cases)
  
  # Get features
  train_cell_data_final <- train_processed_final %>%
    dplyr::mutate(target = obs_train)
  
  custom_output = train_processed_final
  
  return(list(processed_folds, train_cell_data_final, custom_output))
}

To create your own custom fold function, you can follow the same structure defined above and change the part highlighted in the code. Once the function is ready, provide it to the main compute_features.training.ML() function via the fold_construction_fun argument.

## counts_train: Normalized counts to compute the features from the training samples
## traitData_train: Target variable from the training samples

res = pipeML::compute_features.training.ML(as.data.frame(t(counts_train)), traitData_train$Response, trait.positive = "R", 
                                           metric = "AUROC", stack = F, k_folds = 5, n_rep = 10, feature.selection = F, 
                                           LODO = F, ncores = 3, return = F, fold_construction_fun = prepare_TFs_folds)

To predict, you need to compute the same type of features on the test set:

## counts_test: Normalized counts to compute the features from the testing samples

tf_activities_test <- compute_TF_activity(RNA_tpm = counts_test)

# Predict in TFs
pred = compute_prediction(res, tf_activities_test, traitData_test$Response, trait.positive = "R", 
                          stack = FALSE, maximize = "Accuracy", return = F)