library(pipeML)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

This tutorial demonstrates how to use the pipeML package for training and testing machine learning models. It introduces the main functions of the pipeline and guides you through additional functions for visualization.

Load data

raw.counts = pipeML::raw.counts
traitData = pipeML::traitData
deconvolution = pipeML::deconvolution
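
Before going further, it is worth checking that the feature matrix and the clinical annotations describe the same samples. A minimal sanity check, assuming both objects have one row per sample:

# Quick sanity check (assumes one row per sample in both objects)
stopifnot(nrow(deconvolution) == nrow(traitData))
table(traitData$Best.Confirmed.Overall.Response)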

Feature selection

Apply repeated feature selection using the Boruta algorithm:

deconvolution$target = as.factor(traitData$Best.Confirmed.Overall.Response)
res_boruta <- feature.selection.boruta(
  data = deconvolution,
  iterations = 5,
  fix = FALSE,
  doParallel = FALSE,
  threshold = 0.8,
  file_name = "Test",
  return = FALSE
)
#> Running 5 iterations of the Boruta algorithm

Inspect the results:

head(res_boruta$Matrix_Importance)
#> # A tibble: 6 × 7
#>   Variable                     meanImp medianImp minImp maxImp normHits Decision
#>   <chr>                          <dbl>     <dbl>  <dbl>  <dbl>    <dbl> <chr>   
#> 1 CBSX_BPRNACan3DProMet_B.cel…  0.324    0.404   -1.32    1.48    0     Rejected
#> 2 CBSX_BPRNACan3DProMet_CD4.c…  0.0374   0.00686 -1.36    1.42    0     Rejected
#> 3 CBSX_BPRNACan3DProMet_CD8.c…  2.24     2.32    -0.163   4.06    0.333 Rejected
#> 4 CBSX_BPRNACan3DProMet_Cancer  0.593    0.824   -1.18    1.96    0     Rejected
#> 5 CBSX_BPRNACan3DProMet_Macro… -0.0122   0       -1.30    1.41    0     Rejected
#> 6 CBSX_BPRNACan3DProMet_Macro…  0.426    0.681   -1.34    1.68    0     Rejected

cat("Confirmed features:\n", res_boruta$Confirmed)
#> Confirmed features:
#>  CBSX_LM22_Mast.activated.cells CBSX_TIL10_Dendritic.cells DeconRNASeq_BPRNACan3DProMet_Cancer DeconRNASeq_LM22_Mast.activated.cells DeconRNASeq_LM22_NK.activated DeconRNASeq_TIL10_T.cells.regulatory

cat("Tentative features:\n", res_boruta$Tentative)
#> Tentative features:
#>  DeconRNASeq_TIL10_Macrophages.M2 Epidish_BPRNACanProMet_CD8.cells Quantiseq_Neutrophils
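
The full importance matrix can also be queried directly, for example to keep only the confirmed features and rank them by mean importance (a small sketch based on the columns shown above):

# Rank confirmed features by their mean importance across iterations
res_boruta$Matrix_Importance %>%
  dplyr::filter(Decision == "Confirmed") %>%
  dplyr::arrange(dplyr::desc(meanImp))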

To further assess tentative features, set fix = TRUE to rerun the selection and confirm or reject them:

res_boruta <- feature.selection.boruta(
  data = deconvolution,
  iterations = 5,
  fix = TRUE,
  doParallel = FALSE,
  threshold = 0.8,
  file_name = "Test",
  return = FALSE
)

For faster execution, enable parallelization by setting doParallel = TRUE and specifying the number of workers:

res_boruta <- feature.selection.boruta(
  data = deconvolution,
  iterations = 10,
  fix = FALSE,
  doParallel = TRUE,
  workers = 2,
  threshold = 0.8,
  file_name = "Test",
  return = FALSE
)

Train Machine Learning models

Train and tune models using repeated stratified k-fold cross-validation:

deconvolution = pipeML::deconvolution
traitData = pipeML::traitData
res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = FALSE,
                                    k_folds = 2,
                                    n_rep = 2,
                                    feature.selection = FALSE,
                                    seed = 123,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 2,
                                    return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Best ML model found:  C50 
#> Returning model trained

Access the best-trained model:

res[[1]]$Model

View all trained and tuned machine learning models:

res[[1]]$ML_Models
knitr::include_graphics("figures/Training.png")
Figure 1. Models training performance.
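
To compare the cross-validated performance of the individual models yourself, caret's resamples() helper can be used. This is a hedged sketch: it assumes the elements of ML_Models are caret::train objects fitted on the same resampling indices.

# Hedged sketch: side-by-side CV summary, assuming ML_Models holds caret::train objects
# that share the same resampling indices
cv_results <- caret::resamples(res[[1]]$ML_Models)
summary(cv_results)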

To apply model stacking, set stack = TRUE:

res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = TRUE,
                                    k_folds = 2,
                                    n_rep = 2,
                                    feature.selection = FALSE,
                                    seed = 123,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 2,
                                    return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: C50, GLMNET, SVM_radial 
#> 
#> Meta-learners ML model based on GLM

Inspect the base models used in stacking:

res$Model$Base_models
#> NULL

Access the meta-learner:

res$Model$Meta_learner
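
The meta-learner combines the base-model predictions; a quick way to inspect it is shown below. This is only a sketch: the exact class of the stored object depends on how pipeML saves it.

# Hedged sketch: inspect the stored meta-learner (exact class depends on pipeML internals)
class(res$Model$Meta_learner)
summary(res$Model$Meta_learner)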

Model prediction

After training, you can predict on new data with the compute_prediction() function. You can specify which metric to maximize when determining the optimal classification threshold. Supported values for maximize include: “Accuracy”, “Precision”, “Recall”, “Specificity”, “Sensitivity”, “F1”, and “MCC”.

# testing_set: a held-out feature matrix with the same columns as the training data (not built in this tutorial)
pred = pipeML::compute_prediction(model = res, 
                                  test_data = testing_set, 
                                  target_var = traitData$Best.Confirmed.Overall.Response, 
                                  trait.positive = "CR", 
                                  file.name = "Test", 
                                  maximize = "Accuracy", 
                                  return = F)

If return = TRUE, compute_prediction() will save the ROC and PR curves in the Results/ directory. They can also be retrieved with the following function:

get_curves(pred$Metrics, spec = "Specificity", 
           sens = "Sensitivity", reca = "Recall", 
           prec = "Precision", color = "Cohort", 
           auc_roc = pred$AUC$AUROC, 
           auc_prc = pred$AUC$AUPRC, 
           file.name = "Test")

To enable feature selection during model training, set feature.selection = TRUE. You must also specify the number of Boruta iterations via the n_boruta parameter and whether to resolve tentative features using the boruta_fix parameter (refer to feature.selection.boruta() for more details).

res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = TRUE,
                                    k_folds = 2,
                                    n_rep = 2,
                                    feature.selection = TRUE,
                                    n_boruta = 2, 
                                    boruta_fix = TRUE, 
                                    seed = 123,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 2,
                                    return = FALSE)
#> Feature selection using Boruta...............................................................
#> Running 2 iterations of the Boruta algorithm
#> 
#> Keeping only features confirmed in more than 80% of the times for training...............................................................
#> 
#> If you want to consider also tentative features, please specify tentative = TRUEin the parameters.
#> 
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLMNET, SVM_radial, XGboost 
#> 
#> Meta-learners ML model based on GLM

When feature.selection = TRUE, the function will also return the features selected by Boruta. If set to FALSE, it returns all features by default.

res$Features
#> NULL

To enable parallelization, specify the number of cores via the ncores parameter (default is detectCores() - 1).

res <- compute_features.training.ML(features_train = deconvolution, 
                                    target_var = traitData$Best.Confirmed.Overall.Response,
                                    trait.positive = "CR",
                                    metric = "AUROC",
                                    stack = TRUE,
                                    k_folds = 3,
                                    n_rep = 3,
                                    feature.selection = FALSE,
                                    seed = 123,
                                    LODO = FALSE,
                                    batch_id = NULL,
                                    file_name = "Test",
                                    ncores = 3,
                                    return = FALSE)
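
You can check how many cores are available before setting ncores, mirroring the package default mentioned above:

# Number of cores the package would use by default
parallel::detectCores() - 1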

Train and predict using the machine learning models

If you already have a separate testing dataset, you can directly train the model and perform predictions using the following function. You can also choose to optimize a specific metric to select a decision threshold and return detailed prediction performance metrics such as Accuracy, Sensitivity, Specificity, F1 score, MCC, Recall, and Precision.

deconvolution = pipeML::deconvolution
data_train = deconvolution[1:100,]
data_test = deconvolution[!rownames(deconvolution) %in% rownames(data_train),]
res <- compute_features.ML(features_train = data_train, 
                           features_test = data_test, 
                           clinical = traitData,
                           trait = "Best.Confirmed.Overall.Response",
                           trait.positive = "CR",
                           metric = "AUROC",
                           stack = FALSE,
                           k_folds = 3,
                           n_rep = 3,
                           feature.selection = FALSE,
                           seed = 123,
                           LODO = FALSE,
                           batch_id = NULL,
                           file_name = "Test",
                           maximize = "F1",
                           return = FALSE)

Access the prediction metrics:

head(res$Prediction_metrics)

To compute variable importance:

importance = compute_variable.importance(res$Model, 
                                         stacking = FALSE, 
                                         n_cores = 2)

And to visualize SHAP values:

plot_shap_values(importance, "glm", "shap_plot")
knitr::include_graphics("figures/var_importance.png")
Figure 2. Variable importance.

Leaving one dataset out (LODO) analysis

If your data includes features from multiple cohorts, pipeML provides a flexible approach to perform Leave-One-Dataset-Out (LODO) analysis. This is achieved by applying k-fold stratified sampling across batches, ensuring that each fold preserves the batch structure while maintaining class balance. To do this, set LODO = TRUE and provide the vector of batch labels in the batch_id parameter (it must be in the same order as your samples).

Below, we demonstrate how to perform a LODO analysis using simulated datasets:

Simulate datasets from different batches

set.seed(123)

# Simulate traitData with 3 cohorts
traitData <- data.frame(
  Sample = paste0("Sample", 1:90),
  Response = sample(c("R", "NR"), 90, replace = TRUE),
  Cohort = rep(paste0("Cohort", 1:3), each = 30),
  stringsAsFactors = FALSE
)
rownames(traitData) <- traitData$Sample

# Simulate counts matrix 
counts_all <- matrix(rnorm(1000 * 90), nrow = 1000, ncol = 90)
rownames(counts_all) <- paste0("Gene", 1:1000)
colnames(counts_all) <- traitData$Sample

# Simulate some example features 
features_all <- matrix(runif(90 * 15), nrow = 90, ncol = 15)
rownames(features_all) <- traitData$Sample
colnames(features_all) <- paste0("Feature", 1:15)
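
As batch_id and the target variable must follow the sample order, a quick check that the simulated objects are aligned:

# Features, counts and clinical table should refer to the same samples, in the same order
stopifnot(identical(rownames(features_all), rownames(traitData)),
          identical(colnames(counts_all), rownames(traitData)))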

Perform LODO analysis

prediction = list()
i = 1
for (cohort in unique(traitData$Cohort)) {
    
  # Test cohort
  traitData_test = traitData %>%
    filter(Cohort == cohort)
  counts_test = counts_all[,colnames(counts_all)%in%rownames(traitData_test)]
  features_test = features_all[rownames(features_all)%in%rownames(traitData_test),]

  # Train cohort
  traitData_train = traitData %>%
    filter(Cohort != cohort)
  counts_train = counts_all[,colnames(counts_all)%in%rownames(traitData_train)]
  features_train = features_all[rownames(features_all)%in%rownames(traitData_train),]

  #### ML Training
  res = pipeML::compute_features.training.ML(features_train, 
                                             traitData_train$Response, 
                                             trait.positive = "R", 
                                             metric = "AUROC", 
                                             stack = T, 
                                             k_folds = 2, 
                                             n_rep = 3, 
                                             feature.selection = F, 
                                             seed = 123, 
                                             LODO = TRUE,
                                             batch_id = traitData_train$Cohort, 
                                             ncores = 3, 
                                             return = F)

  ## Feature importance
  importance = pipeML::compute_variable.importance(res, stacking = T, n_cores = 2)

  #### ML predicting
  
  #### Testing
  pred = pipeML::compute_prediction(model = res, 
                                    test_data = features_test, 
                                    target_var = traitData_test$Response, 
                                    trait.positive = "R", 
                                    stack = TRUE, 
                                    return = F)

  pred[["Importance"]] = importance
  
  #### Save results
  prediction[[i]] = pred
  names(prediction)[i] = cohort
  i = i + 1
}
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: C50, KNN, LDA 
#> 
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in compute_shap_values(ml_models[[base_models[i]]], train_data, :
#> Trivial predictions were found in some resamples. SHAP values cannot be
#> calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): Trivial
#> predictions were found in some resamples. SHAP values cannot be calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold:  0.8514192 
#> Accuracy:  60 
#> Sensitivity:  64.706 
#> Specificity:  53.846 
#> F1 score:  64.706 
#> MCC score:  18.552 
#> Recall:  64.706 
#> Precision:  64.706 
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: GLM, SVM_linear, XGboost 
#> 
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold:  0.1471158 
#> Accuracy:  63.333 
#> Sensitivity:  84.211 
#> Specificity:  27.273 
#> F1 score:  74.419 
#> MCC score:  13.834 
#> Recall:  84.211 
#> Precision:  66.667 
#> Training machine learning model...............................................................
#> 
#> Choosing base models for stacking.......................................
#> 
#> Models chosen are: LDA, SVM_linear, XGboost 
#> 
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold:  0.08846727 
#> Accuracy:  56.667 
#> Sensitivity:  100 
#> Specificity:  7.143 
#> F1 score:  71.111 
#> MCC score:  19.852 
#> Recall:  100 
#> Precision:  55.172
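
Before plotting, the per-cohort AUROC and AUPRC values stored in prediction can be pulled out for a quick overview (the same fields are used in the plotting code below):

# Per-cohort summary of the AUC values returned by compute_prediction()
sapply(prediction, function(p) c(AUROC = p$AUC$AUROC, AUPRC = p$AUC$AUPRC))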

For plotting, we use the get_curves() function, here adapted to handle multiple cohorts.

metrics = list()
for (i in 1:length(prediction)) {
  metrics[[i]] = prediction[[i]]$Metrics %>%
    dplyr::mutate(Cohort = names(prediction)[i], 
                  AUROC = rep(prediction[[i]]$AUC$AUROC, nrow(prediction[[i]]$Metrics)),
                  AUPRC = rep(prediction[[i]]$AUC$AUPRC, nrow(prediction[[i]]$Metrics))) %>%
    tibble::rownames_to_column("Samples") 
}
metrics = do.call(rbind, metrics)
metrics$Cohort = as.factor(metrics$Cohort)

get_curves(metrics, "Specificity", "Sensitivity", 
           "Recall", "Precision", "Cohort", 
           metrics$AUROC, metrics$AUPRC, "Test")
#> agg_png 
#>       2
knitr::include_graphics("figures/ROC.png")
Figure 3. ROC curves.

knitr::include_graphics("figures/PR.png")
Figure 4. PR curves.