library(pipeML)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
This tutorial demonstrates how to use the pipeML package for training and testing machine learning models. It introduces the main functions of the pipeline and guides you through additional functions for visualization.
Load data
raw.counts = pipeML::raw.counts
traitData = pipeML::traitData
deconvolution = pipeML::deconvolution
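Before running the pipeline it is worth a quick sanity check of the inputs; here we assume deconvolution is a samples-by-features matrix of deconvolution scores and traitData carries the clinical annotations:
# Dimensions of the feature matrix and distribution of the response variable
dim(deconvolution)
table(traitData$Best.Confirmed.Overall.Response)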
Feature selection
Apply repeated feature selection using the Boruta algorithm:
deconvolution$target = as.factor(traitData$Best.Confirmed.Overall.Response)
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 5,
fix = FALSE,
doParallel = FALSE,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
#> Running 5 iterations of the Boruta algorithm
Inspect the results:
head(res_boruta$Matrix_Importance)
#> # A tibble: 6 × 7
#> Variable meanImp medianImp minImp maxImp normHits Decision
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 CBSX_BPRNACan3DProMet_B.cel… 0.324 0.404 -1.32 1.48 0 Rejected
#> 2 CBSX_BPRNACan3DProMet_CD4.c… 0.0374 0.00686 -1.36 1.42 0 Rejected
#> 3 CBSX_BPRNACan3DProMet_CD8.c… 2.24 2.32 -0.163 4.06 0.333 Rejected
#> 4 CBSX_BPRNACan3DProMet_Cancer 0.593 0.824 -1.18 1.96 0 Rejected
#> 5 CBSX_BPRNACan3DProMet_Macro… -0.0122 0 -1.30 1.41 0 Rejected
#> 6 CBSX_BPRNACan3DProMet_Macro… 0.426 0.681 -1.34 1.68 0 Rejected
cat("Confirmed features:\n", res_boruta$Confirmed)
#> Confirmed features:
#> CBSX_LM22_Mast.activated.cells CBSX_TIL10_Dendritic.cells DeconRNASeq_BPRNACan3DProMet_Cancer DeconRNASeq_LM22_Mast.activated.cells DeconRNASeq_LM22_NK.activated DeconRNASeq_TIL10_T.cells.regulatory
cat("Tentative features:\n", res_boruta$Tentative)
#> Tentative features:
#> DeconRNASeq_TIL10_Macrophages.M2 Epidish_BPRNACanProMet_CD8.cells Quantiseq_Neutrophils
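The normHits column of Matrix_Importance records the fraction of iterations in which a feature was confirmed, and the threshold argument is conceptually a cut-off on that fraction. A minimal sketch of this filtering step (an illustration, not necessarily how the package implements it internally):
# Features confirmed in at least 80% of the Boruta iterations
res_boruta$Matrix_Importance %>%
dplyr::filter(normHits >= 0.8) %>%
dplyr::pull(Variable)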
To further assess tentative features, set fix = TRUE to rerun the selection and confirm or reject them:
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 5,
fix = TRUE,
doParallel = FALSE,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
For faster execution, enable parallelization (make sure the doParallel and workers parameters are set):
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 10,
fix = FALSE,
doParallel = TRUE,
workers = 2,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
Train machine learning models
Train and tune models using repeated stratified k-fold cross-validation:
deconvolution = pipeML::deconvolution
traitData = pipeML::traitData
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = FALSE,
k_folds = 2,
n_rep = 2,
feature.selection = FALSE,
seed = 123,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Best ML model found: C50
#> Returning model trained
Access the best-trained model:
res[[1]]$Model
View all trained and tuned machine learning models:
res[[1]]$ML_Models
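Because training builds on caret, the tuned fits can also be compared with caret's own tooling. A minimal sketch, assuming ML_Models is a named list of caret train objects with stored resampling results:
# Compare cross-validation performance across the tuned models
resamps <- caret::resamples(res[[1]]$ML_Models)
summary(resamps)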
knitr::include_graphics("figures/Training.png")

Figure 1. Model training performance.
To apply model stacking, set stack = TRUE:
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 2,
n_rep = 2,
feature.selection = FALSE,
seed = 123,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: C50, GLMNET, SVM_radial
#>
#> Meta-learners ML model based on GLM
Inspect the base models used in stacking:
res$Model$Base_models
#> NULL
Access the meta-learner:
res$Model$Meta_learner
Model prediction
After training, users can predict on new data using the compute_prediction() function. You can specify which metric to maximize when determining the optimal classification threshold. Supported values for maximize include: “Accuracy”, “Precision”, “Recall”, “Specificity”, “Sensitivity”, “F1”, and “MCC”.
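The threshold search itself is simple to picture: scan candidate cut-offs over the predicted probabilities and keep the one that maximizes the chosen metric. A toy sketch of the idea with hypothetical probs and labels vectors (accuracy only; pipeML handles the full metric list above):
# Toy threshold search: pick the cut-off maximizing accuracy
probs <- c(0.9, 0.8, 0.6, 0.4, 0.3, 0.1) # hypothetical predicted probabilities
labels <- c(1, 1, 0, 1, 0, 0) # hypothetical true classes (1 = positive)
thresholds <- sort(unique(probs))
acc <- sapply(thresholds, function(t) mean((probs >= t) == labels))
thresholds[which.max(acc)] # best cut-off for this toy data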
# testing_set is assumed to be your held-out data, with the same
# feature columns as the training set (hypothetical object)
pred = pipeML::compute_prediction(model = res,
test_data = testing_set,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
file.name = "Test",
maximize = "Accuracy",
return = FALSE)
If return = TRUE, compute_prediction() will save the ROC and PR curves in the Results/ directory. They can also be retrieved with the following function:
get_curves(pred$Metrics, spec = "Specificity",
sens = "Sensitivity", reca = "Recall",
prec = "Precision", color = "Cohort",
auc_roc = pred$AUC$AUROC,
auc_prc = pred$AUC$AUPRC,
file.name = "Test")
To enable feature selection during model training, set feature.selection = TRUE. You must also specify the number of Boruta iterations via the n_boruta parameter and whether to resolve tentative features using the boruta_fix parameter (refer to feature.selection.boruta() for more details).
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 2,
n_rep = 2,
feature.selection = TRUE,
n_boruta = 2,
boruta_fix = TRUE,
seed = 123,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Feature selection using Boruta...............................................................
#> Running 2 iterations of the Boruta algorithm
#>
#> Keeping only features confirmed in more than 80% of the times for training...............................................................
#>
#> If you want to consider also tentative features, please specify tentative = TRUE in the parameters.
#>
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLMNET, SVM_radial, XGboost
#>
#> Meta-learners ML model based on GLM
When feature.selection = TRUE, the function will also return the features selected by Boruta. If set to FALSE, it returns all features by default.
res$Features
#> NULL
To enable parallelization, specify the number of cores via the ncores parameter (the default is detectCores() - 1).
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 3,
n_rep = 3,
feature.selection = FALSE,
seed = 123,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 3,
return = FALSE)
Train and predict using the machine learning models
If you already have a separate testing dataset, you can train the model and generate predictions in a single step using the following function. You can also choose a specific metric to optimize when selecting the decision threshold, and return detailed prediction performance metrics such as Accuracy, Sensitivity, Specificity, F1 score, MCC, Recall, and Precision.
deconvolution = pipeML::deconvolution
data_train = deconvolution[1:100,]
data_test = deconvolution[!rownames(deconvolution) %in% rownames(data_train),]
res <- compute_features.ML(features_train = data_train,
features_test = data_test,
clinical = traitData,
trait = "Best.Confirmed.Overall.Response",
trait.positive = "CR",
metric = "AUROC",
stack = FALSE,
k_folds = 3,
n_rep = 3,
feature.selection = FALSE,
seed = 123,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
maximize = "F1",
return = FALSE)
Access the prediction metrics:
head(res$Prediction_metrics)
To compute variable importance:
importance = compute_variable.importance(res$Model,
stacking = FALSE,
n_cores = 2)
And to visualize SHAP values:
plot_shap_values(importance, "glm", "shap_plot")
knitr::include_graphics("figures/var_importance.png")

Figure 2. Variable importance.
Leave-one-dataset-out (LODO) analysis
If your data includes features from multiple cohorts, pipeML provides a flexible approach to perform leave-one-dataset-out (LODO) analysis. This is achieved by applying k-fold stratified sampling across batches, ensuring that each fold preserves the batch structure while maintaining class balance. To do this, set LODO = TRUE and provide the vector containing the batch information in the batch_id argument (it should be in the same order as your samples).
Below, we demonstrate how to perform a LODO analysis using simulated datasets:
Simulate datasets from different batches
set.seed(123)
# Simulate traitData with 3 cohorts
traitData <- data.frame(
Sample = paste0("Sample", 1:90),
Response = sample(c("R", "NR"), 90, replace = TRUE),
Cohort = rep(paste0("Cohort", 1:3), each = 30),
stringsAsFactors = FALSE
)
rownames(traitData) <- traitData$Sample
# Simulate counts matrix
counts_all <- matrix(rnorm(1000 * 90), nrow = 1000, ncol = 90)
rownames(counts_all) <- paste0("Gene", 1:1000)
colnames(counts_all) <- traitData$Sample
# Simulate some example features
features_all <- matrix(runif(90 * 15), nrow = 90, ncol = 15)
rownames(features_all) <- traitData$Sample
colnames(features_all) <- paste0("Feature", 1:15)
Perform LODO analysis
prediction = list()
i = 1
for (cohort in unique(traitData$Cohort)) {
# Test cohort
traitData_test = traitData %>%
filter(Cohort == cohort)
counts_test = counts_all[, colnames(counts_all) %in% rownames(traitData_test)]
features_test = features_all[rownames(features_all) %in% rownames(traitData_test), ]
# Train cohort
traitData_train = traitData %>%
filter(Cohort != cohort)
counts_train = counts_all[, colnames(counts_all) %in% rownames(traitData_train)]
features_train = features_all[rownames(features_all) %in% rownames(traitData_train), ]
#### ML Training
res = pipeML::compute_features.training.ML(features_train,
traitData_train$Response,
trait.positive = "R",
metric = "AUROC",
stack = TRUE,
k_folds = 2,
n_rep = 3,
feature.selection = FALSE,
seed = 123,
LODO = TRUE,
batch_id = traitData_train$Cohort,
ncores = 3,
return = FALSE)
## Feature importance
importance = pipeML::compute_variable.importance(res, stacking = TRUE, n_cores = 2)
#### ML predicting
#### Testing
pred = pipeML::compute_prediction(model = res,
test_data = features_test,
target_var = traitData_test$Response,
trait.positive = "R",
stack = TRUE,
return = FALSE)
pred[["Importance"]] = importance
#### Save results
prediction[[i]] = pred
names(prediction)[i] = cohort
i = i + 1
}
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: C50, KNN, LDA
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in compute_shap_values(ml_models[[base_models[i]]], train_data, :
#> Trivial predictions were found in some resamples. SHAP values cannot be
#> calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): Trivial
#> predictions were found in some resamples. SHAP values cannot be calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.8514192
#> Accuracy: 60
#> Sensitivity: 64.706
#> Specificity: 53.846
#> F1 score: 64.706
#> MCC score: 18.552
#> Recall: 64.706
#> Precision: 64.706
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLM, SVM_linear, XGboost
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.1471158
#> Accuracy: 63.333
#> Sensitivity: 84.211
#> Specificity: 27.273
#> F1 score: 74.419
#> MCC score: 13.834
#> Recall: 84.211
#> Precision: 66.667
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: LDA, SVM_linear, XGboost
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.08846727
#> Accuracy: 56.667
#> Sensitivity: 100
#> Specificity: 7.143
#> F1 score: 71.111
#> MCC score: 19.852
#> Recall: 100
#> Precision: 55.172
For plotting, we make use of our get_curves() function, here adapted to handle multiple cohorts.
metrics = list()
auroc = c()
auprc = c()
for (i in seq_along(prediction)) {
metrics[[i]] = prediction[[i]]$Metrics %>%
dplyr::mutate(Cohort = names(prediction)[i],
AUROC = rep(prediction[[i]]$AUC$AUROC, nrow(prediction[[i]]$Metrics)),
AUPRC = rep(prediction[[i]]$AUC$AUPRC, nrow(prediction[[i]]$Metrics))) %>%
tibble::rownames_to_column("Samples")
}
metrics = do.call(rbind, metrics)
metrics$Cohort = as.factor(metrics$Cohort)
get_curves(metrics, "Specificity", "Sensitivity",
"Recall", "Precision", "Cohort",
metrics$AUROC, metrics$AUPRC, "Test")
#> agg_png
#> 2
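The assembled metrics table also lends itself to a quick per-cohort summary; since AUROC and AUPRC are constant within each cohort, one row per cohort suffices:
# One AUROC/AUPRC pair per held-out cohort
metrics %>%
dplyr::distinct(Cohort, AUROC, AUPRC)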
knitr::include_graphics("figures/ROC.png")

Figure 3. ROC curves.
knitr::include_graphics("figures/PR.png")

Figure 4. PR curves.