library(pipeML)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
This tutorial demonstrates how to use the pipeML package for training and testing machine learning models. It introduces the main functions of the pipeline and guides you through additional functions for visualization.
Load data
raw.counts = pipeML::raw.counts
traitData = pipeML::traitData
deconvolution = pipeML::deconvolution
Feature selection
Apply repeated feature selection using the Boruta algorithm:
deconvolution$target = as.factor(traitData$Best.Confirmed.Overall.Response)
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 5,
fix = FALSE,
doParallel = FALSE,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
#> Running 5 iterations of the Boruta algorithm
#>
#> Keeping only features confirmed in more than 0.8 of the times for training...............................................................
#> If you want to consider also tentative features, please specify tentative = TRUE in the parameters.
Inspect the results:
head(res_boruta$Matrix_Importance)
#> NULL
cat("Confirmed features:\n", res_boruta$Confirmed)
#> Confirmed features:
cat("Tentative features:\n", res_boruta$Tentative)
#> Tentative features:
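If features were confirmed, a simple next step is to subset the feature matrix to the confirmed columns (a minimal sketch, assuming res_boruta$Confirmed is a character vector of feature names):
## Keep only the Boruta-confirmed features (plus the target) for downstream training.
## Assumes res_boruta$Confirmed is a non-empty character vector of column names.
selected_features <- deconvolution[, c(res_boruta$Confirmed, "target"), drop = FALSE]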
To further assess tentative features, set fix = TRUE to rerun the selection and confirm or reject them:
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 5,
fix = TRUE,
doParallel = FALSE,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
For faster execution, enable parallelization by setting doParallel = TRUE and specifying the number of workers:
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 10,
fix = FALSE,
doParallel = TRUE,
workers = 2,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
Train Machine Learning models
Train and tune models using repeated stratified k-fold cross-validation:
deconvolution = pipeML::deconvolution
traitData = pipeML::traitData
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = FALSE,
k_folds = 2,
n_rep = 2,
feature.selection = FALSE,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Best ML model found: RF
#> Returning model trained
Access the best-trained model:
res[[1]]$Model
View all trained and tuned machine learning models:
res[[1]]$ML_Models
knitr::include_graphics("figures/Training.png")

Figure 1. Models training performance.
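If ML_Models is a named list of caret train objects fit with a shared resampling scheme (an assumption; check str(res[[1]]$ML_Models) in your session), their cross-validation performance can be compared with caret's resamples() helper:
## Sketch: compare resampled performance across the tuned models (assumes caret `train` objects).
resamps <- caret::resamples(res[[1]]$ML_Models)
summary(resamps)
bwplot(resamps)  # lattice box-and-whisker plot of the resampled metrics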
To apply model stacking, set stack = TRUE:
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 2,
n_rep = 2,
feature.selection = FALSE,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Training machine learning model...............................................................
#> Warning: glm.fit: algorithm did not converge
#> Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
#> : There were missing values in resampled performance measures.
#> Warning in lda.default(x, grouping, ...): variables are collinear
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLMNET, RF, SVM_radial
#>
#> Meta-learners ML model based on GLM
Inspect the base models used in stacking:
res$Model$Base_models
#> NULL
Access the meta-learner:
res$Model$Meta_learner
Model prediction
After training, users can predict on new data using the compute_prediction() function. You can specify which metric to maximize when determining the optimal classification threshold. Supported values for maximize include: “Accuracy”, “Precision”, “Recall”, “Specificity”, “Sensitivity”, “F1”, and “MCC”.
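The chunk below references a held-out data frame, testing_set, that is not created earlier in this tutorial. As a purely illustrative sketch (not a pipeML requirement), one way to build such a split is:
## Hypothetical hold-out split used only to define `testing_set` for the example below.
## The corresponding target values should be subset in the same sample order.
set.seed(42)
train_idx <- sample(seq_len(nrow(deconvolution)), size = floor(0.8 * nrow(deconvolution)))
testing_set <- deconvolution[-train_idx, , drop = FALSE]
target_test <- traitData$Best.Confirmed.Overall.Response[-train_idx]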
pred = pipeML::compute_prediction(model = res,
test_data = testing_set,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
file.name = "Test",
maximize = "Accuracy",
return = F)
If return = TRUE, compute_prediction() will save the ROC and PR curves in the Results/ directory. They can also be retrieved with the following function:
get_curves(pred$Metrics, spec = "Specificity",
sens = "Sensitivity", reca = "Recall",
prec = "Precision", color = "Cohort",
auc_roc = pred$AUC$AUROC,
auc_prc = pred$AUC$AUPRC,
file.name = "Test")
To enable feature selection during model training, set feature.selection = TRUE. You must also specify the number of Boruta iterations via the n_boruta parameter and whether to resolve tentative features using the boruta_fix parameter (refer to feature.selection.boruta() for more details).
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 2,
n_rep = 2,
feature.selection = TRUE,
n_boruta = 2,
boruta_fix = TRUE,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Feature selection using Boruta...............................................................
#> Running 2 iterations of the Boruta algorithm
#>
#> Keeping only features confirmed in more than 0.8 of the times for training...............................................................
#> If you want to consider also tentative features, please specify tentative = TRUE in the parameters.
#>
#> Training machine learning model...............................................................
#>
#> maximum number of iterations reached 0.0008488881 0.0008455173Choosing base models for stacking.......................................
#>
#> Models chosen are: GLMNET, SVM_radial, XGboost
#>
#> Meta-learners ML model based on GLM
When feature.selection = TRUE, the function will also return the features selected by Boruta. If set to FALSE, it returns all features by default.
res$Features
#> NULL
To enable parallelization, specify the number of cores via the ncores parameter (the default is detectCores() - 1).
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 3,
n_rep = 3,
feature.selection = FALSE,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 3,
return = FALSE)
Train and predict using the machine learning models
If you already have a separate testing dataset, you can directly train the model and perform predictions using the following function. You can also choose to optimize a specific metric to select a decision threshold and return detailed prediction performance metrics such as Accuracy, Sensitivity, Specificity, F1 score, MCC, Recall, and Precision.
deconvolution = pipeML::deconvolution
data_train = deconvolution[1:100,]
data_test = deconvolution[!rownames(deconvolution) %in% rownames(data_train),]
res <- compute_features.ML(features_train = data_train,
features_test = data_test,
clinical = traitData,
trait = "Best.Confirmed.Overall.Response",
trait.positive = "CR",
metric = "AUROC",
stack = FALSE,
k_folds = 3,
n_rep = 3,
feature.selection = FALSE,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
maximize = "F1",
return = FALSE)
Access the prediction metrics:
head(res$Prediction_metrics)
To compute variable importance:
importance = compute_variable.importance(res$Model,
stacking = FALSE,
n_cores = 2)
And to visualize SHAP values:
plot_shap_values(importance, "glm", "shap_plot")
knitr::include_graphics("figures/var_importance.png")

Figure 2. Variable importance.
Leave-one-dataset-out (LODO) analysis
If your data includes features from multiple cohorts, pipeML provides a flexible approach to perform Leave-One-Dataset-Out (LODO) analysis. This is achieved by applying k-fold stratified sampling across batches, ensuring that each fold preserves the batch structure while maintaining class balance. To use it, set LODO = TRUE and provide the vector containing the batch information via the batch_id argument (it must be in the same order as your samples).
Below, we demonstrate how to perform a LODO analysis using simulated datasets:
Simulate datasets from different batches
set.seed(123)
# Simulate traitData with 3 cohorts
traitData <- data.frame(
Sample = paste0("Sample", 1:90),
Response = sample(c("R", "NR"), 90, replace = TRUE),
Cohort = rep(paste0("Cohort", 1:3), each = 30),
stringsAsFactors = FALSE
)
rownames(traitData) <- traitData$Sample
# Simulate counts matrix
counts_all <- matrix(rnorm(1000 * 90), nrow = 1000, ncol = 90)
rownames(counts_all) <- paste0("Gene", 1:1000)
colnames(counts_all) <- traitData$Sample
# Simulate some example features
features_all <- matrix(runif(90 * 15), nrow = 90, ncol = 15)
rownames(features_all) <- traitData$Sample
colnames(features_all) <- paste0("Feature", 1:15)
Perform LODO analysis
prediction = list()
i = 1
for (cohort in unique(traitData$Cohort)) {
  # Test cohort
  traitData_test = traitData %>%
    filter(Cohort == cohort)
  counts_test = counts_all[, colnames(counts_all) %in% rownames(traitData_test)]
  features_test = features_all[rownames(features_all) %in% rownames(traitData_test), ]

  # Train cohort
  traitData_train = traitData %>%
    filter(Cohort != cohort)
  counts_train = counts_all[, colnames(counts_all) %in% rownames(traitData_train)]
  features_train = features_all[rownames(features_all) %in% rownames(traitData_train), ]

  #### ML Training
  res = pipeML::compute_features.training.ML(features_train,
                                             traitData_train$Response,
                                             trait.positive = "R",
                                             metric = "AUROC",
                                             stack = T,
                                             k_folds = 2,
                                             n_rep = 3,
                                             feature.selection = F,
                                             LODO = TRUE,
                                             batch_id = traitData_train$Cohort,
                                             ncores = 3,
                                             return = F)

  ## Feature importance
  importance = pipeML::compute_variable.importance(res, stacking = T, n_cores = 2)

  #### ML predicting
  #### Testing
  pred = pipeML::compute_prediction(model = res,
                                    test_data = features_test,
                                    target_var = traitData_test$Response,
                                    trait.positive = "R",
                                    stack = TRUE,
                                    return = F)
  pred[["Importance"]] = importance

  #### Save results
  prediction[[i]] = pred
  names(prediction)[i] = cohort
  i = i + 1
}
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLM, KNN, RF
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in compute_shap_values(ml_models[[base_models[i]]], train_data, :
#> Trivial predictions were found in some resamples. SHAP values cannot be
#> calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.7421809
#> Accuracy: 70
#> Sensitivity: 52.941
#> Specificity: 92.308
#> F1 score: 66.667
#> MCC score: 47.565
#> Recall: 52.941
#> Precision: 90
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLM, SVM_linear, XGboost
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.2342613
#> Accuracy: 66.667
#> Sensitivity: 78.947
#> Specificity: 45.455
#> F1 score: 75
#> MCC score: 25.661
#> Recall: 78.947
#> Precision: 71.429
#> Training machine learning model...............................................................
#>
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLMNET, KNN, XGboost
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): Trivial
#> predictions were found in some resamples. SHAP values cannot be calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.4913142
#> Accuracy: 60
#> Sensitivity: 81.25
#> Specificity: 35.714
#> F1 score: 68.421
#> MCC score: 19.138
#> Recall: 81.25
#> Precision: 59.091
For plotting, we make use of the get_curves function, here adapted to the multiple-cohort case.
metrics = list()
auroc = c()
auprc = c()
for (i in 1:length(prediction)) {
  metrics[[i]] = prediction[[i]]$Metrics %>%
    dplyr::mutate(Cohort = names(prediction)[i],
                  AUROC = rep(prediction[[i]]$AUC$AUROC, nrow(prediction[[i]]$Metrics)),
                  AUPRC = rep(prediction[[i]]$AUC$AUPRC, nrow(prediction[[i]]$Metrics))) %>%
    tibble::rownames_to_column("Samples")
}
metrics = do.call(rbind, metrics)
metrics$Cohort = as.factor(metrics$Cohort)
get_curves(metrics, "Specificity", "Sensitivity",
           "Recall", "Precision", "Cohort",
           metrics$AUROC, metrics$AUPRC, "Test")
#> agg_png
#> 2
knitr::include_graphics("figures/ROC.png")

Figure 3. ROC curves.
knitr::include_graphics("figures/PR.png")

Figure 4. PR curves.
Custom k-fold cross-validation
In certain use cases it is essential to carefully design how the training and test splits are created to prevent data leakage. pipeML supports the use of custom fold construction functions via the fold_construction_fun argument of the compute_features.training.ML() function. This allows you to preprocess your features within each fold, ensuring that information from the test set is never used during training.
Why use custom folds?
By default, cross-validation splits are performed directly on the feature matrix. However, if your features are generated through a sample-dependent process, such as clustering, dimensionality reduction, aggregation, or gene set enrichment, applying that process to the full dataset before splitting can leak test-set information into the training set and bias the models.
Using a custom fold construction function ensures that:
- Test samples are projected onto the learned space without influencing it.
- Fold-level processing mimics how the model would behave on truly unseen data.
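To make the leakage point concrete, here is a minimal generic sketch (illustrative only, not part of pipeML) of fold-wise preprocessing: scaling parameters are estimated on the training portion of a fold and only applied to the held-out samples.
## Illustrative leakage-free preprocessing within one fold (hypothetical helper).
## `X` is a numeric feature matrix; `train_idx` are the training row indices of the fold.
scale_fold <- function(X, train_idx) {
  mu  <- colMeans(X[train_idx, , drop = FALSE])            # learned on training samples only
  sdv <- apply(X[train_idx, , drop = FALSE], 2, sd)
  train_scaled <- scale(X[train_idx, , drop = FALSE], center = mu, scale = sdv)
  test_scaled  <- scale(X[-train_idx, , drop = FALSE], center = mu, scale = sdv)  # projected, never refitted
  list(train = train_scaled, test = test_scaled)
}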
Below you can find an example of a custom fold function created to compute TF activity per sample using the viper algorithm. This approach is based on a TF-target set enrichment analysis similar to GSEA, and thus the scores are sample-dependent. It requires normalized counts as input.
prepare_TFs_folds <- function(data, folds) {
  processed_folds <- list()
  for (i in seq_along(folds)) {
    train_idx <- folds[[i]]
    test_idx <- setdiff(seq_len(nrow(data)), train_idx)

    ## Subset data
    train_data <- data[train_idx, , drop = FALSE]
    obs_train <- train_data$target
    train_data$target <- NULL

    ## Run TFs once
    train_processed <- compute_TF_activity(RNA_tpm = t(train_data)) ## <-- User can change this part
    colnames(train_processed) <- gsub("[- +]", ".", colnames(train_processed)) ## Just to change special characters (ignore in other cases)
    train_cell_data <- train_processed %>%
      dplyr::mutate(target = obs_train)

    ## Prepare test data using trained info
    obs_test <- data$target[test_idx]
    test_data <- data[test_idx, , drop = FALSE]
    test_data$target <- NULL
    test_data <- compute_TF_activity(RNA_tpm = t(test_data)) ## <-- User can change this part
    colnames(test_data) <- gsub("[- +]", ".", colnames(test_data)) ## Just to change special characters (ignore in other cases)

    processed_folds[[i]] <- list(
      train_data = train_cell_data,
      test_data = test_data,
      obs_test = obs_test,
      rowIndex = test_idx,
      fold_name = names(folds)[i]
    )
  }

  # Run TFs on the full training set
  obs_train = data$target
  data$target = NULL
  train_processed_final <- compute_TF_activity(RNA_tpm = t(data)) ## <-- User can change this part
  colnames(train_processed_final) <- gsub("[- +]", ".", colnames(train_processed_final)) ## Just to change special characters (ignore in other cases)

  # Get features
  train_cell_data_final <- train_processed_final %>%
    dplyr::mutate(target = obs_train)
  custom_output = train_processed_final

  ## Return the per-fold processed train/test data, the features computed on the
  ## full training set (with target), and the raw feature matrix itself
  return(list(processed_folds, train_cell_data_final, custom_output))
}
To create your own custom fold function, you can follow the same structure defined above and change the parts highlighted in the code. Once the function is ready, pass it to the main compute_features.training.ML() function via the fold_construction_fun argument.
## counts_train: Normalized counts to compute the features from the training samples
## traitData_train: Target variable from the training samples
res = pipeML::compute_features.training.ML(as.data.frame(t(counts_train)), traitData_train$Response, trait.positive = "R",
metric = "AUROC", stack = F, k_folds = 5, n_rep = 10, feature.selection = F,
LODO = F, ncores = 3, return = F, fold_construction_fun = prepare_TFs_folds)
To predict, you need to compute the same type of features on the test set:
## counts_test: Normalized counts to compute the features from the testing samples
tf_activities_test <- compute_TF_activity(RNA_tpm = counts_test)
# Predict in TFs
pred = compute_prediction(res, tf_activities_test, traitData_test$Response, trait.positive = "R",
stack = FALSE, maximize = "Accuracy", return = F)